From e6c0ca403d7ba814d6e0353578928ce97a4c60b9 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Thu, 23 Apr 2015 10:37:37 +0200
Subject: [PATCH] recollindex: do not retry files which previously failed to
 be indexed, unless they have changed since, or option -k is set

---
 src/doc/man/recollindex.1   |  25 +++++++--
 src/doc/user/usermanual.xml |  36 ++++++++-----
 src/index/fsindexer.cpp     |  43 ++++++++++++---
 src/index/fsindexer.h       |   9 ++--
 src/index/indexer.cpp       |  10 ++--
 src/index/indexer.h         |  11 ++--
 src/index/recollindex.cpp   |  22 +++++---
 src/rcldb/rcldb.cpp         | 102 +++++++++++++++++++++++-------------
 src/rcldb/rcldb.h           |  37 ++++++++++---
 9 files changed, 209 insertions(+), 86 deletions(-)

diff --git a/src/doc/man/recollindex.1 b/src/doc/man/recollindex.1
index ff029048..761c94a1 100644
--- a/src/doc/man/recollindex.1
+++ b/src/doc/man/recollindex.1
@@ -13,16 +13,19 @@ recollindex \- indexing command for the Recoll full text search system
 [
 .B \-z|\-Z
 ]
+[
+.B \-k
+]
 .br
 .B recollindex
 [
 .B \-c
-
+
 ]
 .B \-m
 [
 .B \-w
-
+
 ]
 [
 .B \-D
@@ -34,19 +37,22 @@ recollindex \- indexing command for the Recoll full text search system
 .B \-C
 ]
 [
-.B \-n
+.B \-n|-k
 ]
 .br
 .B recollindex
 [
 .B \-c
-
+
 ]
 .B \-i
 [
 .B \-Z
 ]
 [
+.B \-k
+]
+[
 .B \-f
 ]
 []
@@ -119,6 +125,17 @@ is given, the database will be erased before starting.
 If option
 is given, the database will not be reset, but all files will be considered
 as needing reindexing (in place reset).
 .PP
+By default,
+.B recollindex
+does not retry files which previously failed to index (for example
+because of a missing helper program). This behaviour is new in version
+1.21; in previous versions, failed files were always retried.
+If option
+.B \-k
+is given,
+.B recollindex
+will try again to process all failed files.
+.PP
 If option
 .B \-m
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index ea3e5f40..db9a06f1 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -270,6 +270,13 @@
 to the indexing command (recollindex or ).
+      recollindex skips files which caused an
+      error during a previous pass. This is a performance
+      optimization, and a new behaviour in version 1.21 (failed files
+      were always retried by previous versions). The -k command line
+      option can be set to retry failed files, for
+      example after updating a filter.
+
 The following sections give an overview of different aspects
 of the indexing processes and configuration, with links to
 detailed sections.
@@ -915,20 +922,25 @@
 recoll querying while it is rebuilt, which can be a
 significant advantage if it is very big (some installations
 need days for a full index rebuild).
+
+      Option -k will force retrying files
+      which previously failed to be indexed, for example because
+      of a missing helper program.
+
       Of special interest also, maybe, are
-      the -i and
-      -f options. -i allows
-      indexing an explicit list of files (given as command line
-      parameters or read on stdin).
-      -f tells
+      the -i and
+      -f options. -i allows indexing an explicit
+      list of files (given as command line parameters or read on
+      stdin). -f tells
       recollindex to ignore file selection
-      parameters from the configuration. Together, these options allow
-      building a custom file selection process for some area of the
-      file system, by adding the top directory to the
-      skippedPaths list and using an appropriate
-      file selection method to build the file list to be fed to
-      recollindex .
-      Trivial example:
+      parameters from the configuration.
Together, these options + allow building a custom file selection process for some area + of the file system, by adding the top directory to the + skippedPaths list and using an + appropriate file selection method to build the file list to + be fed to recollindex + . Trivial example: + find . -name indexable.txt -print | recollindex -if diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index bf63875f..74947229 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -99,7 +99,8 @@ public: FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) : m_config(cnf), m_db(db), m_updater(updfunc), - m_missing(new FSIFIMissingStore), m_detectxattronly(false) + m_missing(new FSIFIMissingStore), m_detectxattronly(false), + m_noretryfailed(false) #ifdef IDX_THREADS , m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first), m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first) @@ -172,8 +173,10 @@ bool FsIndexer::init() } // Recursively index each directory in the topdirs: -bool FsIndexer::index(bool quickshallow) +bool FsIndexer::index(int flags) { + bool quickshallow = (flags & ConfIndexer::IxFQuickShallow) != 0; + m_noretryfailed = (flags & ConfIndexer::IxFNoRetryFailed) != 0; Chrono chron; if (!init()) return false; @@ -261,7 +264,7 @@ static bool matchesSkipped(const vector& tdl, for (vector::const_iterator it = tdl.begin(); it != tdl.end(); it++) { // the topdirs members are already canonized. - LOGDEB2(("indexfiles:matchesskpp: comparing ancestor [%s] to " + LOGDEB2(("matchesSkipped: comparing ancestor [%s] to " "topdir [%s]\n", mpath.c_str(), it->c_str())); if (!mpath.compare(*it)) { topdir = *it; @@ -324,9 +327,10 @@ goodpath: /** * Index individual files, out of a full tree run. No database purging */ -bool FsIndexer::indexFiles(list& files, ConfIndexer::IxFlag flag) +bool FsIndexer::indexFiles(list& files, int flags) { LOGDEB(("FsIndexer::indexFiles\n")); + m_noretryfailed = (flags & ConfIndexer::IxFNoRetryFailed) != 0; int ret = false; if (!init()) @@ -354,7 +358,7 @@ bool FsIndexer::indexFiles(list& files, ConfIndexer::IxFlag flag) walker.setSkippedNames(m_config->getSkippedNames()); // Check path against indexed areas and skipped names/paths - if (!(flag&ConfIndexer::IxFIgnoreSkip) && + if (!(flags & ConfIndexer::IxFIgnoreSkip) && matchesSkipped(m_tdl, walker, *it)) { it++; continue; @@ -648,8 +652,14 @@ FsIndexer::processonefile(RclConfig *config, makesig(stp, sig); string udi; make_udi(fn, cstr_null, udi); - bool existingDoc; - bool needupdate = m_db->needUpdate(udi, sig, &existingDoc); + unsigned int existingDoc; + string oldsig; + bool needupdate; + if (m_noretryfailed) { + needupdate = m_db->needUpdate(udi, sig, &existingDoc, &oldsig); + } else { + needupdate = m_db->needUpdate(udi, sig, &existingDoc, 0); + } // If ctime (which we use for the sig) differs from mtime, then at most // the extended attributes were changed, no need to index content. 
@@ -659,7 +669,24 @@ FsIndexer::processonefile(RclConfig *config, // the ctime to avoid this bool xattronly = m_detectxattronly && !m_db->inFullReset() && existingDoc && needupdate && (stp->st_mtime < stp->st_ctime); - + + LOGDEB(("processone: needupdate %d noretry %d existing %d oldsig [%s]\n", + needupdate, m_noretryfailed, existingDoc, oldsig.c_str())); + + // If noretryfailed is set, check for a file which previously + // failed to index, and avoid re-processing it + if (needupdate && m_noretryfailed && existingDoc && + !oldsig.empty() && *oldsig.rbegin() == '+') { + // Check that the sigs are the same except for the '+'. If the file + // actually changed, we always retry (maybe it was fixed) + string nold = oldsig.substr(0, oldsig.size()-1); + if (!nold.compare(sig)) { + LOGDEB(("processone: not retrying previously failed file\n")); + m_db->setExistingFlags(udi, existingDoc); + needupdate = false; + } + } + if (!needupdate) { LOGDEB0(("processone: up to date: %s\n", fn.c_str())); if (m_updater) { diff --git a/src/index/fsindexer.h b/src/index/fsindexer.h index f7ce0cd9..42d65e03 100644 --- a/src/index/fsindexer.h +++ b/src/index/fsindexer.h @@ -60,11 +60,11 @@ class FsIndexer : public FsTreeWalkerCB { * We open the database, * then call a file system walk for each top-level directory. */ - bool index(bool quickshallow = 0); + bool index(int flags); /** Index a list of files. No db cleaning or stemdb updating */ - bool indexFiles(std::list &files, ConfIndexer::IxFlag f = - ConfIndexer::IxFNone); + bool indexFiles(std::list &files, + int f = ConfIndexer::IxFNone); /** Purge a list of files. */ bool purgeFiles(std::list &files); @@ -136,6 +136,9 @@ class FsIndexer : public FsTreeWalkerCB { // needs a config option bool m_detectxattronly; + // No retry of previously failed files + bool m_noretryfailed; + #ifdef IDX_THREADS friend void *FsIndexerDbUpdWorker(void*); friend void *FsIndexerInternfileWorker(void*); diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index b12cff7d..de706d07 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -84,13 +84,13 @@ bool ConfIndexer::firstFsIndexingSequence() } int flushmb = m_db.getFlushMb(); m_db.setFlushMb(2); - m_fsindexer->index(true); + m_fsindexer->index(IxFQuickShallow); m_db.doFlush(); m_db.setFlushMb(flushmb); return true; } -bool ConfIndexer::index(bool resetbefore, ixType typestorun) +bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags) { Rcl::Db::OpenMode mode = resetbefore ? 
Rcl::Db::DbTrunc : Rcl::Db::DbUpd; if (!m_db.open(mode)) { @@ -106,7 +106,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun) } deleteZ(m_fsindexer); m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); - if (!m_fsindexer || !m_fsindexer->index()) { + if (!m_fsindexer || !m_fsindexer->index(flags)) { m_db.close(); return false; } @@ -154,7 +154,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun) return true; } -bool ConfIndexer::indexFiles(list& ifiles, IxFlag flag) +bool ConfIndexer::indexFiles(list& ifiles, int flag) { list myfiles; string origcwd = m_config->getOrigCwd(); @@ -237,7 +237,7 @@ bool ConfIndexer::updateDocs(std::vector &docs, IxFlag flag) return true; } -bool ConfIndexer::purgeFiles(std::list &files, IxFlag flag) +bool ConfIndexer::purgeFiles(std::list &files, int flag) { list myfiles; string origcwd = m_config->getOrigCwd(); diff --git a/src/index/indexer.h b/src/index/indexer.h index f2c9bd16..4c024bac 100644 --- a/src/index/indexer.h +++ b/src/index/indexer.h @@ -102,10 +102,15 @@ class ConfIndexer { enum IxFlag {IxFNone = 0, IxFIgnoreSkip = 1, // Ignore skipped lists IxFNoWeb = 2, // Do not process the web queue. + // First pass: just do the top files so that the user can + // try searching asap. + IxFQuickShallow = 4, + // Do not retry files which previously failed ('+' sigs) + IxFNoRetryFailed = 8, }; /** Run indexers */ - bool index(bool resetbefore, ixType typestorun); + bool index(bool resetbefore, ixType typestorun, int f = IxFNone); const string &getReason() {return m_reason;} @@ -122,14 +127,14 @@ class ConfIndexer { static vector getStemmerNames(); /** Index a list of files. No db cleaning or stemdb updating */ - bool indexFiles(list &files, IxFlag f = IxFNone); + bool indexFiles(list &files, int f = IxFNone); /** Update index for list of documents given as list of docs (out of query) */ bool updateDocs(vector &docs, IxFlag f = IxFNone); static bool docsToPaths(vector &docs, vector &paths); /** Purge a list of files. */ - bool purgeFiles(list &files, IxFlag f = IxFNone); + bool purgeFiles(list &files, int f = IxFNone); /** Set in place reset mode */ void setInPlaceReset() {m_db.setInPlaceReset();} diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index 81ef91d8..57a50ecc 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -68,6 +68,7 @@ static int op_flags; #define OPT_Z 0x10000 #define OPT_n 0x20000 #define OPT_r 0x40000 +#define OPT_k 0x80000 ReExec *o_reexec; @@ -182,7 +183,7 @@ public: { } virtual FsTreeWalker::Status - processone(const string & fn, const struct stat *, FsTreeWalker::CbFlag flg) + processone(const string& fn, const struct stat *, FsTreeWalker::CbFlag flg) { if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwRegular) m_files.push_back(fn); @@ -255,11 +256,12 @@ static const char usage [] = "\n" "recollindex [-h] \n" " Print help\n" -"recollindex [-z|-Z] \n" +"recollindex [-z|-Z] [-k]\n" " Index everything according to configuration file\n" " -z : reset database before starting indexing\n" " -Z : in place reset: consider all documents as changed. Can also\n" " be combined with -i or -r but not -m\n" +" -k : retry files on which we previously failed\n" #ifdef RCL_MONITOR "recollindex -m [-w ] -x [-D] [-C]\n" " Perform real time indexing. 
Don't become a daemon if -D is set.\n" @@ -282,8 +284,8 @@ static const char usage [] = "recollindex -s \n" " Build stem database for additional language \n" #ifdef FUTURE_IMPROVEMENT -"recollindex -b\n" -" Process the Beagle queue\n" +"recollindex -W\n" +" Process the Web queue\n" #endif #ifdef RCL_USE_ASPELL "recollindex -S\n" @@ -351,6 +353,7 @@ int main(int argc, char **argv) case 'f': op_flags |= OPT_f; break; case 'h': op_flags |= OPT_h; break; case 'i': op_flags |= OPT_i; break; + case 'k': op_flags |= OPT_k; break; case 'l': op_flags |= OPT_l; break; case 'm': op_flags |= OPT_m; break; case 'n': op_flags |= OPT_n; break; @@ -415,6 +418,10 @@ int main(int argc, char **argv) bool rezero((op_flags & OPT_z) != 0); bool inPlaceReset((op_flags & OPT_Z) != 0); + int indexerFlags = ConfIndexer::IxFNone; + if (!(op_flags & OPT_k)) + indexerFlags |= ConfIndexer::IxFNoRetryFailed; + Pidfile pidfile(config->getPidfile()); updater = new MyUpdater(config); @@ -526,8 +533,8 @@ int main(int argc, char **argv) if (!(op_flags & OPT_n)) { makeIndexerOrExit(config, inPlaceReset); LOGDEB(("Recollindex: initial indexing pass before monitoring\n")); - if (!confindexer->index(rezero, ConfIndexer::IxTAll) || - stopindexing) { + if (!confindexer->index(rezero, ConfIndexer::IxTAll, indexerFlags) + || stopindexing) { LOGERR(("recollindex, initial indexing pass failed, " "not going into monitor mode\n")); exit(1); @@ -564,7 +571,8 @@ int main(int argc, char **argv) } else { lockorexit(&pidfile); makeIndexerOrExit(config, inPlaceReset); - bool status = confindexer->index(rezero, ConfIndexer::IxTAll); + bool status = confindexer->index(rezero, ConfIndexer::IxTAll, + indexerFlags); if (!status) cerr << "Indexing failed" << endl; if (!confindexer->getReason().empty()) diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 2344961a..31754635 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -789,7 +789,6 @@ bool Db::open(OpenMode mode, OpenError *error) // (now: Xapian 1.2) and the separate objects seem to // trigger other Xapian issues, so the query db is now // a clone of the update one. -// m_ndb->xrdb = Xapian::Database(dir); m_ndb->xrdb = m_ndb->xwdb; LOGDEB(("Db::open: lastdocid: %d\n", m_ndb->xwdb.get_lastdocid())); @@ -1725,23 +1724,70 @@ bool Db::doFlush() return true; } +void Db::setExistingFlags(const string& udi, unsigned int docid) +{ + if (m_mode == DbRO) + return; + if (docid == (unsigned int)-1) { + LOGERR(("Db::setExistingFlags: called with bogus docid !!\n")); + return; + } +#ifdef IDX_THREADS + PTMutexLocker lock(m_ndb->m_mutex); +#endif + i_setExistingFlags(udi, docid); +} + +void Db::i_setExistingFlags(const string& udi, unsigned int docid) +{ + // Set the up to date flag for the document and its subdocs + if (docid >= updated.size()) { + LOGERR(("needUpdate: existing docid beyond " + "updated.size(). 
Udi [%s], docid %u, " + "updated.size() %u\n", udi.c_str(), + unsigned(docid), (unsigned)updated.size())); + return; + } else { + updated[docid] = true; + } + + // Set the existence flag for all the subdocs (if any) + vector docids; + if (!m_ndb->subDocs(udi, 0, docids)) { + LOGERR(("Rcl::Db::needUpdate: can't get subdocs\n")); + return; + } + for (vector::iterator it = docids.begin(); + it != docids.end(); it++) { + if (*it < updated.size()) { + LOGDEB2(("Db::needUpdate: docid %d set\n", *it)); + updated[*it] = true; + } + } +} + // Test if doc given by udi has changed since last indexed (test sigs) -bool Db::needUpdate(const string &udi, const string& sig, bool *existed) +bool Db::needUpdate(const string &udi, const string& sig, + unsigned int *docidp, string *osigp) { if (m_ndb == 0) return false; + if (osigp) + osigp->clear(); + if (docidp) + *docidp = 0; + // If we are doing an in place or full reset, no need to test. if (o_inPlaceReset || m_mode == DbTrunc) { - // For in place reset, pretend the doc existed, to enable subdoc purge - if (existed) - *existed = o_inPlaceReset; + // For in place reset, pretend the doc existed, to enable + // subdoc purge. The value is only used as a boolean in this case. + if (docidp && o_inPlaceReset) { + *docidp = -1; + } return true; } - if (existed) - *existed = false; - string uniterm = make_uniterm(udi); string ermsg; @@ -1773,8 +1819,9 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed) return true; } - if (existed) - *existed = true; + if (docidp) { + *docidp = *docid; + } // Retrieve old file/doc signature from value string osig; @@ -1785,6 +1832,11 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed) } LOGDEB2(("Db::needUpdate: oldsig [%s] new [%s]\n", osig.c_str(), sig.c_str())); + + if (osigp) { + *osigp = osig; + } + // Compare new/old sig if (sig != osig) { LOGDEB(("Db::needUpdate:yes: olsig [%s] new [%s] [%s]\n", @@ -1793,34 +1845,10 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed) return true; } - // Up to date. + // Up to date. Set the existance flags in the map for the doc and + // its subdocs. LOGDEB(("Db::needUpdate:no: [%s]\n", uniterm.c_str())); - - if (m_mode != DbRO) { - // Set the up to date flag for the document and its subdocs - if (*docid >= updated.size()) { - LOGERR(("needUpdate: existing docid beyond " - "updated.size(). Udi [%s], docid %u, " - "updated.size() %u\n", udi.c_str(), - unsigned(*docid), (unsigned)updated.size())); - } else { - updated[*docid] = true; - } - - // Set the existence flag for all the subdocs (if any) - vector docids; - if (!m_ndb->subDocs(udi, 0, docids)) { - LOGERR(("Rcl::Db::needUpdate: can't get subdocs\n")); - return true; - } - for (vector::iterator it = docids.begin(); - it != docids.end(); it++) { - if (*it < updated.size()) { - LOGDEB2(("Db::needUpdate: docid %d set\n", *it)); - updated[*it] = true; - } - } - } + i_setExistingFlags(udi, *docid); return false; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index eb10ae42..ab1ab978 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -228,15 +228,36 @@ class Db { /* Update-related methods ******************************************/ - /** Test if the db entry for the given udi is up to date (by - * comparing the input and stored sigs). This is used both when - * indexing and querying (before opening a document using stale info), + /** Test if the db entry for the given udi is up to date. + * + * This is done by comparing the input and stored sigs. 
This is
+     * used both when indexing and querying (before opening a document
+     * using stale info).
+     *
      * **This assumes that the udi pertains to the main index (idxi==0).**
-     * Side-effect when the db is writeable: set the existence flag
-     * for the file document and all subdocs if any (for later use by
-     * 'purge()')
+     *
+     * Side-effect when the db is writeable and the document up to
+     * date: set the existence flag for the file document and all
+     * subdocs if any (for later use by 'purge()').
+     *
+     * @param udi Unique Document Identifier (as chosen by the indexer).
+     * @param sig New signature (as computed by the indexer).
+     * @param xdocid[output] Non-zero if the doc existed. Should be
+     * considered opaque, to be used for a possible later call to
+     * setExistingFlags(). Note that if inplaceReset is set, the returned
+     * value is non-zero but not an actual docid; it is only used as a
+     * flag in this case.
+     * @param osig[output] Old signature.
      */
-    bool needUpdate(const string &udi, const string& sig, bool *existed=0);
+    bool needUpdate(const string &udi, const string& sig,
+                    unsigned int *xdocid = 0, std::string *osig = 0);
+
+    /** Set the existence flags for the document and any subdocuments.
+     *
+     * This can be called by the indexer after needUpdate() has returned
+     * true, if the indexer does not wish to actually re-index (e.g. the
+     * doc is known to cause errors).
+     */
+    void setExistingFlags(const string& udi, unsigned int docid);

     /** Indicate if we are doing a systematic reindex. This complements
         needUpdate() return */
@@ -488,6 +509,8 @@ private:
     friend void *DbUpdWorker(void*);
 #endif // IDX_THREADS

+    // Internal form of setExistingFlags: no locking
+    void i_setExistingFlags(const string& udi, unsigned int docid);
     // Internal form of close, can be called during destruction
     bool i_close(bool final);
     // Reinitialize when adding/removing additional dbs
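
Note: the retry-avoidance test added to FsIndexer::processonefile() above relies on a convention which is only implicit in this patch: when indexing a file fails, the signature stored for the document is the file's real signature with a '+' appended. The standalone sketch below restates the resulting decision, leaving aside the in-place/full reset paths; the function name and parameters are illustrative only, not part of the Recoll API.

#include <string>

// Decide whether a document should be (re)indexed, given the signature
// stored in the index (oldsig: empty if the document is not indexed yet,
// ending in '+' if the previous indexing attempt failed), the newly
// computed signature (newsig), and whether retrying failed files was
// requested (recollindex -k). Mirrors the test in processonefile().
static bool shouldReindex(const std::string& oldsig, const std::string& newsig,
                          bool retryfailed)
{
    if (oldsig == newsig) {
        // Document exists and is up to date.
        return false;
    }
    if (!retryfailed && !oldsig.empty() && *oldsig.rbegin() == '+' &&
        oldsig.substr(0, oldsig.size() - 1) == newsig) {
        // The previous attempt failed and the file has not changed since:
        // skip it. The caller must still mark the document as existing
        // (Db::setExistingFlags()) so that the purge pass keeps it.
        return false;
    }
    // New document, changed file, or retry explicitly requested (-k).
    return true;
}

When the second branch is taken, processonefile() calls m_db->setExistingFlags(udi, existingDoc) before returning, so the failed document and its subdocuments keep their existence flags and are not removed by the purge at the end of the indexing pass.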