recollindex: do not retry files which previously failed to be indexed, except if they were changed since, or option -k is set

This commit is contained in:
Jean-Francois Dockes 2015-04-23 10:37:37 +02:00
parent 906312519c
commit e6c0ca403d
9 changed files with 209 additions and 86 deletions

View File

@ -13,16 +13,19 @@ recollindex \- indexing command for the Recoll full text search system
[
.B \-z|\-Z
]
[
.B \-k
]
.br
.B recollindex
[
.B \-c
<configdir>
<cd>
]
.B \-m
[
.B \-w
<seconds>
<secs>
]
[
.B \-D
@ -34,19 +37,22 @@ recollindex \- indexing command for the Recoll full text search system
.B \-C
]
[
.B \-n
.B \-n|-k
]
.br
.B recollindex
[
.B \-c
<configdir>
<cd>
]
.B \-i
[
.B \-Z
]
[
.B \-k
]
[
.B \-f
]
[<path [path ...]>]
@ -119,6 +125,17 @@ is given, the database will be erased before starting. If option
is given, the database will not be reset, but all files will be considered
as needing reindexing (in place reset).
.PP
By default,
.B recollindex
does not retry files which previously failed to index (for example
because of a missing helper program). This behaviour is new in version
1.21; in previous versions, failed files were always retried.
If option
.B \-k
is given,
.B recollindex
will try again to process all failed files.
.PP
If option
.B
\-m

View File

@ -270,6 +270,13 @@
to the indexing command (<command>recollindex</command>
<option>-z</option> or <option>-Z</option>).</para>
<para><command>recollindex</command> skips files which caused an
error during a previous pass. This is a performance
optimization, and a new behaviour in version 1.21 (failed files
were always retried by previous versions). The command line
option <option>-k</option> can be set to retry failed files, for
example after updating a filter.</para>
<para>The following sections give an overview of different
aspects of the indexing processes and configuration, with links
to detailed sections.</para>
@ -915,20 +922,25 @@ recoll
querying while it is rebuilt, which can be a significant
advantage if it is very big (some installations need days
for a full index rebuild).</para>
<para>Option <option>-k</option> will force retrying files
which previously failed to be indexed, for example because
of a missing helper program.</para>
<para>Of special interest also, maybe, are
the <option>-i</option> and
<option>-f</option> options. <option>-i</option> allows
indexing an explicit list of files (given as command line
parameters or read on <literal>stdin</literal>).
<option>-f</option> tells
the <option>-i</option> and <option>-f</option>
options. <option>-i</option> allows indexing an explicit
list of files (given as command line parameters or read on
<literal>stdin</literal>). <option>-f</option> tells
<command>recollindex</command> to ignore file selection
parameters from the configuration. Together, these options allow
building a custom file selection process for some area of the
file system, by adding the top directory to the
<varname>skippedPaths</varname> list and using an appropriate
file selection method to build the file list to be fed to
<command>recollindex</command> <option>-if</option>.
Trivial example:</para>
parameters from the configuration. Together, these options
allow building a custom file selection process for some area
of the file system, by adding the top directory to the
<varname>skippedPaths</varname> list and using an
appropriate file selection method to build the file list to
be fed to <command>recollindex</command>
<option>-if</option>. Trivial example:</para>
<programlisting>
find . -name indexable.txt -print | recollindex -if
</programlisting>

View File

@ -99,7 +99,8 @@ public:
FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
: m_config(cnf), m_db(db), m_updater(updfunc),
m_missing(new FSIFIMissingStore), m_detectxattronly(false)
m_missing(new FSIFIMissingStore), m_detectxattronly(false),
m_noretryfailed(false)
#ifdef IDX_THREADS
, m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first),
m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
@ -172,8 +173,10 @@ bool FsIndexer::init()
}
// Recursively index each directory in the topdirs:
bool FsIndexer::index(bool quickshallow)
bool FsIndexer::index(int flags)
{
bool quickshallow = (flags & ConfIndexer::IxFQuickShallow) != 0;
m_noretryfailed = (flags & ConfIndexer::IxFNoRetryFailed) != 0;
Chrono chron;
if (!init())
return false;
@ -261,7 +264,7 @@ static bool matchesSkipped(const vector<string>& tdl,
for (vector<string>::const_iterator it = tdl.begin();
it != tdl.end(); it++) {
// the topdirs members are already canonized.
LOGDEB2(("indexfiles:matchesskpp: comparing ancestor [%s] to "
LOGDEB2(("matchesSkipped: comparing ancestor [%s] to "
"topdir [%s]\n", mpath.c_str(), it->c_str()));
if (!mpath.compare(*it)) {
topdir = *it;
@ -324,9 +327,10 @@ goodpath:
/**
* Index individual files, out of a full tree run. No database purging
*/
bool FsIndexer::indexFiles(list<string>& files, ConfIndexer::IxFlag flag)
bool FsIndexer::indexFiles(list<string>& files, int flags)
{
LOGDEB(("FsIndexer::indexFiles\n"));
m_noretryfailed = (flags & ConfIndexer::IxFNoRetryFailed) != 0;
int ret = false;
if (!init())
@ -354,7 +358,7 @@ bool FsIndexer::indexFiles(list<string>& files, ConfIndexer::IxFlag flag)
walker.setSkippedNames(m_config->getSkippedNames());
// Check path against indexed areas and skipped names/paths
if (!(flag&ConfIndexer::IxFIgnoreSkip) &&
if (!(flags & ConfIndexer::IxFIgnoreSkip) &&
matchesSkipped(m_tdl, walker, *it)) {
it++;
continue;
@ -648,8 +652,14 @@ FsIndexer::processonefile(RclConfig *config,
makesig(stp, sig);
string udi;
make_udi(fn, cstr_null, udi);
bool existingDoc;
bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
unsigned int existingDoc;
string oldsig;
bool needupdate;
if (m_noretryfailed) {
needupdate = m_db->needUpdate(udi, sig, &existingDoc, &oldsig);
} else {
needupdate = m_db->needUpdate(udi, sig, &existingDoc, 0);
}
// If ctime (which we use for the sig) differs from mtime, then at most
// the extended attributes were changed, no need to index content.
@ -659,7 +669,24 @@ FsIndexer::processonefile(RclConfig *config,
// the ctime to avoid this
bool xattronly = m_detectxattronly && !m_db->inFullReset() &&
existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
LOGDEB(("processone: needupdate %d noretry %d existing %d oldsig [%s]\n",
needupdate, m_noretryfailed, existingDoc, oldsig.c_str()));
// If noretryfailed is set, check for a file which previously
// failed to index, and avoid re-processing it
if (needupdate && m_noretryfailed && existingDoc &&
!oldsig.empty() && *oldsig.rbegin() == '+') {
// Check that the sigs are the same except for the '+'. If the file
// actually changed, we always retry (maybe it was fixed)
string nold = oldsig.substr(0, oldsig.size()-1);
if (!nold.compare(sig)) {
LOGDEB(("processone: not retrying previously failed file\n"));
m_db->setExistingFlags(udi, existingDoc);
needupdate = false;
}
}
if (!needupdate) {
LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
if (m_updater) {

View File

@ -60,11 +60,11 @@ class FsIndexer : public FsTreeWalkerCB {
* We open the database,
* then call a file system walk for each top-level directory.
*/
bool index(bool quickshallow = 0);
bool index(int flags);
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(std::list<std::string> &files, ConfIndexer::IxFlag f =
ConfIndexer::IxFNone);
bool indexFiles(std::list<std::string> &files,
int f = ConfIndexer::IxFNone);
/** Purge a list of files. */
bool purgeFiles(std::list<std::string> &files);
@ -136,6 +136,9 @@ class FsIndexer : public FsTreeWalkerCB {
// needs a config option
bool m_detectxattronly;
// No retry of previously failed files
bool m_noretryfailed;
#ifdef IDX_THREADS
friend void *FsIndexerDbUpdWorker(void*);
friend void *FsIndexerInternfileWorker(void*);

View File

@ -84,13 +84,13 @@ bool ConfIndexer::firstFsIndexingSequence()
}
int flushmb = m_db.getFlushMb();
m_db.setFlushMb(2);
m_fsindexer->index(true);
m_fsindexer->index(IxFQuickShallow);
m_db.doFlush();
m_db.setFlushMb(flushmb);
return true;
}
bool ConfIndexer::index(bool resetbefore, ixType typestorun)
bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags)
{
Rcl::Db::OpenMode mode = resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
if (!m_db.open(mode)) {
@ -106,7 +106,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun)
}
deleteZ(m_fsindexer);
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
if (!m_fsindexer || !m_fsindexer->index()) {
if (!m_fsindexer || !m_fsindexer->index(flags)) {
m_db.close();
return false;
}
@ -154,7 +154,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun)
return true;
}
bool ConfIndexer::indexFiles(list<string>& ifiles, IxFlag flag)
bool ConfIndexer::indexFiles(list<string>& ifiles, int flag)
{
list<string> myfiles;
string origcwd = m_config->getOrigCwd();
@ -237,7 +237,7 @@ bool ConfIndexer::updateDocs(std::vector<Rcl::Doc> &docs, IxFlag flag)
return true;
}
bool ConfIndexer::purgeFiles(std::list<string> &files, IxFlag flag)
bool ConfIndexer::purgeFiles(std::list<string> &files, int flag)
{
list<string> myfiles;
string origcwd = m_config->getOrigCwd();

View File

@ -102,10 +102,15 @@ class ConfIndexer {
enum IxFlag {IxFNone = 0,
IxFIgnoreSkip = 1, // Ignore skipped lists
IxFNoWeb = 2, // Do not process the web queue.
// First pass: just do the top files so that the user can
// try searching asap.
IxFQuickShallow = 4,
// Do not retry files which previously failed ('+' sigs)
IxFNoRetryFailed = 8,
};
/** Run indexers */
bool index(bool resetbefore, ixType typestorun);
bool index(bool resetbefore, ixType typestorun, int f = IxFNone);
const string &getReason() {return m_reason;}
@ -122,14 +127,14 @@ class ConfIndexer {
static vector<string> getStemmerNames();
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(list<string> &files, IxFlag f = IxFNone);
bool indexFiles(list<string> &files, int f = IxFNone);
/** Update index for list of documents given as list of docs (out of query)
*/
bool updateDocs(vector<Rcl::Doc> &docs, IxFlag f = IxFNone);
static bool docsToPaths(vector<Rcl::Doc> &docs, vector<string> &paths);
/** Purge a list of files. */
bool purgeFiles(list<string> &files, IxFlag f = IxFNone);
bool purgeFiles(list<string> &files, int f = IxFNone);
/** Set in place reset mode */
void setInPlaceReset() {m_db.setInPlaceReset();}

View File

@ -68,6 +68,7 @@ static int op_flags;
#define OPT_Z 0x10000
#define OPT_n 0x20000
#define OPT_r 0x40000
#define OPT_k 0x80000
ReExec *o_reexec;
@ -182,7 +183,7 @@ public:
{
}
virtual FsTreeWalker::Status
processone(const string & fn, const struct stat *, FsTreeWalker::CbFlag flg)
processone(const string& fn, const struct stat *, FsTreeWalker::CbFlag flg)
{
if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwRegular)
m_files.push_back(fn);
@ -255,11 +256,12 @@ static const char usage [] =
"\n"
"recollindex [-h] \n"
" Print help\n"
"recollindex [-z|-Z] \n"
"recollindex [-z|-Z] [-k]\n"
" Index everything according to configuration file\n"
" -z : reset database before starting indexing\n"
" -Z : in place reset: consider all documents as changed. Can also\n"
" be combined with -i or -r but not -m\n"
" -k : retry files on which we previously failed\n"
#ifdef RCL_MONITOR
"recollindex -m [-w <secs>] -x [-D] [-C]\n"
" Perform real time indexing. Don't become a daemon if -D is set.\n"
@ -282,8 +284,8 @@ static const char usage [] =
"recollindex -s <lang>\n"
" Build stem database for additional language <lang>\n"
#ifdef FUTURE_IMPROVEMENT
"recollindex -b\n"
" Process the Beagle queue\n"
"recollindex -W\n"
" Process the Web queue\n"
#endif
#ifdef RCL_USE_ASPELL
"recollindex -S\n"
@ -351,6 +353,7 @@ int main(int argc, char **argv)
case 'f': op_flags |= OPT_f; break;
case 'h': op_flags |= OPT_h; break;
case 'i': op_flags |= OPT_i; break;
case 'k': op_flags |= OPT_k; break;
case 'l': op_flags |= OPT_l; break;
case 'm': op_flags |= OPT_m; break;
case 'n': op_flags |= OPT_n; break;
@ -415,6 +418,10 @@ int main(int argc, char **argv)
bool rezero((op_flags & OPT_z) != 0);
bool inPlaceReset((op_flags & OPT_Z) != 0);
int indexerFlags = ConfIndexer::IxFNone;
if (!(op_flags & OPT_k))
indexerFlags |= ConfIndexer::IxFNoRetryFailed;
Pidfile pidfile(config->getPidfile());
updater = new MyUpdater(config);
@ -526,8 +533,8 @@ int main(int argc, char **argv)
if (!(op_flags & OPT_n)) {
makeIndexerOrExit(config, inPlaceReset);
LOGDEB(("Recollindex: initial indexing pass before monitoring\n"));
if (!confindexer->index(rezero, ConfIndexer::IxTAll) ||
stopindexing) {
if (!confindexer->index(rezero, ConfIndexer::IxTAll, indexerFlags)
|| stopindexing) {
LOGERR(("recollindex, initial indexing pass failed, "
"not going into monitor mode\n"));
exit(1);
@ -564,7 +571,8 @@ int main(int argc, char **argv)
} else {
lockorexit(&pidfile);
makeIndexerOrExit(config, inPlaceReset);
bool status = confindexer->index(rezero, ConfIndexer::IxTAll);
bool status = confindexer->index(rezero, ConfIndexer::IxTAll,
indexerFlags);
if (!status)
cerr << "Indexing failed" << endl;
if (!confindexer->getReason().empty())

View File

@ -789,7 +789,6 @@ bool Db::open(OpenMode mode, OpenError *error)
// (now: Xapian 1.2) and the separate objects seem to
// trigger other Xapian issues, so the query db is now
// a clone of the update one.
// m_ndb->xrdb = Xapian::Database(dir);
m_ndb->xrdb = m_ndb->xwdb;
LOGDEB(("Db::open: lastdocid: %d\n",
m_ndb->xwdb.get_lastdocid()));
@ -1725,23 +1724,70 @@ bool Db::doFlush()
return true;
}
// Mark an existing document (and its subdocuments) as still present,
// for later use by purge(). This is the public, locking entry point;
// the actual work is done by i_setExistingFlags(). Called by the
// indexer when it decides to skip reindexing a file (e.g. one which
// previously failed and is unchanged).
void Db::setExistingFlags(const string& udi, unsigned int docid)
{
    // The existence flags only matter for an updatable index.
    if (m_mode == DbRO)
        return;
    // (unsigned int)-1 is the pseudo-docid that needUpdate() reports
    // during an in-place reset; it does not identify a real document,
    // so it must never be used to index into 'updated'.
    if (docid == (unsigned int)-1) {
        LOGERR(("Db::setExistingFlags: called with bogus docid !!\n"));
        return;
    }
#ifdef IDX_THREADS
    // Serialize access to the 'updated' vector with the other index
    // update paths.
    PTMutexLocker lock(m_ndb->m_mutex);
#endif
    i_setExistingFlags(udi, docid);
}
// Internal form of setExistingFlags(): performs no locking, the
// caller must hold m_ndb->m_mutex when threading is enabled.
// Sets the existence flag in the 'updated' map for the document
// identified by udi/docid and for all its subdocuments, so that a
// later purge() pass keeps them.
void Db::i_setExistingFlags(const string& udi, unsigned int docid)
{
    // Set the up to date flag for the document itself. The log
    // messages name this function (this code was historically part of
    // needUpdate(), the old messages were misleading).
    if (docid >= updated.size()) {
        LOGERR(("i_setExistingFlags: existing docid beyond "
                "updated.size(). Udi [%s], docid %u, "
                "updated.size() %u\n", udi.c_str(),
                unsigned(docid), (unsigned)updated.size()));
        return;
    }
    updated[docid] = true;

    // Set the existence flag for all the subdocs (if any)
    vector<Xapian::docid> docids;
    if (!m_ndb->subDocs(udi, 0, docids)) {
        LOGERR(("Rcl::Db::i_setExistingFlags: can't get subdocs\n"));
        return;
    }
    for (vector<Xapian::docid>::iterator it = docids.begin();
         it != docids.end(); it++) {
        if (*it < updated.size()) {
            LOGDEB2(("Db::i_setExistingFlags: docid %d set\n", *it));
            updated[*it] = true;
        }
    }
}
// Test if doc given by udi has changed since last indexed (test sigs)
bool Db::needUpdate(const string &udi, const string& sig, bool *existed)
bool Db::needUpdate(const string &udi, const string& sig,
unsigned int *docidp, string *osigp)
{
if (m_ndb == 0)
return false;
if (osigp)
osigp->clear();
if (docidp)
*docidp = 0;
// If we are doing an in place or full reset, no need to test.
if (o_inPlaceReset || m_mode == DbTrunc) {
// For in place reset, pretend the doc existed, to enable subdoc purge
if (existed)
*existed = o_inPlaceReset;
// For in place reset, pretend the doc existed, to enable
// subdoc purge. The value is only used as a boolean in this case.
if (docidp && o_inPlaceReset) {
*docidp = -1;
}
return true;
}
if (existed)
*existed = false;
string uniterm = make_uniterm(udi);
string ermsg;
@ -1773,8 +1819,9 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed)
return true;
}
if (existed)
*existed = true;
if (docidp) {
*docidp = *docid;
}
// Retrieve old file/doc signature from value
string osig;
@ -1785,6 +1832,11 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed)
}
LOGDEB2(("Db::needUpdate: oldsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
if (osigp) {
*osigp = osig;
}
// Compare new/old sig
if (sig != osig) {
LOGDEB(("Db::needUpdate:yes: olsig [%s] new [%s] [%s]\n",
@ -1793,34 +1845,10 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed)
return true;
}
// Up to date.
// Up to date. Set the existence flags in the map for the doc and
// its subdocs.
LOGDEB(("Db::needUpdate:no: [%s]\n", uniterm.c_str()));
if (m_mode != DbRO) {
// Set the up to date flag for the document and its subdocs
if (*docid >= updated.size()) {
LOGERR(("needUpdate: existing docid beyond "
"updated.size(). Udi [%s], docid %u, "
"updated.size() %u\n", udi.c_str(),
unsigned(*docid), (unsigned)updated.size()));
} else {
updated[*docid] = true;
}
// Set the existence flag for all the subdocs (if any)
vector<Xapian::docid> docids;
if (!m_ndb->subDocs(udi, 0, docids)) {
LOGERR(("Rcl::Db::needUpdate: can't get subdocs\n"));
return true;
}
for (vector<Xapian::docid>::iterator it = docids.begin();
it != docids.end(); it++) {
if (*it < updated.size()) {
LOGDEB2(("Db::needUpdate: docid %d set\n", *it));
updated[*it] = true;
}
}
}
i_setExistingFlags(udi, *docid);
return false;
}

View File

@ -228,15 +228,36 @@ class Db {
/* Update-related methods ******************************************/
/** Test if the db entry for the given udi is up to date (by
* comparing the input and stored sigs). This is used both when
* indexing and querying (before opening a document using stale info),
/** Test if the db entry for the given udi is up to date.
*
* This is done by comparing the input and stored sigs. This is
* used both when indexing and querying (before opening a document
* using stale info).
*
* **This assumes that the udi pertains to the main index (idxi==0).**
* Side-effect when the db is writeable: set the existence flag
* for the file document and all subdocs if any (for later use by
* 'purge()')
*
* Side-effect when the db is writeable and the document up to
* date: set the existence flag for the file document and all
* subdocs if any (for later use by 'purge()')
*
* @param udi Unique Document Identifier (as chosen by indexer).
* @param sig New signature (as computed by indexer).
* @param[out] xdocid Non-zero if the doc existed. Should be considered
* as opaque, to be used for a possible later call to setExistingFlags().
* Note that if inplaceReset is set, the returned value is non-zero but not
* an actual docid; it is only used as a flag in this case.
* @param[out] osig Old (stored) signature.
*/
bool needUpdate(const string &udi, const string& sig, bool *existed=0);
bool needUpdate(const string &udi, const string& sig,
unsigned int *xdocid = 0, std::string *osig = 0);
/** Set the existence flags for the document and its subdocuments, if any
*
* This can be called by the indexer after needUpdate() has returned true,
* if the indexer does not wish to actually re-index (e.g.: the doc is
* known to cause errors).
*/
void setExistingFlags(const string& udi, unsigned int docid);
/** Indicate if we are doing a systematic reindex. This complements
needUpdate() return */
@ -488,6 +509,8 @@ private:
friend void *DbUpdWorker(void*);
#endif // IDX_THREADS
// Internal form of setExistingFlags: no locking
void i_setExistingFlags(const string& udi, unsigned int docid);
// Internal form of close, can be called during destruction
bool i_close(bool final);
// Reinitialize when adding/removing additional dbs