From f05cae7344741d8e93c859a916184c7a11e00147 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 18 Apr 2013 14:29:32 +0200 Subject: [PATCH] arrange so that a default first indexing run for a given user runs a quick shallow pass, so that queries return some results quickly after indexing starts, avoiding user frustration --- src/index/fsindexer.cpp | 13 ++- src/index/fsindexer.h | 2 +- src/index/indexer.cpp | 43 +++++++++ src/index/indexer.h | 7 ++ src/rcldb/rcldb.cpp | 28 ++++-- src/rcldb/rcldb.h | 13 +++ src/utils/fstreewalk.cpp | 184 ++++++++++++++++++++++++--------------- src/utils/fstreewalk.h | 11 ++- 8 files changed, 214 insertions(+), 87 deletions(-) diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 9df1ce78..3c91e449 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -178,7 +178,7 @@ bool FsIndexer::init() } // Recursively index each directory in the topdirs: -bool FsIndexer::index() +bool FsIndexer::index(bool quickshallow) { Chrono chron; if (!init()) @@ -193,6 +193,11 @@ bool FsIndexer::index() } m_walker.setSkippedPaths(m_config->getSkippedPaths()); + if (quickshallow) { + m_walker.setOpts(m_walker.getOpts() | FsTreeWalker::FtwSkipDotFiles); + m_walker.setMaxDepth(2); + } + for (vector::const_iterator it = m_tdl.begin(); it != m_tdl.end(); it++) { LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), @@ -204,11 +209,13 @@ bool FsIndexer::index() // Adjust the "follow symlinks" option bool follow; + int opts = m_walker.getOpts(); if (m_config->getConfParam("followLinks", &follow) && follow) { - m_walker.setOpts(FsTreeWalker::FtwFollow); + opts |= FsTreeWalker::FtwFollow; } else { - m_walker.setOpts(FsTreeWalker::FtwOptNone); + opts &= ~FsTreeWalker::FtwFollow; } + m_walker.setOpts(opts); int abslen; if (m_config->getConfParam("idxabsmlen", &abslen)) diff --git a/src/index/fsindexer.h b/src/index/fsindexer.h index ad561562..a65630d0 100644 --- a/src/index/fsindexer.h +++ b/src/index/fsindexer.h 
@@ -60,7 +60,7 @@ class FsIndexer : public FsTreeWalkerCB { * We open the database, * then call a file system walk for each top-level directory. */ - bool index(); + bool index(bool quickshallow = 0); /** Index a list of files. No db cleaning or stemdb updating */ bool indexFiles(std::list &files, ConfIndexer::IxFlag f = diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 4cd9f013..7f15fab3 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -50,6 +50,46 @@ ConfIndexer::~ConfIndexer() deleteZ(m_beagler); } +// Determine if this is likely the first time that the user runs +// indexing. We don't look at the xapiandb as this may have been +// explicitly removed for valid reasons, but at the indexing status +// file, which should be nonexistent-or-empty only before any indexing +// has ever run +bool ConfIndexer::runFirstIndexing() +{ + // Indexing status file existing and not empty ? + struct stat st; + if (stat(m_config->getIdxStatusFile().c_str(), &st) == 0 && + st.st_size > 0) { + LOGDEB0(("ConfIndexer::runFirstIndexing: no: status file not empty\n")); + return false; + } + // And only do this if the user has kept the default topdirs (~). + vectortdl = m_config->getTopdirs(); + if (tdl.size() != 1 || tdl[0].compare(path_canon(path_tildexpand("~")))) { + LOGDEB0(("ConfIndexer::runFirstIndexing: no: not home only\n")); + return false; + } + return true; +} + +bool ConfIndexer::firstFsIndexingSequence() +{ + LOGDEB(("ConfIndexer::firstFsIndexingSequence\n")); + deleteZ(m_fsindexer); + m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); + if (!m_fsindexer) { + return false; + } + int flushmb = m_db.getFlushMb(); + m_db.setFlushMb(2); + m_fsindexer->index(true); + m_db.doFlush(); + m_db.setFlushMb(flushmb); + return true; +} + bool ConfIndexer::index(bool resetbefore, ixType typestorun) { Rcl::Db::OpenMode mode = resetbefore ? 
Rcl::Db::DbTrunc : Rcl::Db::DbUpd; @@ -61,6 +101,9 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun) m_config->setKeyDir(cstr_null); if (typestorun & IxTFs) { + if (runFirstIndexing()) { + firstFsIndexingSequence(); + } deleteZ(m_fsindexer); m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); if (!m_fsindexer || !m_fsindexer->index()) { diff --git a/src/index/indexer.h b/src/index/indexer.h index 3536c241..350fa1c9 100644 --- a/src/index/indexer.h +++ b/src/index/indexer.h @@ -138,6 +138,13 @@ class ConfIndexer { BeagleQueueIndexer *m_beagler; DbIxStatusUpdater *m_updater; string m_reason; + + // The first time we index, we do things a bit differently to + // avoid user frustration (make at least some results available + // fast by using several passes, the first ones to index common + // interesting locations). + bool runFirstIndexing(); + bool firstFsIndexingSequence(); }; #endif /* _INDEXER_H_INCLUDED_ */ diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 845d1371..4f9cf172 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1266,20 +1266,30 @@ bool Db::maybeflush(off_t moretext) if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) { LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n", m_flushMb)); - string ermsg; - try { - m_ndb->xwdb.flush(); - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR(("Db::add: flush() failed: %s\n", ermsg.c_str())); - return false; - } - m_flushtxtsz = m_curtxtsz; + return doFlush(); } } return true; } +bool Db::doFlush() +{ + if (!m_ndb) { + LOGERR(("Db::doFLush: no ndb??\n")); + return false; + } + string ermsg; + try { + m_ndb->xwdb.flush(); + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("Db::doFlush: flush() failed: %s\n", ermsg.c_str())); + return false; + } + m_flushtxtsz = m_curtxtsz; + return true; +} + // Test if doc given by udi has changed since last indexed (test sigs) bool Db::needUpdate(const string &udi, const string& sig) { diff --git a/src/rcldb/rcldb.h 
b/src/rcldb/rcldb.h index 5238ebbc..cdf8b9e6 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -377,6 +377,19 @@ class Db { */ static void setInPlaceReset() {o_inPlaceReset = true;} + /** Flush interval get/set. This is used by the first indexing + pass to override the config value and flush more rapidly + initially so that the user can quickly play with queries */ + int getFlushMb() + { + return m_flushMb; + } + void setFlushMb(int mb) + { + m_flushMb = mb; + } + bool doFlush(); + /* This has to be public for access by embedded Query::Native */ Native *m_ndb; private: diff --git a/src/utils/fstreewalk.cpp b/src/utils/fstreewalk.cpp index b9683e2c..f8853401 100644 --- a/src/utils/fstreewalk.cpp +++ b/src/utils/fstreewalk.cpp @@ -20,6 +20,7 @@ #ifndef TEST_FSTREEWALK +#include #include #include #include @@ -58,8 +59,15 @@ public: }; class FsTreeWalker::Internal { +public: + Internal(int opts) + : options(opts), depthswitch(4), maxdepth(-1), errors(0) + { + } int options; int depthswitch; + int maxdepth; + int basedepth; stringstream reason; vector skippedNames; vector skippedPaths; @@ -74,17 +82,11 @@ class FsTreeWalker::Internal { reason << call << "(" << param << ") : " << errno << " : " << strerror(errno) << endl; } - friend class FsTreeWalker; }; FsTreeWalker::FsTreeWalker(int opts) { - data = new Internal; - if (data) { - data->options = opts; - data->depthswitch = 4; - data->errors = 0; - } + data = new Internal(opts); } FsTreeWalker::~FsTreeWalker() @@ -92,11 +94,30 @@ FsTreeWalker::~FsTreeWalker() delete data; } -void FsTreeWalker::setOpts(Options opts, int depthswitch) +void FsTreeWalker::setOpts(int opts) { if (data) { data->options = opts; - data->depthswitch = depthswitch; + } +} +int FsTreeWalker::getOpts() +{ + if (data) { + return data->options; + } else { + return 0; + } +} +void FsTreeWalker::setDepthSwitch(int ds) +{ + if (data) { + data->depthswitch = ds; + } +} +void FsTreeWalker::setMaxDepth(int md) +{ + if (data) { + data->maxdepth = 
md; } } @@ -198,8 +219,7 @@ FsTreeWalker::Status FsTreeWalker::walk(const string& _top, data->options |= FtwTravNatural; } - int basedepth = slashcount(top); // Only used for breadthThenDepth - + data->basedepth = slashcount(top); // Only used for breadthxx struct stat st; // We always follow symlinks at this point. Makes more sense. if (stat(top.c_str(), &st) == -1) { @@ -240,7 +260,7 @@ FsTreeWalker::Status FsTreeWalker::walk(const string& _top, if (data->options & FtwTravBreadthThenDepth) { // Check if new depth warrants switch to depth first // traversal (will happen on next loop iteration). - int curdepth = slashcount(dir) - basedepth; + int curdepth = slashcount(dir) - data->basedepth; if (curdepth >= data->depthswitch) { //fprintf(stderr, "SWITCHING TO DEPTH FIRST\n"); data->options &= ~FtwTravMask; @@ -309,6 +329,13 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top, return status; } + + int curdepth = slashcount(top) - data->basedepth; + if (data->maxdepth >= 0 && curdepth >= data->maxdepth) { + LOGDEB1(("FsTreeWalker::iwalk: Maxdepth reached: [%s]\n", top.c_str())); + return status; + } + // This is a directory, read it and process entries: // Detect if directory already seen. This could just be several @@ -345,6 +372,9 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top, while ((ent = readdir(d)) != 0) { string fn; struct stat st; + // Maybe skip dotfiles + if ((data->options & FtwSkipDotFiles) && ent->d_name[0] == '.') + continue; // Skip . and .. 
if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) continue; @@ -439,6 +469,8 @@ static int op_flags; #define OPT_m 0x80 #define OPT_L 0x100 #define OPT_w 0x200 +#define OPT_M 0x400 +#define OPT_D 0x800 class myCB : public FsTreeWalkerCB { public: @@ -489,6 +521,8 @@ static char usage [] = " -d : use almost depth first (dir files, then subdirs)\n" " -m : use breadth up to 4 deep then switch to -d\n" " -w : unset default FNM_PATHNAME when using fnmatch() to match skipped paths\n" +" -M : limit depth (works with -b/m/d)\n" +" -D : skip dotfiles\n" ; static void Usage(void) @@ -501,70 +535,80 @@ int main(int argc, const char **argv) { vector patterns; vector paths; + int maxdepth = -1; + thisprog = argv[0]; argc--; argv++; + while (argc > 0 && **argv == '-') { + (*argv)++; + if (!(**argv)) + /* Cas du "adb - core" */ + Usage(); + while (**argv) + switch (*(*argv)++) { + case 'b': op_flags |= OPT_b; break; + case 'c': op_flags |= OPT_c; break; + case 'd': op_flags |= OPT_d; break; + case 'D': op_flags |= OPT_D; break; + case 'L': op_flags |= OPT_L; break; + case 'm': op_flags |= OPT_m; break; + case 'M': op_flags |= OPT_M; if (argc < 2) Usage(); + maxdepth = atoi(*(++argv)); + argc--; + goto b1; + case 'p': op_flags |= OPT_p; if (argc < 2) Usage(); + patterns.push_back(*(++argv)); + argc--; + goto b1; + case 'P': op_flags |= OPT_P; if (argc < 2) Usage(); + paths.push_back(*(++argv)); + argc--; + goto b1; + case 'r': op_flags |= OPT_r; break; + case 'w': op_flags |= OPT_w; break; + default: Usage(); break; + } + b1: argc--; argv++; + } - while (argc > 0 && **argv == '-') { - (*argv)++; - if (!(**argv)) - /* Cas du "adb - core" */ - Usage(); - while (**argv) - switch (*(*argv)++) { - case 'b': op_flags |= OPT_b; break; - case 'c': op_flags |= OPT_c; break; - case 'd': op_flags |= OPT_d; break; - case 'L': op_flags |= OPT_L; break; - case 'm': op_flags |= OPT_m; break; - case 'p': op_flags |= OPT_p; if (argc < 2) Usage(); - patterns.push_back(*(++argv)); - 
argc--; - goto b1; - case 'P': op_flags |= OPT_P; if (argc < 2) Usage(); - paths.push_back(*(++argv)); - argc--; - goto b1; - case 'r': op_flags |= OPT_r; break; - case 'w': op_flags |= OPT_w; break; - default: Usage(); break; - } - b1: argc--; argv++; - } + if (argc != 1) + Usage(); + string topdir = *argv++;argc--; - if (argc != 1) - Usage(); - string topdir = *argv++;argc--; + int opt = 0; + if (op_flags & OPT_r) + opt |= FsTreeWalker::FtwNoRecurse; + if (op_flags & OPT_c) + opt |= FsTreeWalker::FtwNoCanon; + if (op_flags & OPT_L) + opt |= FsTreeWalker::FtwFollow; + if (op_flags & OPT_D) + opt |= FsTreeWalker::FtwSkipDotFiles; - int opt = 0; - if (op_flags & OPT_r) - opt |= FsTreeWalker::FtwNoRecurse; - if (op_flags & OPT_c) - opt |= FsTreeWalker::FtwNoCanon; - if (op_flags & OPT_L) - opt |= FsTreeWalker::FtwFollow; + if (op_flags & OPT_b) + opt |= FsTreeWalker::FtwTravBreadth; + else if (op_flags & OPT_d) + opt |= FsTreeWalker::FtwTravFilesThenDirs; + else if (op_flags & OPT_m) + opt |= FsTreeWalker::FtwTravBreadthThenDepth; - if (op_flags & OPT_b) - opt |= FsTreeWalker::FtwTravBreadth; - else if (op_flags & OPT_d) - opt |= FsTreeWalker::FtwTravFilesThenDirs; - else if (op_flags & OPT_m) - opt |= FsTreeWalker::FtwTravBreadthThenDepth; - - string reason; - if (!recollinit(0, 0, reason)) { - fprintf(stderr, "Init failed: %s\n", reason.c_str()); - exit(1); - } - if (op_flags & OPT_w) { - FsTreeWalker::setNoFnmPathname(); - } - FsTreeWalker walker(opt); - walker.setSkippedNames(patterns); - walker.setSkippedPaths(paths); - myCB cb; - walker.walk(topdir, cb); - if (walker.getErrCnt() > 0) - cout << walker.getReason(); + string reason; + if (!recollinit(0, 0, reason)) { + fprintf(stderr, "Init failed: %s\n", reason.c_str()); + exit(1); + } + if (op_flags & OPT_w) { + FsTreeWalker::setNoFnmPathname(); + } + FsTreeWalker walker; + walker.setOpts(opt); + walker.setMaxDepth(maxdepth); + walker.setSkippedNames(patterns); + walker.setSkippedPaths(paths); + myCB cb; + 
walker.walk(topdir, cb); + if (walker.getErrCnt() > 0) + cout << walker.getReason(); } #endif // TEST_FSTREEWALK diff --git a/src/utils/fstreewalk.h b/src/utils/fstreewalk.h index d5405501..09090439 100644 --- a/src/utils/fstreewalk.h +++ b/src/utils/fstreewalk.h @@ -56,7 +56,7 @@ class FsTreeWalker { enum Status {FtwOk=0, FtwError=1, FtwStop=2, FtwStatAll = FtwError|FtwStop}; enum Options {FtwOptNone = 0, FtwNoRecurse = 1, FtwFollow = 2, - FtwNoCanon = 4, + FtwNoCanon = 4, FtwSkipDotFiles = 8, // Tree walking options. Natural is close to depth first: process // directory entries as we see them, recursing into subdirectories at // once @@ -76,7 +76,10 @@ class FsTreeWalker { FsTreeWalker(int opts = FtwTravNatural); ~FsTreeWalker(); - void setOpts(Options opts, int depthswitch = 4); + void setOpts(int opts); + int getOpts(); + void setDepthSwitch(int); + void setMaxDepth(int); /** * Begin file system walk. @@ -110,8 +113,8 @@ class FsTreeWalker { private: Status iwalk(const string &dir, struct stat *stp, FsTreeWalkerCB& cb); - class Internal; - Internal *data; + class Internal; + Internal *data; }; class FsTreeWalkerCB {