Arrange for the default first indexing run for a given user to begin with a quick shallow pass, so that queries return some results soon after indexing starts, avoiding user frustration

This commit is contained in:
Jean-Francois Dockes 2013-04-18 14:29:32 +02:00
parent 719f37ded7
commit f05cae7344
8 changed files with 214 additions and 87 deletions

View File

@ -178,7 +178,7 @@ bool FsIndexer::init()
} }
// Recursively index each directory in the topdirs: // Recursively index each directory in the topdirs:
bool FsIndexer::index() bool FsIndexer::index(bool quickshallow)
{ {
Chrono chron; Chrono chron;
if (!init()) if (!init())
@ -193,6 +193,11 @@ bool FsIndexer::index()
} }
m_walker.setSkippedPaths(m_config->getSkippedPaths()); m_walker.setSkippedPaths(m_config->getSkippedPaths());
if (quickshallow) {
m_walker.setOpts(m_walker.getOpts() | FsTreeWalker::FtwSkipDotFiles);
m_walker.setMaxDepth(2);
}
for (vector<string>::const_iterator it = m_tdl.begin(); for (vector<string>::const_iterator it = m_tdl.begin();
it != m_tdl.end(); it++) { it != m_tdl.end(); it++) {
LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(),
@ -204,11 +209,13 @@ bool FsIndexer::index()
// Adjust the "follow symlinks" option // Adjust the "follow symlinks" option
bool follow; bool follow;
int opts = m_walker.getOpts();
if (m_config->getConfParam("followLinks", &follow) && follow) { if (m_config->getConfParam("followLinks", &follow) && follow) {
m_walker.setOpts(FsTreeWalker::FtwFollow); opts |= FsTreeWalker::FtwFollow;
} else { } else {
m_walker.setOpts(FsTreeWalker::FtwOptNone); opts &= ~FsTreeWalker::FtwFollow;
} }
m_walker.setOpts(opts);
int abslen; int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen)) if (m_config->getConfParam("idxabsmlen", &abslen))

View File

@ -60,7 +60,7 @@ class FsIndexer : public FsTreeWalkerCB {
* We open the database, * We open the database,
* then call a file system walk for each top-level directory. * then call a file system walk for each top-level directory.
*/ */
bool index(); bool index(bool quickshallow = 0);
/** Index a list of files. No db cleaning or stemdb updating */ /** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(std::list<std::string> &files, ConfIndexer::IxFlag f = bool indexFiles(std::list<std::string> &files, ConfIndexer::IxFlag f =

View File

@ -50,6 +50,46 @@ ConfIndexer::~ConfIndexer()
deleteZ(m_beagler); deleteZ(m_beagler);
} }
// Determine whether this is likely the first time that the user runs
// indexing. We do not look at the xapiandb, as it may have been
// explicitly removed for valid reasons, but at the indexing status
// file, which should be nonexistent or empty only before any indexing
// has ever run.
//
// @return true if the status file is missing or empty AND the topdirs
//   list holds exactly one entry, equal to
//   path_canon(path_tildexpand("~")); false otherwise.
bool ConfIndexer::runFirstIndexing()
{
    // A non-empty status file means that some indexing already ran.
    // This is the normal situation for every run after the first one:
    // just answer "no" so that the caller proceeds with regular
    // indexing. This is a predicate and must never terminate the process.
    struct stat st;
    if (stat(m_config->getIdxStatusFile().c_str(), &st) == 0 &&
        st.st_size > 0) {
        LOGDEB0(("ConfIndexer::runFirstIndexing: no: status file not empty\n"));
        return false;
    }

    // And only do this if the user has kept the default topdirs (~).
    vector<string> tdl = m_config->getTopdirs();
    if (tdl.size() != 1 ||
        tdl[0].compare(path_canon(path_tildexpand("~"))) != 0) {
        LOGDEB0(("ConfIndexer::runFirstIndexing: no: not home only\n"));
        return false;
    }
    return true;
}
// Run the quick initial file system pass: index with the "quickshallow"
// flag set while the db flush threshold is temporarily lowered to 2,
// then force a flush and put the previous threshold back.
//
// @return false only if the FsIndexer could not be created, else true
//   (the result of the indexing pass itself is not reported).
bool ConfIndexer::firstFsIndexingSequence()
{
    LOGDEB(("ConfIndexer::firstFsIndexingSequence\n"));

    // Always start from a fresh file system indexer
    deleteZ(m_fsindexer);
    m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
    if (m_fsindexer == 0) {
        return false;
    }

    // Remember the current flush threshold so that it can be restored
    // once the pass is done.
    const int savedFlushMb = m_db.getFlushMb();
    m_db.setFlushMb(2);

    m_fsindexer->index(true);

    // Flush whatever the pass left pending before restoring the setting
    m_db.doFlush();
    m_db.setFlushMb(savedFlushMb);
    return true;
}
bool ConfIndexer::index(bool resetbefore, ixType typestorun) bool ConfIndexer::index(bool resetbefore, ixType typestorun)
{ {
Rcl::Db::OpenMode mode = resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd; Rcl::Db::OpenMode mode = resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
@ -61,6 +101,9 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun)
m_config->setKeyDir(cstr_null); m_config->setKeyDir(cstr_null);
if (typestorun & IxTFs) { if (typestorun & IxTFs) {
if (runFirstIndexing()) {
firstFsIndexingSequence();
}
deleteZ(m_fsindexer); deleteZ(m_fsindexer);
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
if (!m_fsindexer || !m_fsindexer->index()) { if (!m_fsindexer || !m_fsindexer->index()) {

View File

@ -138,6 +138,13 @@ class ConfIndexer {
BeagleQueueIndexer *m_beagler; BeagleQueueIndexer *m_beagler;
DbIxStatusUpdater *m_updater; DbIxStatusUpdater *m_updater;
string m_reason; string m_reason;
// The first time we index, we do things a bit differently to
// avoid user frustration (make at least some results available
// fast by using several passes, the first ones to index common
// interesting locations).
bool runFirstIndexing();
bool firstFsIndexingSequence();
}; };
#endif /* _INDEXER_H_INCLUDED_ */ #endif /* _INDEXER_H_INCLUDED_ */

View File

@ -1266,20 +1266,30 @@ bool Db::maybeflush(off_t moretext)
if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) { if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) {
LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n", LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n",
m_flushMb)); m_flushMb));
string ermsg; return doFlush();
try {
m_ndb->xwdb.flush();
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db::add: flush() failed: %s\n", ermsg.c_str()));
return false;
}
m_flushtxtsz = m_curtxtsz;
} }
} }
return true; return true;
} }
bool Db::doFlush()
{
if (!m_ndb) {
LOGERR(("Db::doFLush: no ndb??\n"));
return false;
}
string ermsg;
try {
m_ndb->xwdb.flush();
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db::doFlush: flush() failed: %s\n", ermsg.c_str()));
return false;
}
m_flushtxtsz = m_curtxtsz;
return true;
}
// Test if doc given by udi has changed since last indexed (test sigs) // Test if doc given by udi has changed since last indexed (test sigs)
bool Db::needUpdate(const string &udi, const string& sig) bool Db::needUpdate(const string &udi, const string& sig)
{ {

View File

@ -377,6 +377,19 @@ class Db {
*/ */
static void setInPlaceReset() {o_inPlaceReset = true;} static void setInPlaceReset() {o_inPlaceReset = true;}
/** Flush interval accessors. The first indexing pass uses these to
    override the configured value and flush more often at the start,
    so that the user can quickly play with queries.
    @return the current flush threshold (m_flushMb). */
int getFlushMb() {return m_flushMb;}
/** Set the flush threshold (m_flushMb) to the given value. */
void setFlushMb(int mb) {m_flushMb = mb;}
bool doFlush();
/* This has to be public for access by embedded Query::Native */ /* This has to be public for access by embedded Query::Native */
Native *m_ndb; Native *m_ndb;
private: private:

View File

@ -20,6 +20,7 @@
#ifndef TEST_FSTREEWALK #ifndef TEST_FSTREEWALK
#include <stdio.h>
#include <dirent.h> #include <dirent.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <errno.h> #include <errno.h>
@ -58,8 +59,15 @@ public:
}; };
class FsTreeWalker::Internal { class FsTreeWalker::Internal {
public:
Internal(int opts)
: options(opts), depthswitch(4), maxdepth(-1), errors(0)
{
}
int options; int options;
int depthswitch; int depthswitch;
int maxdepth;
int basedepth;
stringstream reason; stringstream reason;
vector<string> skippedNames; vector<string> skippedNames;
vector<string> skippedPaths; vector<string> skippedPaths;
@ -74,17 +82,11 @@ class FsTreeWalker::Internal {
reason << call << "(" << param << ") : " << errno << " : " << reason << call << "(" << param << ") : " << errno << " : " <<
strerror(errno) << endl; strerror(errno) << endl;
} }
friend class FsTreeWalker;
}; };
FsTreeWalker::FsTreeWalker(int opts) FsTreeWalker::FsTreeWalker(int opts)
{ {
data = new Internal; data = new Internal(opts);
if (data) {
data->options = opts;
data->depthswitch = 4;
data->errors = 0;
}
} }
FsTreeWalker::~FsTreeWalker() FsTreeWalker::~FsTreeWalker()
@ -92,11 +94,30 @@ FsTreeWalker::~FsTreeWalker()
delete data; delete data;
} }
void FsTreeWalker::setOpts(Options opts, int depthswitch) void FsTreeWalker::setOpts(int opts)
{ {
if (data) { if (data) {
data->options = opts; data->options = opts;
data->depthswitch = depthswitch; }
}
// Return the current option bits, or 0 if there is no internal data
// object.
int FsTreeWalker::getOpts()
{
    return data ? data->options : 0;
}
// Store the depth-switch value. Does nothing if there is no internal
// data object.
void FsTreeWalker::setDepthSwitch(int ds)
{
    if (!data)
        return;
    data->depthswitch = ds;
}
void FsTreeWalker::setMaxDepth(int md)
{
if (data) {
data->maxdepth = md;
} }
} }
@ -198,8 +219,7 @@ FsTreeWalker::Status FsTreeWalker::walk(const string& _top,
data->options |= FtwTravNatural; data->options |= FtwTravNatural;
} }
int basedepth = slashcount(top); // Only used for breadthThenDepth data->basedepth = slashcount(top); // Only used for breadthxx
struct stat st; struct stat st;
// We always follow symlinks at this point. Makes more sense. // We always follow symlinks at this point. Makes more sense.
if (stat(top.c_str(), &st) == -1) { if (stat(top.c_str(), &st) == -1) {
@ -240,7 +260,7 @@ FsTreeWalker::Status FsTreeWalker::walk(const string& _top,
if (data->options & FtwTravBreadthThenDepth) { if (data->options & FtwTravBreadthThenDepth) {
// Check if new depth warrants switch to depth first // Check if new depth warrants switch to depth first
// traversal (will happen on next loop iteration). // traversal (will happen on next loop iteration).
int curdepth = slashcount(dir) - basedepth; int curdepth = slashcount(dir) - data->basedepth;
if (curdepth >= data->depthswitch) { if (curdepth >= data->depthswitch) {
//fprintf(stderr, "SWITCHING TO DEPTH FIRST\n"); //fprintf(stderr, "SWITCHING TO DEPTH FIRST\n");
data->options &= ~FtwTravMask; data->options &= ~FtwTravMask;
@ -309,6 +329,13 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top,
return status; return status;
} }
int curdepth = slashcount(top) - data->basedepth;
if (data->maxdepth >= 0 && curdepth >= data->maxdepth) {
LOGDEB1(("FsTreeWalker::iwalk: Maxdepth reached: [%s]\n", top.c_str()));
return status;
}
// This is a directory, read it and process entries: // This is a directory, read it and process entries:
// Detect if directory already seen. This could just be several // Detect if directory already seen. This could just be several
@ -345,6 +372,9 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top,
while ((ent = readdir(d)) != 0) { while ((ent = readdir(d)) != 0) {
string fn; string fn;
struct stat st; struct stat st;
// Maybe skip dotfiles
if ((data->options & FtwSkipDotFiles) && ent->d_name[0] == '.')
continue;
// Skip . and .. // Skip . and ..
if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
continue; continue;
@ -439,6 +469,8 @@ static int op_flags;
#define OPT_m 0x80 #define OPT_m 0x80
#define OPT_L 0x100 #define OPT_L 0x100
#define OPT_w 0x200 #define OPT_w 0x200
#define OPT_M 0x400
#define OPT_D 0x800
class myCB : public FsTreeWalkerCB { class myCB : public FsTreeWalkerCB {
public: public:
@ -489,6 +521,8 @@ static char usage [] =
" -d : use almost depth first (dir files, then subdirs)\n" " -d : use almost depth first (dir files, then subdirs)\n"
" -m : use breadth up to 4 deep then switch to -d\n" " -m : use breadth up to 4 deep then switch to -d\n"
" -w : unset default FNM_PATHNAME when using fnmatch() to match skipped paths\n" " -w : unset default FNM_PATHNAME when using fnmatch() to match skipped paths\n"
" -M <depth>: limit depth (works with -b/m/d)\n"
" -D : skip dotfiles\n"
; ;
static void static void
Usage(void) Usage(void)
@ -501,70 +535,80 @@ int main(int argc, const char **argv)
{ {
vector<string> patterns; vector<string> patterns;
vector<string> paths; vector<string> paths;
int maxdepth = -1;
thisprog = argv[0]; thisprog = argv[0];
argc--; argv++; argc--; argv++;
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Cas du "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'b': op_flags |= OPT_b; break;
case 'c': op_flags |= OPT_c; break;
case 'd': op_flags |= OPT_d; break;
case 'D': op_flags |= OPT_D; break;
case 'L': op_flags |= OPT_L; break;
case 'm': op_flags |= OPT_m; break;
case 'M': op_flags |= OPT_M; if (argc < 2) Usage();
maxdepth = atoi(*(++argv));
argc--;
goto b1;
case 'p': op_flags |= OPT_p; if (argc < 2) Usage();
patterns.push_back(*(++argv));
argc--;
goto b1;
case 'P': op_flags |= OPT_P; if (argc < 2) Usage();
paths.push_back(*(++argv));
argc--;
goto b1;
case 'r': op_flags |= OPT_r; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
b1: argc--; argv++;
}
while (argc > 0 && **argv == '-') { if (argc != 1)
(*argv)++; Usage();
if (!(**argv)) string topdir = *argv++;argc--;
/* Cas du "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'b': op_flags |= OPT_b; break;
case 'c': op_flags |= OPT_c; break;
case 'd': op_flags |= OPT_d; break;
case 'L': op_flags |= OPT_L; break;
case 'm': op_flags |= OPT_m; break;
case 'p': op_flags |= OPT_p; if (argc < 2) Usage();
patterns.push_back(*(++argv));
argc--;
goto b1;
case 'P': op_flags |= OPT_P; if (argc < 2) Usage();
paths.push_back(*(++argv));
argc--;
goto b1;
case 'r': op_flags |= OPT_r; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
b1: argc--; argv++;
}
if (argc != 1) int opt = 0;
Usage(); if (op_flags & OPT_r)
string topdir = *argv++;argc--; opt |= FsTreeWalker::FtwNoRecurse;
if (op_flags & OPT_c)
opt |= FsTreeWalker::FtwNoCanon;
if (op_flags & OPT_L)
opt |= FsTreeWalker::FtwFollow;
if (op_flags & OPT_D)
opt |= FsTreeWalker::FtwSkipDotFiles;
int opt = 0; if (op_flags & OPT_b)
if (op_flags & OPT_r) opt |= FsTreeWalker::FtwTravBreadth;
opt |= FsTreeWalker::FtwNoRecurse; else if (op_flags & OPT_d)
if (op_flags & OPT_c) opt |= FsTreeWalker::FtwTravFilesThenDirs;
opt |= FsTreeWalker::FtwNoCanon; else if (op_flags & OPT_m)
if (op_flags & OPT_L) opt |= FsTreeWalker::FtwTravBreadthThenDepth;
opt |= FsTreeWalker::FtwFollow;
if (op_flags & OPT_b) string reason;
opt |= FsTreeWalker::FtwTravBreadth; if (!recollinit(0, 0, reason)) {
else if (op_flags & OPT_d) fprintf(stderr, "Init failed: %s\n", reason.c_str());
opt |= FsTreeWalker::FtwTravFilesThenDirs; exit(1);
else if (op_flags & OPT_m) }
opt |= FsTreeWalker::FtwTravBreadthThenDepth; if (op_flags & OPT_w) {
FsTreeWalker::setNoFnmPathname();
string reason; }
if (!recollinit(0, 0, reason)) { FsTreeWalker walker;
fprintf(stderr, "Init failed: %s\n", reason.c_str()); walker.setOpts(opt);
exit(1); walker.setMaxDepth(maxdepth);
} walker.setSkippedNames(patterns);
if (op_flags & OPT_w) { walker.setSkippedPaths(paths);
FsTreeWalker::setNoFnmPathname(); myCB cb;
} walker.walk(topdir, cb);
FsTreeWalker walker(opt); if (walker.getErrCnt() > 0)
walker.setSkippedNames(patterns); cout << walker.getReason();
walker.setSkippedPaths(paths);
myCB cb;
walker.walk(topdir, cb);
if (walker.getErrCnt() > 0)
cout << walker.getReason();
} }
#endif // TEST_FSTREEWALK #endif // TEST_FSTREEWALK

View File

@ -56,7 +56,7 @@ class FsTreeWalker {
enum Status {FtwOk=0, FtwError=1, FtwStop=2, enum Status {FtwOk=0, FtwError=1, FtwStop=2,
FtwStatAll = FtwError|FtwStop}; FtwStatAll = FtwError|FtwStop};
enum Options {FtwOptNone = 0, FtwNoRecurse = 1, FtwFollow = 2, enum Options {FtwOptNone = 0, FtwNoRecurse = 1, FtwFollow = 2,
FtwNoCanon = 4, FtwNoCanon = 4, FtwSkipDotFiles = 8,
// Tree walking options. Natural is close to depth first: process // Tree walking options. Natural is close to depth first: process
// directory entries as we see them, recursing into subdirectories at // directory entries as we see them, recursing into subdirectories at
// once // once
@ -76,7 +76,10 @@ class FsTreeWalker {
FsTreeWalker(int opts = FtwTravNatural); FsTreeWalker(int opts = FtwTravNatural);
~FsTreeWalker(); ~FsTreeWalker();
void setOpts(Options opts, int depthswitch = 4); void setOpts(int opts);
int getOpts();
void setDepthSwitch(int);
void setMaxDepth(int);
/** /**
* Begin file system walk. * Begin file system walk.
@ -111,7 +114,7 @@ class FsTreeWalker {
private: private:
Status iwalk(const string &dir, struct stat *stp, FsTreeWalkerCB& cb); Status iwalk(const string &dir, struct stat *stp, FsTreeWalkerCB& cb);
class Internal; class Internal;
Internal *data; Internal *data;
}; };
class FsTreeWalkerCB { class FsTreeWalkerCB {