diff --git a/src/configure b/src/configure index 3ad763ca..c4eb9005 100755 --- a/src/configure +++ b/src/configure @@ -791,9 +791,9 @@ Optional Packages: --with-aspell Use aspell spelling package to provide term expansion to other spellings --with-fam Use File Alteration Monitor for almost real time - indexing of modified files. Give directory where fam - library lives as argument if this is not found by - configure. + indexing of modified files. Give the fam/gamin + library as argument (ie: /usr/lib/libfam.so) if + configure does not find the right one. Some influential environment variables: CC C compiler command @@ -1286,11 +1286,11 @@ case $withFam in no);; yes) for dir in /usr/local/lib /usr/lib;do - if test -f $dir/libfam.so ; then famLibDir=$dir;break;fi + if test -f $dir/libfam.so ; then famLib=$dir/libfam.so;break;fi done ;; *) # The argument should be the path to the fam library - famLibDir=$withFam + famLib=$withFam ;; esac @@ -1305,14 +1305,18 @@ cat >>confdefs.h <<\_ACEOF #define RCL_USE_FAM 1 _ACEOF - if test X$famLibDir != X ; then + if test X$famLib != X ; then + famLibDir=`dirname $famLib` famBase=`dirname $famLibDir` + famBLib=`basename $famLib .so | sed -e s/lib//` if test ! -f $famBase/include/fam.h ; then { { echo "$as_me:$LINENO: error: fam.h not found in $famBase/include. Specify --with-fam=no to disable fam support" >&5 echo "$as_me: error: fam.h not found in $famBase/include. Specify --with-fam=no to disable fam support" >&2;} { (exit 1); exit 1; }; } fi - LIBFAM="-L$famLibDir -lfam" + LIBFAM="-L$famLibDir -l$famBLib" + { echo "$as_me:$LINENO: fam library directive: $LIBFAM" >&5 +echo "$as_me: fam library directive: $LIBFAM" >&6;} cat >>confdefs.h <<_ACEOF #define FAM_INCLUDE "$famBase/include/fam.h" diff --git a/src/configure.ac b/src/configure.ac index 8b6955ce..96f48df7 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -55,29 +55,32 @@ fi # Real time monitoring with FAM AC_ARG_WITH(fam, AC_HELP_STRING([--with-fam], - [Use File Alteration Monitor for almost real time indexing of modified files. Give directory where fam library lives as argument if this is not found by configure.]), + [Use File Alteration Monitor for almost real time indexing of modified files. Give the fam/gamin library as argument (ie: /usr/lib/libfam.so) if configure does not find the right one.]), withFam=$withval, withFam=no) case $withFam in no);; yes) for dir in /usr/local/lib /usr/lib;do - if test -f $dir/libfam.so ; then famLibDir=$dir;break;fi + if test -f $dir/libfam.so ; then famLib=$dir/libfam.so;break;fi done ;; *) # The argument should be the path to the fam library - famLibDir=$withFam + famLib=$withFam ;; esac if test X$withFam != Xno ; then AC_DEFINE(RCL_MONITOR, 1, [Real time monitoring option]) AC_DEFINE(RCL_USE_FAM, 1, [Compile the fam interface]) - if test X$famLibDir != X ; then + if test X$famLib != X ; then + famLibDir=`dirname $famLib` famBase=`dirname $famLibDir` + famBLib=`basename $famLib .so | sed -e s/lib//` if test ! -f $famBase/include/fam.h ; then AC_MSG_ERROR([fam.h not found in $famBase/include. Specify --with-fam=no to disable fam support]) fi - LIBFAM="-L$famLibDir -lfam" + LIBFAM="-L$famLibDir -l$famBLib" + AC_MSG_NOTICE([fam library directive: $LIBFAM]) AC_DEFINE_UNQUOTED(FAM_INCLUDE, "$famBase/include/fam.h", [Path to the fam api include file]) else diff --git a/src/index/Makefile b/src/index/Makefile index eb62ce8b..ab537142 100644 --- a/src/index/Makefile +++ b/src/index/Makefile @@ -9,7 +9,8 @@ all: depend $(PROGS) $(BIGLIB) RECOLLINDEX_OBJS= recollindex.o rclmonrcv.o rclmonprc.o $(BIGLIB) $(MIMELIB) recollindex : $(RECOLLINDEX_OBJS) $(CXX) $(ALL_CXXFLAGS) -o recollindex $(RECOLLINDEX_OBJS) \ - $(BSTATIC) $(LIBXAPIAN) $(LIBICONV) $(BDYNAMIC) -lfam $(LIBSYS) + $(BSTATIC) $(LIBXAPIAN) $(LIBICONV) $(BDYNAMIC) \ + $(LIBFAM) $(LIBSYS) recollindex.o : recollindex.cpp $(CXX) $(ALL_CXXFLAGS) -c -o recollindex.o $< rclmonrcv.o : rclmonrcv.cpp diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 2d7e6b60..5b7dbf89 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.38 2006-10-16 15:33:08 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.39 2006-10-22 14:47:13 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -223,13 +223,13 @@ bool DbIndexer::indexFiles(const list &filenames) if (m_config->getConfParam("idxabsmlen", &abslen)) m_db.setAbstractParams(abslen, -1, -1); struct stat stb; - if (stat(it->c_str(), &stb) != 0) { - LOGERR(("DbIndexer::indexFiles: stat(%s): %s", it->c_str(), + if (lstat(it->c_str(), &stb) != 0) { + LOGERR(("DbIndexer::indexFiles: lstat(%s): %s", it->c_str(), strerror(errno))); continue; } if (!S_ISREG(stb.st_mode)) { - LOGERR(("DbIndexer::indexFiles: %s: not a regular file\n", + LOGDEB2(("DbIndexer::indexFiles: %s: not a regular file\n", it->c_str())); continue; } @@ -257,7 +257,7 @@ bool DbIndexer::indexFiles(const list &filenames) if (processone(*it, &stb, FsTreeWalker::FtwRegular) != FsTreeWalker::FtwOk) { - LOGERR(("DbIndexer::indexFiles: Database error\n")); + LOGERR(("DbIndexer::indexFiles: processone failed\n")); return false; } skipped: @@ -273,6 +273,31 @@ bool DbIndexer::indexFiles(const list &filenames) return true; } + +/** Purge docs for given files out of the database */ +bool DbIndexer::purgeFiles(const list &filenames) +{ + if (!init()) + return false; + + list::const_iterator it; + for (it = filenames.begin(); it != filenames.end(); it++) { + if (!m_db.purgeFile(*it)) { + LOGERR(("DbIndexer::purgeFiles: Database error\n")); + return false; + } + } + + // The close would be done in our destructor, but we want status here + if (!m_db.close()) { + LOGERR(("DbIndexer::purgefiles: error closing database in %s\n", + m_dbdir.c_str())); + return false; + } + return true; +} + + /// This method gets called for every file and directory found by the /// tree walker. /// @@ -308,7 +333,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // without mime type will not be purged from the db, resulting // in possible 'cannot intern file' messages at query time... if (!m_db.needUpdate(fn, stp)) { - LOGDEB(("indexfile: up to date: %s\n", fn.c_str())); + LOGDEB(("processone: up to date: %s\n", fn.c_str())); if (m_updater) { m_updater->status.fn = fn; if (!m_updater->update()) { diff --git a/src/index/indexer.h b/src/index/indexer.h index 5004d1e0..351a9a8e 100644 --- a/src/index/indexer.h +++ b/src/index/indexer.h @@ -16,7 +16,7 @@ */ #ifndef _INDEXER_H_INCLUDED_ #define _INDEXER_H_INCLUDED_ -/* @(#$Id: indexer.h,v 1.19 2006-10-16 15:33:08 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: indexer.h,v 1.20 2006-10-22 14:47:13 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -116,6 +116,9 @@ class DbIndexer : public FsTreeWalkerCB { /** Index a list of files. No db cleaning or stemdb updating */ bool indexFiles(const std::list &files); + /** Purge a list of files. */ + bool purgeFiles(const std::list &files); + /** Create stem database for given language */ bool createStemDb(const string &lang); @@ -141,8 +144,9 @@ class DbIndexer : public FsTreeWalkerCB { bool init(bool rst = false); }; -/** Helper method in recollindex.cpp for initial checks/setup to index +/** Helper methods in recollindex.cpp for initial checks/setup to index * a list of files (either from the monitor or the command line) */ extern bool indexfiles(RclConfig *config, const list &filenames); +extern bool purgefiles(RclConfig *config, const list &filenames); #endif /* _INDEXER_H_INCLUDED_ */ diff --git a/src/index/rclmonprc.cpp b/src/index/rclmonprc.cpp index 319f4105..df7cef92 100644 --- a/src/index/rclmonprc.cpp +++ b/src/index/rclmonprc.cpp @@ -2,7 +2,7 @@ #ifdef RCL_MONITOR #ifndef lint -static char rcsid[] = "@(#$Id: rclmonprc.cpp,v 1.2 2006-10-17 14:41:59 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclmonprc.cpp,v 1.3 2006-10-22 14:47:13 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -117,11 +117,13 @@ RclConfig *RclMonEventQueue::getConfig() return m_data->m_config; } +extern int stopindexing; + bool RclMonEventQueue::ok() { if (m_data == 0) return false; - return m_data->m_ok; + return !stopindexing && m_data->m_ok; } void RclMonEventQueue::setTerminate() @@ -143,11 +145,9 @@ bool RclMonEventQueue::pushEvent(const RclMonEvent &ev) return true; } - pthread_t rcv_thrid; void *rcv_result; extern void *rclMonRcvRun(void *); -extern int stopindexing; bool startMonitor(RclConfig *conf, bool nofork) { @@ -163,7 +163,7 @@ bool startMonitor(RclConfig *conf, bool nofork) LOGDEB(("start_monitoring: entering main loop\n")); while (rclEQ.wait()) { LOGDEB2(("startMonitor: wait returned\n")); - if (stopindexing || !rclEQ.ok()) + if (!rclEQ.ok()) break; list modified; list deleted; @@ -191,11 +191,13 @@ bool startMonitor(RclConfig *conf, bool nofork) // Unlock queue before processing lists rclEQ.unlock(); // Process - indexfiles(conf, modified); + if (!indexfiles(conf, modified)) + break; + if (!purgefiles(conf, deleted)) + break; // Lock queue before waiting again rclEQ.lock(); } - LOGERR(("start_monitoring: rclEQ::wait() failed\n")); - return false; + return true; } #endif // RCL_MONITOR diff --git a/src/index/rclmonrcv.cpp b/src/index/rclmonrcv.cpp index 2bb81d20..043a0923 100644 --- a/src/index/rclmonrcv.cpp +++ b/src/index/rclmonrcv.cpp @@ -1,7 +1,7 @@ #include "autoconfig.h" #ifdef RCL_MONITOR #ifndef lint -static char rcsid[] = "@(#$Id: rclmonrcv.cpp,v 1.2 2006-10-17 14:41:59 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclmonrcv.cpp,v 1.3 2006-10-22 14:47:13 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -35,71 +35,92 @@ static char rcsid[] = "@(#$Id: rclmonrcv.cpp,v 1.2 2006-10-17 14:41:59 dockes Ex */ -/** A small virtual interface for monitors. Suitable to let either of - fam/gamin/ or raw imonitor hide behind */ + +/** A small virtual interface for monitors. Probably suitable to let + either of fam/gamin or raw imonitor hide behind */ class RclMonitor { public: RclMonitor(){} virtual ~RclMonitor() {} virtual bool addWatch(const string& path, const struct stat&) = 0; - virtual bool getEvent(RclMonEvent& ev) = 0; + virtual bool getEvent(RclMonEvent& ev, int secs = -1) = 0; virtual bool ok() = 0; }; -// Monitor factory + +// Monitor factory. We only have one compiled-in kind at a time, no +// need for a 'kind' parameter static RclMonitor *makeMonitor(); -/** Class used to create the directory watches */ +/** This class is a callback for the file system tree walker + class. The callback method alternatively creates the directory + watches and flushes the event queue (to avoid a possible overflow + while we create the watches)*/ class WalkCB : public FsTreeWalkerCB { public: - WalkCB(RclConfig *conf, RclMonitor *mon) - : m_conf(conf), m_mon(mon) + WalkCB(RclConfig *conf, RclMonitor *mon, RclMonEventQueue *queue) + : m_conf(conf), m_mon(mon), m_queue(queue) {} virtual ~WalkCB() {} virtual FsTreeWalker::Status - processone(const string &fn, const struct stat *st, - FsTreeWalker::CbFlag flg) + processone(const string &fn, const struct stat *st, FsTreeWalker::CbFlag flg) { LOGDEB2(("rclMonRcvRun: processone %s m_mon %p m_mon->ok %d\n", fn.c_str(), m_mon, m_mon?m_mon->ok():0)); + // Create watch when entering directory if (flg == FsTreeWalker::FtwDirEnter) { + // Empty whatever events we may already have on queue + while (m_queue->ok() && m_mon->ok()) { + RclMonEvent ev; + if (m_mon->getEvent(ev, 0)) { + m_queue->pushEvent(ev); + } else { + break; + } + } if (!m_mon || !m_mon->ok() || !m_mon->addWatch(fn, *st)) return FsTreeWalker::FtwError; } return FsTreeWalker::FtwOk; } + private: - RclConfig *m_conf; - RclMonitor *m_mon; + RclConfig *m_conf; + RclMonitor *m_mon; + RclMonEventQueue *m_queue; }; -/** Main thread routine: create watches, then wait for events an queue them */ +/** Main thread routine: create watches, then forever wait for and queue events */ void *rclMonRcvRun(void *q) { RclMonEventQueue *queue = (RclMonEventQueue *)q; - RclMonitor *mon; LOGDEB(("rclMonRcvRun: running\n")); + // Create the fam/whatever interface object + RclMonitor *mon; if ((mon = makeMonitor()) == 0) { LOGERR(("rclMonRcvRun: makeMonitor failed\n")); - rclEQ.setTerminate(); + queue->setTerminate(); return 0; } - // Get top directories from config and walk trees to add watches - FsTreeWalker walker; - WalkCB walkcb(queue->getConfig(), mon); + // Get top directories from config list tdl = queue->getConfig()->getTopdirs(); if (tdl.empty()) { LOGERR(("rclMonRcvRun:: top directory list (topdirs param.) not" "found in config or Directory list parse error")); - rclEQ.setTerminate(); + queue->setTerminate(); return 0; } + + // Walk the directory trees to add watches + FsTreeWalker walker; + WalkCB walkcb(queue->getConfig(), mon, queue); for (list::iterator it = tdl.begin(); it != tdl.end(); it++) { queue->getConfig()->setKeyDir(*it); + // Adjust the skipped names according to config walker.clearSkippedNames(); string skipped; if (queue->getConfig()->getConfParam("skippedNames", skipped)) { @@ -112,19 +133,16 @@ void *rclMonRcvRun(void *q) } // Forever wait for monitoring events and add them to queue: - LOGDEB2(("rclMonRcvRun: waiting for events. rclEQ.ok() %d\n", rclEQ.ok())); - while (rclEQ.ok()) { - if (!mon->ok()) - break; + LOGDEB2(("rclMonRcvRun: waiting for events. queue->ok() %d\n", queue->ok())); + while (queue->ok() && mon->ok()) { RclMonEvent ev; if (mon->getEvent(ev)) { - rclEQ.pushEvent(ev); + queue->pushEvent(ev); } - if (!mon->ok()) - break; } + LOGDEB(("rclMonRcvRun: exiting\n")); - rclEQ.setTerminate(); + queue->setTerminate(); return 0; } @@ -133,6 +151,7 @@ void *rclMonRcvRun(void *q) #include #include +// Translate event code to string (debug) static const char *event_name(int code) { static const char *famevent[] = { @@ -149,21 +168,22 @@ static const char *event_name(int code) }; static char unknown_event[20]; - if (code < FAMChanged || code > FAMEndExist) - { + if (code < FAMChanged || code > FAMEndExist) { sprintf(unknown_event, "unknown (%d)", code); return unknown_event; } return famevent[code]; } -// FAM based monitor class +/** FAM based monitor class. We have to keep a record of FAM watch + request numbers to directory names as the event only contain the + request number and file name, not the full path */ class RclFAM : public RclMonitor { public: RclFAM(); virtual ~RclFAM(); virtual bool addWatch(const string& path, const struct stat& st); - virtual bool getEvent(RclMonEvent& ev); + virtual bool getEvent(RclMonEvent& ev, int secs = -1); bool ok() {return m_ok;} private: @@ -213,7 +233,7 @@ bool RclFAM::addWatch(const string& path, const struct stat& st) return true; } -bool RclFAM::getEvent(RclMonEvent& ev) +bool RclFAM::getEvent(RclMonEvent& ev, int secs) { if (!ok()) return false; @@ -224,16 +244,22 @@ bool RclFAM::getEvent(RclMonEvent& ev) FD_ZERO(&readfds); FD_SET(fam_fd, &readfds); - // Note: can't see a reason to set a timeout. Only reason we might - // want out is signal which will break the select call anyway (I - // don't think that there is any system still using the old bsd-type - // syscall re-entrance after signal). LOGDEB(("RclFAM::getEvent: select\n")); - if (select(fam_fd + 1, &readfds, 0, 0, 0) < 0) { + struct timeval timeout; + if (secs >= 0) { + memset(&timeout, 0, sizeof(timeout)); + timeout.tv_sec = secs; + } + int ret; + if ((ret=select(fam_fd + 1, &readfds, 0, 0, secs >= 0 ? &timeout : 0)) < 0) { LOGERR(("RclFAM::getEvent: select failed, errno %d\n", errno)); close(); return false; + } else if (ret == 0) { + // timeout + return false; } + if (!FD_ISSET(fam_fd, &readfds)) return false; @@ -243,8 +269,10 @@ bool RclFAM::getEvent(RclMonEvent& ev) close(); return false; } + map::const_iterator it; - if ((it = m_reqtodir.find(fe.fr.reqnum)) != m_reqtodir.end()) { + if ((fe.filename[0] != '/') && + (it = m_reqtodir.find(fe.fr.reqnum)) != m_reqtodir.end()) { ev.m_path = path_cat(it->second, fe.filename); } else { ev.m_path = fe.filename; @@ -279,7 +307,7 @@ bool RclFAM::getEvent(RclMonEvent& ev) return true; } -// The monitor factory +// The monitor 'factory' static RclMonitor *makeMonitor() { return new RclFAM; diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index 9f0c2a63..571cba17 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.24 2006-10-17 14:41:59 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.25 2006-10-22 14:47:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -42,6 +42,27 @@ using namespace std; ConfIndexer *confindexer; DbIndexer *dbindexer; +int stopindexing; +// Mainly used to request indexing stop, we currently do not use the +// current file name +class MyUpdater : public DbIxStatusUpdater { + public: + virtual bool update() { + if (stopindexing) { + return false; + } + return true; + } +}; +MyUpdater updater; + +static void sigcleanup(int sig) +{ + fprintf(stderr, "sigcleanup\n"); + LOGDEB(("sigcleanup\n")); + stopindexing = 1; +} + static bool makeDbIndexer(RclConfig *config) { string dbdir = config->getDbDir(); @@ -57,7 +78,7 @@ static bool makeDbIndexer(RclConfig *config) } if (!dbindexer) - dbindexer = new DbIndexer(config, dbdir); + dbindexer = new DbIndexer(config, dbdir, &updater); return true; } @@ -116,6 +137,40 @@ bool indexfiles(RclConfig *config, const list &filenames) return dbindexer->indexFiles(myfiles); } +// Delete a list of files. +bool purgefiles(RclConfig *config, const list &filenames) +{ + if (filenames.empty()) + return true; + + if (o_tdl.empty()) { + o_tdl = config->getTopdirs(); + if (o_tdl.empty()) { + fprintf(stderr, "Top directory list (topdirs param.) " + "not found in config or Directory list parse error"); + return false; + } + } + + list myfiles; + for (list::const_iterator it = filenames.begin(); + it != filenames.end(); it++) { + myfiles.push_back(path_canon(*it)); + } + + // Note: we should sort the file names against the topdirs here + // and check for different databases. But we can for now only have + // one database per config, so we set the keydir from the first + // file (which is not really needed...), create the indexer/db and + // go: + config->setKeyDir(path_getfather(*myfiles.begin())); + + if (!makeDbIndexer(config) || !dbindexer) + return false; + else + return dbindexer->purgeFiles(myfiles); +} + // Create additional stem database static bool createstemdb(RclConfig *config, const string &lang) { @@ -134,26 +189,6 @@ static void cleanup() dbindexer = 0; } -int stopindexing; -// Mainly used to request indexing stop, we currently do not use the -// current file name -class MyUpdater : public DbIxStatusUpdater { - public: - virtual bool update() { - if (stopindexing) { - return false; - } - return true; - } -}; -MyUpdater updater; - -static void sigcleanup(int sig) -{ - fprintf(stderr, "sigcleanup\n"); - stopindexing = 1; -} - static const char *thisprog; static int op_flags; #define OPT_MOINS 0x1 diff --git a/src/mk/localdefs.in b/src/mk/localdefs.in index 5c84ae46..308b9561 100644 --- a/src/mk/localdefs.in +++ b/src/mk/localdefs.in @@ -5,6 +5,8 @@ XAPIANCXXFLAGS=@XAPIANCXXFLAGS@ LIBICONV=@LIBICONV@ INCICONV=@INCICONV@ +LIBFAM = @LIBFAM@ + prefix = @prefix@ datadir = @datadir@ RECOLL_DATADIR = ${datadir}/recoll diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 37f98f87..bf8a9271 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.80 2006-10-09 16:37:08 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.81 2006-10-22 14:47:13 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -56,18 +56,21 @@ using namespace std; #ifndef NO_NAMESPACES namespace Rcl { #endif - -// Truncate longer path and uniquize with hash . The goal for this is -// to avoid xapian max term length limitations, not to gain space (we -// gain very little even with very short maxlens like 30) + +// Max length for path terms stored for each document. Truncate +// longer path and uniquize with hash. The goal for this is to avoid +// xapian max term length limitations, not to gain space (we gain very +// little even with very short maxlens like 30) #define PATHHASHLEN 150 // Synthetic abstract marker (to discriminate from abstract actually // found in doc) const static string rclSyntAbs = "?!#@"; -// Data for a xapian database. There could actually be 2 different -// ones for indexing or query as there is not much in common. +// A class for data and methods that would have to expose +// Xapian-specific stuff if they were in Rcl::Db. There could actually be +// 2 different ones for indexing or query as there is not much in +// common. class Native { public: Db *m_db; @@ -96,6 +99,35 @@ class Native { Xapian::docid docid, const list& terms); + /** Compute list of subdocuments for a given path (given by hash) */ + bool subDocs(const string &hash, vector& docids) { + + docids.clear(); + string qterm = "Q"+ hash + "|"; + Xapian::Database db = m_iswritable ? wdb: db; + Xapian::TermIterator it = db.allterms_begin(); + it.skip_to(qterm); + string ermsg; + try { + for (;it != db.allterms_end(); it++) { + // If current term does not begin with qterm or has + // another |, not the same file + if ((*it).find(qterm) != 0 || + (*it).find_last_of("|") != qterm.length() -1) + break; + docids.push_back(*(db.postlist_begin(*it))); + } + return true; + } catch (const Xapian::Error &e) { + ermsg = e.get_msg().c_str(); + } catch (...) { + ermsg= "Unknown error"; + } + LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str())); + return false; + } + + Native(Db *db) : m_db(db), m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) @@ -655,7 +687,6 @@ bool Db::needUpdate(const string &filename, const struct stat *stp) pathHash(filename, hash, PATHHASHLEN); string pterm = "P" + hash; const char *ermsg; - string qterm = "Q"+ hash + "|"; // Look for all documents with this path. We need to look at all // to set their existence flag. We check the update time on the @@ -697,20 +728,16 @@ bool Db::needUpdate(const string &filename, const struct stat *stp) m_ndb->updated[*docid] = true; // Set the existence flag for all the subdocs (if any) - Xapian::TermIterator it = m_ndb->wdb.allterms_begin(); - it.skip_to(qterm); - LOGDEB2(("First qterm: [%s]\n", (*it).c_str())); - for (;it != m_ndb->wdb.allterms_end(); it++) { - // If current term does not begin with qterm or has another |, not - // the same file - if ((*it).find(qterm) != 0 || - (*it).find_last_of("|") != qterm.length() -1) - break; - docid = m_ndb->wdb.postlist_begin(*it); - if (*docid < m_ndb->updated.size()) { - LOGDEB2(("Db::needUpdate: set exist flag for docid %d [%s]\n", - *docid, (*it).c_str())); - m_ndb->updated[*docid] = true; + vector docids; + if (!m_ndb->subDocs(hash, docids)) { + LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n")); + return true; + } + for (vector::iterator it = docids.begin(); + it != docids.end(); it++) { + if (*it < m_ndb->updated.size()) { + LOGDEB2(("Db::needUpdate: set flag for docid %d\n", *it)); + m_ndb->updated[*it] = true; } } return false; @@ -764,7 +791,9 @@ bool Db::createStemDb(const string& lang) /** * This is called at the end of an indexing session, to delete the - * documents for files that are no longer there. + * documents for files that are no longer there. This can ONLY be called + * after a full file-system tree walk, else the file existence flags will + * be wrong. */ bool Db::purge() { @@ -806,6 +835,47 @@ bool Db::purge() return true; } +/** Delete document(s) for given filename */ +bool Db::purgeFile(const string &fn) +{ + LOGDEB(("Db:purgeFile: [%s]\n", fn.c_str())); + if (m_ndb == 0) + return false; + Xapian::WritableDatabase db = m_ndb->wdb; + string hash; + pathHash(fn, hash, PATHHASHLEN); + string pterm = "P" + hash; + const char *ermsg = ""; + try { + Xapian::PostingIterator docid = db.postlist_begin(pterm); + if (docid == db.postlist_end(pterm)) + return true; + LOGDEB(("purgeFile: delete docid %d\n", *docid)); + db.delete_document(*docid); + vector docids; + m_ndb->subDocs(hash, docids); + LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size())); + for (vector::iterator it = docids.begin(); + it != docids.end(); it++) { + LOGDEB2(("Db::purgeFile: delete subdoc %d\n", *it)); + db.delete_document(*it); + } + return true; + } catch (const Xapian::Error &e) { + ermsg = e.get_msg().c_str(); + } catch (const string &s) { + ermsg = s.c_str(); + } catch (const char *s) { + ermsg = s; + } catch (...) { + ermsg = "Caught unknown exception"; + } + if (*ermsg) { + LOGERR(("Db::purgeFile: %s\n", ermsg)); + } + return false; +} + // Splitter callback for breaking query into terms class wsQData : public TextSplitCB { public: @@ -1378,6 +1448,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent) return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms); } + // Retrieve document defined by file name and internal path. bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc) { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 40bcaede..abb637aa 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.37 2006-10-09 16:37:08 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.38 2006-10-22 14:47:13 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -123,16 +123,35 @@ class Db { bool close(); bool isopen(); - int docCnt(); /// Return total docs in db + /** Return total docs in db */ + int docCnt(); - // Update-related functions + + /* Update-related functions */ + + /** Add document. The Doc class should have been filled as much as + possible depending on the document type */ bool add(const string &filename, const Doc &doc, const struct stat *stp); + + /** Test if the db entry for the given filename/stat is up to date */ bool needUpdate(const string &filename, const struct stat *stp); + + /** Remove documents that no longer exist in the file system. This + depends on the update map, which is built during + indexation. This should only be called after a full walk of + the file system, else the update map will not be complete, and + many documents will be deleted that shouldn't */ bool purge(); + + /** Delete document(s) for given filename */ + bool purgeFile(const string &filename); + + /** Create stem expansion database for given language. */ bool createStemDb(const string &lang); + /** Delete stem expansion database for given language. */ bool deleteStemDb(const string &lang); - // Query-related functions + /* Query-related functions */ // Parse query string and initialize query bool setQuery(AdvSearchData &q, int opts = QO_NONE, @@ -144,11 +163,11 @@ class Db { // Stem expansion is performed if lang is not empty list completions(const string &s, const string &lang, int max=20); - /// Add extra database for querying + /** Add extra database for querying */ bool addQueryDb(const string &dir); - /// Remove extra database. if dir == "", remove all. + /** Remove extra database. if dir == "", remove all. */ bool rmQueryDb(const string &dir); - /// Tell if directory seems to hold xapian db + /** Tell if directory seems to hold xapian db */ static bool testDbDir(const string &dir); /** Get document at rank i in current query.