From 6ef7b546f21a98cf30cf85e7889a8ccc0a49892c Mon Sep 17 00:00:00 2001 From: dockes Date: Sat, 14 Nov 2009 08:21:45 +0000 Subject: [PATCH] beaglequeue indexFiles --- src/index/beaglequeue.cpp | 39 ++++++++++- src/index/beaglequeue.h | 7 ++ src/index/fsindexer.cpp | 140 ++++++++++++++++++++++++++++---------- src/index/fsindexer.h | 10 ++- src/index/indexer.cpp | 75 +++++++++----------- src/index/indexer.h | 9 +-- src/index/recollindex.cpp | 4 +- src/index/recollindex.h | 4 +- src/rcldb/rcldb.cpp | 8 ++- src/rcldb/rcldb.h | 2 +- 10 files changed, 204 insertions(+), 94 deletions(-) diff --git a/src/index/beaglequeue.cpp b/src/index/beaglequeue.cpp index 23781d77..c14fd057 100644 --- a/src/index/beaglequeue.cpp +++ b/src/index/beaglequeue.cpp @@ -173,7 +173,8 @@ BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db, { if (!m_config->getConfParam("beaglequeuedir", m_queuedir)) - m_queuedir = path_tildexpand("~/.beagle/ToIndex"); + m_queuedir = path_tildexpand("~/.beagle/ToIndex/"); + path_catslash(m_queuedir); if (m_db && m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) { string reason; @@ -336,6 +337,42 @@ bool BeagleQueueIndexer::index() return true; } +bool BeagleQueueIndexer::indexFiles(list& files) +{ + if (!m_db) { + LOGERR(("BeagleQueueIndexer::indexfiles no db??\n")); + return false; + } + for (list::iterator it = files.begin(); it != files.end(); it++) { + if (it->empty()) + continue;//?? + string father = path_getfather(*it); + if (father.compare(m_queuedir)) { + LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n", + it->c_str())); + continue; + } + string fn = path_getsimple(*it); + if (fn.empty() || fn.at(0) == '.') + continue; + struct stat st; + if (lstat(it->c_str(), &st) != 0) { + LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n", + it->c_str())); + continue; + } + if (!S_ISREG(st.st_mode)) { + LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n", + it->c_str())); + continue; + } + + processone(*it, &st, FsTreeWalker::FtwRegular); + files.erase(it); + } + return true; +} + FsTreeWalker::Status BeagleQueueIndexer::processone(const string &path, const struct stat *stp, diff --git a/src/index/beaglequeue.h b/src/index/beaglequeue.h index 5b2dc129..6ea7952e 100644 --- a/src/index/beaglequeue.h +++ b/src/index/beaglequeue.h @@ -51,6 +51,13 @@ public: FsTreeWalker::Status processone(const string &, const struct stat *, FsTreeWalker::CbFlag); + /** Index a list of files. No db cleaning or stemdb updating */ + bool indexFiles(list& files); + /** Purge a list of files. No way to do this currently and dont want + * to do anything as this is mostly called by the monitor when *I* delete + * files inside the queue dir */ + bool purgeFiles(list& files) {return true;} + bool getFromCache(const string& udi, Rcl::Doc &doc, string& data, string *hittype = 0); private: diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index bdffe09f..beeb25b0 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -86,18 +86,19 @@ bool FsIndexer::init() return false; } } + if (m_tdl.empty()) { + m_tdl = m_config->getTopdirs(); + if (m_tdl.empty()) { + LOGERR(("FsIndexers: no topdirs list defined\n")); + return false; + } + } return true; } // Recursively index each directory in the topdirs: bool FsIndexer::index() { - list topdirs = m_config->getTopdirs(); - if (topdirs.empty()) { - LOGERR(("FsIndexer::indexTrees: no valid topdirs in config\n")); - return false; - } - if (!init()) return false; @@ -108,8 +109,8 @@ bool FsIndexer::index() m_walker.setSkippedPaths(m_config->getSkippedPaths()); - for (list::const_iterator it = topdirs.begin(); - it != topdirs.end(); it++) { + for (list::const_iterator it = m_tdl.begin(); + it != m_tdl.end(); it++) { LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), getDbDir().c_str())); @@ -151,60 +152,119 @@ bool FsIndexer::index() return true; } +static bool matchesSkipped(const list& tdl, + const list& skpnl, + const list& skppl, + const string& path) +{ + // First check what (if any) topdir this is in: + string td; + for (list::const_iterator it = tdl.begin(); it != tdl.end(); it++) { + if (path.find(*it) == 0) { + td = *it; + break; + } + } + if (td.empty()) { + LOGDEB(("FsIndexer::indexFiles: skipping [%s] (ntd)\n", path.c_str())); + return true; + } + + // Check path against skippedPaths. If we find a system where + // FNM_LEADING_DIR is undefined (its unposixy), will have to do this for + // all ascendant paths up to the topdir + for (list::const_iterator it = skppl.begin(); + it != skppl.end(); it++) { + if (fnmatch(it->c_str(), path.c_str(), FNM_PATHNAME|FNM_LEADING_DIR) + == 0) { + LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpp)\n", + path.c_str())); + return true; + } + } + + // Then check all path components up to the topdir against skippedNames + if (!skpnl.empty()) { + string mpath = path; + while (mpath.length() >= td.length() && mpath.length() > 1) { + string fn = path_getsimple(mpath); + for (list::const_iterator it = skpnl.begin(); + it != skpnl.end(); it++) { + LOGDEB2(("Checking [%s] against [%s]\n", + fn.c_str(), it->c_str())); + if (fnmatch(it->c_str(), fn.c_str(), 0) == 0) { + LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpn)\n", + path.c_str())); + return true; + } + } + string::size_type len = mpath.length(); + mpath = path_getfather(mpath); + // getfather normally returns a path ending with /, getsimple + // would then return '' + if (!mpath.empty() && mpath[mpath.size()-1] == '/') + mpath.erase(mpath.size()-1); + // should not be necessary, but lets be prudent. If the + // path did not shorten, something is seriously amiss + // (could be an assert actually) + if (mpath.length() >= len) + return true; + } + } + return false; +} + /** * Index individual files, out of a full tree run. No database purging */ -bool FsIndexer::indexFiles(const list &filenames) +bool FsIndexer::indexFiles(list& files) { if (!init()) return false; - list::const_iterator it; - for (it = filenames.begin(); it != filenames.end(); it++) { - string dir = path_getfather(*it); - m_config->setKeyDir(dir); - int abslen; - if (m_config->getConfParam("idxabsmlen", &abslen)) - m_db->setAbstractParams(abslen, -1, -1); + for (list::iterator it = files.begin(); + it != files.end(); it++) { + struct stat stb; if (lstat(it->c_str(), &stb) != 0) { LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(), strerror(errno))); continue; } - // If we get to indexing directory names one day, will need to test // against dbdir here to avoid modification loops (with rclmon). if (!S_ISREG(stb.st_mode)) { - LOGDEB2(("FsIndexer::indexFiles: %s: not a regular file\n", + LOGDEB(("FsIndexer::indexFiles: skipping [%s] (nr)\n", it->c_str())); continue; } + string dir = path_getfather(*it); + m_config->setKeyDir(dir); static string lstdir; - static list skpl; + static list skpnl; + static list skppl; if (lstdir.compare(dir)) { LOGDEB(("Recomputing list of skipped names\n")); - skpl = m_config->getSkippedNames(); + skpnl = m_config->getSkippedNames(); + skppl = m_config->getSkippedPaths(); lstdir = dir; } - if (!skpl.empty()) { - list::const_iterator skit; - string fn = path_getsimple(*it); - for (skit = skpl.begin(); skit != skpl.end(); skit++) { - if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) { - LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str())); - goto skipped; - } - } - } + + // Check path against indexed areas and skipped names/paths + if (matchesSkipped(m_tdl, skpnl, skppl, *it)) + continue; + + int abslen; + if (m_config->getConfParam("idxabsmlen", &abslen)) + m_db->setAbstractParams(abslen, -1, -1); + if (processone(*it, &stb, FsTreeWalker::FtwRegular) != FsTreeWalker::FtwOk) { LOGERR(("FsIndexer::indexFiles: processone failed\n")); return false; } - skipped: - false; // Need a statement here to make compiler happy ?? + files.erase(it); } return true; @@ -212,19 +272,25 @@ bool FsIndexer::indexFiles(const list &filenames) /** Purge docs for given files out of the database */ -bool FsIndexer::purgeFiles(const list &filenames) +bool FsIndexer::purgeFiles(list& files) { if (!init()) return false; - - list::const_iterator it; - for (it = filenames.begin(); it != filenames.end(); it++) { + for (list::iterator it = files.begin(); + it != files.end(); it++) { string udi; make_udi(*it, "", udi); - if (!m_db->purgeFile(udi)) { + // rcldb::purgefile returns true if the udi was either not + // found or deleted, false only in case of actual error + bool existed; + if (!m_db->purgeFile(udi, &existed)) { LOGERR(("FsIndexer::purgeFiles: Database error\n")); return false; } + // If we actually deleted something, take it off the list + if (existed) { + files.erase(it); + } } return true; diff --git a/src/index/fsindexer.h b/src/index/fsindexer.h index b72fd3e4..2d95c34f 100644 --- a/src/index/fsindexer.h +++ b/src/index/fsindexer.h @@ -18,6 +18,11 @@ #define _fsindexer_h_included_ /* @(#$Id: $ (C) 2009 J.F.Dockes */ +#include +#ifndef NO_NAMESPACES +using std::list; +#endif + #include "fstreewalk.h" #include "rcldb.h" @@ -58,10 +63,10 @@ class FsIndexer : public FsTreeWalkerCB { bool index(); /** Index a list of files. No db cleaning or stemdb updating */ - bool indexFiles(const std::list &files); + bool indexFiles(list &files); /** Purge a list of files. */ - bool purgeFiles(const std::list &files); + bool purgeFiles(list &files); /** Tree walker callback method */ FsTreeWalker::Status @@ -74,6 +79,7 @@ class FsIndexer : public FsTreeWalkerCB { string m_tmpdir; string m_reason; DbIxStatusUpdater *m_updater; + list m_tdl; // The configuration can set attribute fields to be inherited by // all files in a file system area. Ie: set "apptag = thunderbird" diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 281aa781..88425769 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -26,6 +26,8 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.71 2008-12-17 08:01:40 dockes Exp #include #include +#include + #include "debuglog.h" #include "indexer.h" #include "fsindexer.h" @@ -104,44 +106,14 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun) return true; } -bool ConfIndexer::initTopDirs() +bool ConfIndexer::indexFiles(std::list &files) { - if (m_tdl.empty()) { - m_tdl = m_config->getTopdirs(); - if (m_tdl.empty()) { - m_reason = "Top directory list (topdirs param.) " - "not found in config or Directory list parse error"; - return false; - } - } - return true; -} - -bool ConfIndexer::indexFiles(const std::list &files) -{ - if (!initTopDirs()) - return false; - list myfiles; for (list::const_iterator it = files.begin(); it != files.end(); it++) { - string fn = path_canon(*it); - bool ok = false; - // Check that this file name belongs to one of our subtrees - for (list::iterator dit = m_tdl.begin(); - dit != m_tdl.end(); dit++) { - if (fn.find(*dit) == 0) { - myfiles.push_back(fn); - ok = true; - break; - } - } - if (!ok) { - m_reason += string("File ") + fn + string(" not in indexed area\n"); - } + myfiles.push_back(path_canon(*it)); } - if (myfiles.empty()) - return true; + myfiles.sort(); if (!m_db.open(Rcl::Db::DbUpd)) { LOGERR(("ConfIndexer: indexFiles error opening database %s\n", @@ -149,9 +121,21 @@ bool ConfIndexer::indexFiles(const std::list &files) return false; } m_config->setKeyDir(""); + bool ret = false; if (!m_fsindexer) m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); - bool ret = m_fsindexer->indexFiles(files); + if (m_fsindexer) + ret = m_fsindexer->indexFiles(files); + + if (m_dobeagle && !myfiles.empty()) { + if (!m_beagler) + m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater); + if (m_beagler) { + ret = ret && m_beagler->indexFiles(myfiles); + } else { + ret = false; + } + } // The close would be done in our destructor, but we want status here if (!m_db.close()) { @@ -162,31 +146,40 @@ bool ConfIndexer::indexFiles(const std::list &files) return ret; } -bool ConfIndexer::purgeFiles(const std::list &files) +bool ConfIndexer::purgeFiles(std::list &files) { - if (!initTopDirs()) - return false; - list myfiles; for (list::const_iterator it = files.begin(); it != files.end(); it++) { myfiles.push_back(path_canon(*it)); } + myfiles.sort(); if (!m_db.open(Rcl::Db::DbUpd)) { LOGERR(("ConfIndexer: purgeFiles error opening database %s\n", m_config->getDbDir().c_str())); return false; } - + bool ret = false; m_config->setKeyDir(""); if (!m_fsindexer) m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); - bool ret = m_fsindexer->purgeFiles(files); + if (m_fsindexer) + ret = m_fsindexer->purgeFiles(myfiles); + + if (m_dobeagle && !myfiles.empty()) { + if (!m_beagler) + m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater); + if (m_beagler) { + ret = ret && m_beagler->purgeFiles(myfiles); + } else { + ret = false; + } + } // The close would be done in our destructor, but we want status here if (!m_db.close()) { - LOGERR(("ConfIndexer::index: error closing database in %s\n", + LOGERR(("ConfIndexer::purgefiles: error closing database in %s\n", m_config->getDbDir().c_str())); return false; } diff --git a/src/index/indexer.h b/src/index/indexer.h index fc37036b..8b6b4953 100644 --- a/src/index/indexer.h +++ b/src/index/indexer.h @@ -85,10 +85,10 @@ class ConfIndexer { static list getStemmerNames(); /** Index a list of files. No db cleaning or stemdb updating */ - bool indexFiles(const std::list &files); + bool indexFiles(std::list &files); /** Purge a list of files. */ - bool purgeFiles(const std::list &files); + bool purgeFiles(std::list &files); private: RclConfig *m_config; @@ -97,10 +97,7 @@ class ConfIndexer { bool m_dobeagle; BeagleQueueIndexer *m_beagler; DbIxStatusUpdater *m_updater; - string m_reason; - list m_tdl; - - bool initTopDirs(); + string m_reason; }; #endif /* _INDEXER_H_INCLUDED_ */ diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index 3ffae840..fed76b7f 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -97,7 +97,7 @@ static bool makeIndexer(RclConfig *config) // this case we're called repeatedly in the same process, and the // confindexer is only created once by makeIndexer (but the db closed and // flushed every time) -bool indexfiles(RclConfig *config, const list &filenames) +bool indexfiles(RclConfig *config, list &filenames) { if (filenames.empty()) return true; @@ -107,7 +107,7 @@ bool indexfiles(RclConfig *config, const list &filenames) } // Delete a list of files. Same comments about call contexts as indexfiles. -bool purgefiles(RclConfig *config, const list &filenames) +bool purgefiles(RclConfig *config, list &filenames) { if (filenames.empty()) return true; diff --git a/src/index/recollindex.h b/src/index/recollindex.h index 591d83e9..b5ca8a98 100644 --- a/src/index/recollindex.h +++ b/src/index/recollindex.h @@ -20,8 +20,8 @@ /** Helper methods in recollindex.cpp for initial checks/setup to index * a list of files (either from the monitor or the command line) */ -extern bool indexfiles(RclConfig *config, const list &filenames); -extern bool purgefiles(RclConfig *config, const list &filenames); +extern bool indexfiles(RclConfig *config, list &filenames); +extern bool purgefiles(RclConfig *config, list &filenames); extern bool createAuxDbs(RclConfig *config); extern int stopindexing; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index e844c652..87f8df03 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1270,7 +1270,7 @@ bool Db::purge() } /* Delete document(s) for given unique identifier (doc and descendents) */ -bool Db::purgeFile(const string &udi) +bool Db::purgeFile(const string &udi, bool *existed) { LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str())); if (m_ndb == 0 || !m_ndb->m_iswritable) @@ -1280,8 +1280,12 @@ bool Db::purgeFile(const string &udi) string ermsg; try { Xapian::PostingIterator docid = db.postlist_begin(uniterm); - if (docid == db.postlist_end(uniterm)) + if (docid == db.postlist_end(uniterm)) { + if (existed) + *existed = false; return true; + } + *existed = true; LOGDEB(("purgeFile: delete docid %d\n", *docid)); db.delete_document(*docid); vector docids; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index bd7c1cc3..7b82156a 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -121,7 +121,7 @@ class Db { const Doc &doc); /** Delete document(s) for given UDI, including subdocs */ - bool purgeFile(const string &udi); + bool purgeFile(const string &udi, bool *existed = 0); /** Remove documents that no longer exist in the file system. This * depends on the update map, which is built during