beaglequeue indexFiles

This commit is contained in:
dockes 2009-11-14 08:21:45 +00:00
parent bbba826c06
commit 6ef7b546f2
10 changed files with 204 additions and 94 deletions

View File

@ -173,7 +173,8 @@ BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
{ {
if (!m_config->getConfParam("beaglequeuedir", m_queuedir)) if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
m_queuedir = path_tildexpand("~/.beagle/ToIndex"); m_queuedir = path_tildexpand("~/.beagle/ToIndex/");
path_catslash(m_queuedir);
if (m_db && m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) { if (m_db && m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
string reason; string reason;
@ -336,6 +337,42 @@ bool BeagleQueueIndexer::index()
return true; return true;
} }
/**
 * Index individual files out of the queue directory.
 *
 * Only regular, non-hidden files whose parent directory (as computed by
 * path_getfather()) is exactly m_queuedir are handed to processone().
 * Every entry which was handed to processone() is erased from the list;
 * entries which were skipped (empty, outside the queue directory, hidden,
 * not stat-able, not a regular file) are left in place.
 *
 * @param files list of file paths; modified in place as described above.
 * @return false if there is no database to index into, else true. The
 *   status returned by processone() is not checked here.
 */
bool BeagleQueueIndexer::indexFiles(list<string>& files)
{
    if (!m_db) {
        LOGERR(("BeagleQueueIndexer::indexfiles no db??\n"));
        return false;
    }
    for (list<string>::iterator it = files.begin(); it != files.end(); ) {
        // Step the loop iterator *before* possibly erasing the current
        // element: erasing a list node only invalidates iterators to that
        // node, so 'it' (already on the successor) stays valid. Erasing
        // and then incrementing the erased iterator is undefined behaviour.
        list<string>::iterator cur = it++;

        if (cur->empty())
            continue;//??

        // Not directly inside the queue directory: not for us
        string father = path_getfather(*cur);
        if (father.compare(m_queuedir)) {
            LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n",
                    cur->c_str()));
            continue;
        }

        // Skip entries with an empty or dot-prefixed simple name
        string fn = path_getsimple(*cur);
        if (fn.empty() || fn.at(0) == '.')
            continue;

        struct stat st;
        if (lstat(cur->c_str(), &st) != 0) {
            LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n",
                    cur->c_str()));
            continue;
        }
        if (!S_ISREG(st.st_mode)) {
            LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n",
                    cur->c_str()));
            continue;
        }

        processone(*cur, &st, FsTreeWalker::FtwRegular);
        // Handled here: take it off the caller's list
        files.erase(cur);
    }
    return true;
}
FsTreeWalker::Status FsTreeWalker::Status
BeagleQueueIndexer::processone(const string &path, BeagleQueueIndexer::processone(const string &path,
const struct stat *stp, const struct stat *stp,

View File

@ -51,6 +51,13 @@ public:
FsTreeWalker::Status FsTreeWalker::Status
processone(const string &, const struct stat *, FsTreeWalker::CbFlag); processone(const string &, const struct stat *, FsTreeWalker::CbFlag);
/** Index a list of files. No db cleaning or stemdb updating.
 * The list is not const: the implementation erases the entries it has
 * passed to processone(), entries which it skips are left in the list */
bool indexFiles(list<string>& files);
/** Purge a list of files. This is a deliberate no-op which always returns
 * true and leaves the list untouched: there is no way to do this currently,
 * and we don't want to do anything anyway, as this is mostly called by the
 * monitor when *I* delete files inside the queue dir */
bool purgeFiles(list<string>& files) {return true;}
bool getFromCache(const string& udi, Rcl::Doc &doc, string& data, bool getFromCache(const string& udi, Rcl::Doc &doc, string& data,
string *hittype = 0); string *hittype = 0);
private: private:

View File

@ -86,18 +86,19 @@ bool FsIndexer::init()
return false; return false;
} }
} }
if (m_tdl.empty()) {
m_tdl = m_config->getTopdirs();
if (m_tdl.empty()) {
LOGERR(("FsIndexers: no topdirs list defined\n"));
return false;
}
}
return true; return true;
} }
// Recursively index each directory in the topdirs: // Recursively index each directory in the topdirs:
bool FsIndexer::index() bool FsIndexer::index()
{ {
list<string> topdirs = m_config->getTopdirs();
if (topdirs.empty()) {
LOGERR(("FsIndexer::indexTrees: no valid topdirs in config\n"));
return false;
}
if (!init()) if (!init())
return false; return false;
@ -108,8 +109,8 @@ bool FsIndexer::index()
m_walker.setSkippedPaths(m_config->getSkippedPaths()); m_walker.setSkippedPaths(m_config->getSkippedPaths());
for (list<string>::const_iterator it = topdirs.begin(); for (list<string>::const_iterator it = m_tdl.begin();
it != topdirs.end(); it++) { it != m_tdl.end(); it++) {
LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(),
getDbDir().c_str())); getDbDir().c_str()));
@ -151,60 +152,119 @@ bool FsIndexer::index()
return true; return true;
} }
static bool matchesSkipped(const list<string>& tdl,
const list<string>& skpnl,
const list<string>& skppl,
const string& path)
{
// First check what (if any) topdir this is in:
string td;
for (list<string>::const_iterator it = tdl.begin(); it != tdl.end(); it++) {
if (path.find(*it) == 0) {
td = *it;
break;
}
}
if (td.empty()) {
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (ntd)\n", path.c_str()));
return true;
}
// Check path against skippedPaths. If we find a system where
// FNM_LEADING_DIR is undefined (its unposixy), will have to do this for
// all ascendant paths up to the topdir
for (list<string>::const_iterator it = skppl.begin();
it != skppl.end(); it++) {
if (fnmatch(it->c_str(), path.c_str(), FNM_PATHNAME|FNM_LEADING_DIR)
== 0) {
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpp)\n",
path.c_str()));
return true;
}
}
// Then check all path components up to the topdir against skippedNames
if (!skpnl.empty()) {
string mpath = path;
while (mpath.length() >= td.length() && mpath.length() > 1) {
string fn = path_getsimple(mpath);
for (list<string>::const_iterator it = skpnl.begin();
it != skpnl.end(); it++) {
LOGDEB2(("Checking [%s] against [%s]\n",
fn.c_str(), it->c_str()));
if (fnmatch(it->c_str(), fn.c_str(), 0) == 0) {
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpn)\n",
path.c_str()));
return true;
}
}
string::size_type len = mpath.length();
mpath = path_getfather(mpath);
// getfather normally returns a path ending with /, getsimple
// would then return ''
if (!mpath.empty() && mpath[mpath.size()-1] == '/')
mpath.erase(mpath.size()-1);
// should not be necessary, but lets be prudent. If the
// path did not shorten, something is seriously amiss
// (could be an assert actually)
if (mpath.length() >= len)
return true;
}
}
return false;
}
/** /**
* Index individual files, out of a full tree run. No database purging * Index individual files, out of a full tree run. No database purging
*/ */
bool FsIndexer::indexFiles(const list<string> &filenames) bool FsIndexer::indexFiles(list<string>& files)
{ {
if (!init()) if (!init())
return false; return false;
list<string>::const_iterator it; for (list<string>::iterator it = files.begin();
for (it = filenames.begin(); it != filenames.end(); it++) { it != files.end(); it++) {
string dir = path_getfather(*it);
m_config->setKeyDir(dir);
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db->setAbstractParams(abslen, -1, -1);
struct stat stb; struct stat stb;
if (lstat(it->c_str(), &stb) != 0) { if (lstat(it->c_str(), &stb) != 0) {
LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(), LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(),
strerror(errno))); strerror(errno)));
continue; continue;
} }
// If we get to indexing directory names one day, will need to test // If we get to indexing directory names one day, will need to test
// against dbdir here to avoid modification loops (with rclmon). // against dbdir here to avoid modification loops (with rclmon).
if (!S_ISREG(stb.st_mode)) { if (!S_ISREG(stb.st_mode)) {
LOGDEB2(("FsIndexer::indexFiles: %s: not a regular file\n", LOGDEB(("FsIndexer::indexFiles: skipping [%s] (nr)\n",
it->c_str())); it->c_str()));
continue; continue;
} }
string dir = path_getfather(*it);
m_config->setKeyDir(dir);
static string lstdir; static string lstdir;
static list<string> skpl; static list<string> skpnl;
static list<string> skppl;
if (lstdir.compare(dir)) { if (lstdir.compare(dir)) {
LOGDEB(("Recomputing list of skipped names\n")); LOGDEB(("Recomputing list of skipped names\n"));
skpl = m_config->getSkippedNames(); skpnl = m_config->getSkippedNames();
skppl = m_config->getSkippedPaths();
lstdir = dir; lstdir = dir;
} }
if (!skpl.empty()) {
list<string>::const_iterator skit; // Check path against indexed areas and skipped names/paths
string fn = path_getsimple(*it); if (matchesSkipped(m_tdl, skpnl, skppl, *it))
for (skit = skpl.begin(); skit != skpl.end(); skit++) { continue;
if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str())); int abslen;
goto skipped; if (m_config->getConfParam("idxabsmlen", &abslen))
} m_db->setAbstractParams(abslen, -1, -1);
}
}
if (processone(*it, &stb, FsTreeWalker::FtwRegular) != if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
FsTreeWalker::FtwOk) { FsTreeWalker::FtwOk) {
LOGERR(("FsIndexer::indexFiles: processone failed\n")); LOGERR(("FsIndexer::indexFiles: processone failed\n"));
return false; return false;
} }
skipped: files.erase(it);
false; // Need a statement here to make compiler happy ??
} }
return true; return true;
@ -212,19 +272,25 @@ bool FsIndexer::indexFiles(const list<string> &filenames)
/** Purge docs for given files out of the database */ /** Purge docs for given files out of the database */
bool FsIndexer::purgeFiles(const list<string> &filenames) bool FsIndexer::purgeFiles(list<string>& files)
{ {
if (!init()) if (!init())
return false; return false;
for (list<string>::iterator it = files.begin();
list<string>::const_iterator it; it != files.end(); it++) {
for (it = filenames.begin(); it != filenames.end(); it++) {
string udi; string udi;
make_udi(*it, "", udi); make_udi(*it, "", udi);
if (!m_db->purgeFile(udi)) { // rcldb::purgefile returns true if the udi was either not
// found or deleted, false only in case of actual error
bool existed;
if (!m_db->purgeFile(udi, &existed)) {
LOGERR(("FsIndexer::purgeFiles: Database error\n")); LOGERR(("FsIndexer::purgeFiles: Database error\n"));
return false; return false;
} }
// If we actually deleted something, take it off the list
if (existed) {
files.erase(it);
}
} }
return true; return true;

View File

@ -18,6 +18,11 @@
#define _fsindexer_h_included_ #define _fsindexer_h_included_
/* @(#$Id: $ (C) 2009 J.F.Dockes */ /* @(#$Id: $ (C) 2009 J.F.Dockes */
#include <list>
#ifndef NO_NAMESPACES
using std::list;
#endif
#include "fstreewalk.h" #include "fstreewalk.h"
#include "rcldb.h" #include "rcldb.h"
@ -58,10 +63,10 @@ class FsIndexer : public FsTreeWalkerCB {
bool index(); bool index();
/** Index a list of files. No db cleaning or stemdb updating */ /** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<string> &files); bool indexFiles(list<string> &files);
/** Purge a list of files. */ /** Purge a list of files. */
bool purgeFiles(const std::list<string> &files); bool purgeFiles(list<string> &files);
/** Tree walker callback method */ /** Tree walker callback method */
FsTreeWalker::Status FsTreeWalker::Status
@ -74,6 +79,7 @@ class FsIndexer : public FsTreeWalkerCB {
string m_tmpdir; string m_tmpdir;
string m_reason; string m_reason;
DbIxStatusUpdater *m_updater; DbIxStatusUpdater *m_updater;
list<string> m_tdl;
// The configuration can set attribute fields to be inherited by // The configuration can set attribute fields to be inherited by
// all files in a file system area. Ie: set "apptag = thunderbird" // all files in a file system area. Ie: set "apptag = thunderbird"

View File

@ -26,6 +26,8 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.71 2008-12-17 08:01:40 dockes Exp
#include <unistd.h> #include <unistd.h>
#include <errno.h> #include <errno.h>
#include <algorithm>
#include "debuglog.h" #include "debuglog.h"
#include "indexer.h" #include "indexer.h"
#include "fsindexer.h" #include "fsindexer.h"
@ -104,44 +106,14 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun)
return true; return true;
} }
bool ConfIndexer::initTopDirs() bool ConfIndexer::indexFiles(std::list<string> &files)
{ {
if (m_tdl.empty()) {
m_tdl = m_config->getTopdirs();
if (m_tdl.empty()) {
m_reason = "Top directory list (topdirs param.) "
"not found in config or Directory list parse error";
return false;
}
}
return true;
}
bool ConfIndexer::indexFiles(const std::list<string> &files)
{
if (!initTopDirs())
return false;
list<string> myfiles; list<string> myfiles;
for (list<string>::const_iterator it = files.begin(); for (list<string>::const_iterator it = files.begin();
it != files.end(); it++) { it != files.end(); it++) {
string fn = path_canon(*it); myfiles.push_back(path_canon(*it));
bool ok = false;
// Check that this file name belongs to one of our subtrees
for (list<string>::iterator dit = m_tdl.begin();
dit != m_tdl.end(); dit++) {
if (fn.find(*dit) == 0) {
myfiles.push_back(fn);
ok = true;
break;
}
}
if (!ok) {
m_reason += string("File ") + fn + string(" not in indexed area\n");
}
} }
if (myfiles.empty()) myfiles.sort();
return true;
if (!m_db.open(Rcl::Db::DbUpd)) { if (!m_db.open(Rcl::Db::DbUpd)) {
LOGERR(("ConfIndexer: indexFiles error opening database %s\n", LOGERR(("ConfIndexer: indexFiles error opening database %s\n",
@ -149,9 +121,21 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
return false; return false;
} }
m_config->setKeyDir(""); m_config->setKeyDir("");
bool ret = false;
if (!m_fsindexer) if (!m_fsindexer)
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
bool ret = m_fsindexer->indexFiles(files); if (m_fsindexer)
ret = m_fsindexer->indexFiles(files);
if (m_dobeagle && !myfiles.empty()) {
if (!m_beagler)
m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
if (m_beagler) {
ret = ret && m_beagler->indexFiles(myfiles);
} else {
ret = false;
}
}
// The close would be done in our destructor, but we want status here // The close would be done in our destructor, but we want status here
if (!m_db.close()) { if (!m_db.close()) {
@ -162,31 +146,40 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
return ret; return ret;
} }
bool ConfIndexer::purgeFiles(const std::list<string> &files) bool ConfIndexer::purgeFiles(std::list<string> &files)
{ {
if (!initTopDirs())
return false;
list<string> myfiles; list<string> myfiles;
for (list<string>::const_iterator it = files.begin(); for (list<string>::const_iterator it = files.begin();
it != files.end(); it++) { it != files.end(); it++) {
myfiles.push_back(path_canon(*it)); myfiles.push_back(path_canon(*it));
} }
myfiles.sort();
if (!m_db.open(Rcl::Db::DbUpd)) { if (!m_db.open(Rcl::Db::DbUpd)) {
LOGERR(("ConfIndexer: purgeFiles error opening database %s\n", LOGERR(("ConfIndexer: purgeFiles error opening database %s\n",
m_config->getDbDir().c_str())); m_config->getDbDir().c_str()));
return false; return false;
} }
bool ret = false;
m_config->setKeyDir(""); m_config->setKeyDir("");
if (!m_fsindexer) if (!m_fsindexer)
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater); m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
bool ret = m_fsindexer->purgeFiles(files); if (m_fsindexer)
ret = m_fsindexer->purgeFiles(myfiles);
if (m_dobeagle && !myfiles.empty()) {
if (!m_beagler)
m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
if (m_beagler) {
ret = ret && m_beagler->purgeFiles(myfiles);
} else {
ret = false;
}
}
// The close would be done in our destructor, but we want status here // The close would be done in our destructor, but we want status here
if (!m_db.close()) { if (!m_db.close()) {
LOGERR(("ConfIndexer::index: error closing database in %s\n", LOGERR(("ConfIndexer::purgefiles: error closing database in %s\n",
m_config->getDbDir().c_str())); m_config->getDbDir().c_str()));
return false; return false;
} }

View File

@ -85,10 +85,10 @@ class ConfIndexer {
static list<string> getStemmerNames(); static list<string> getStemmerNames();
/** Index a list of files. No db cleaning or stemdb updating */ /** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<string> &files); bool indexFiles(std::list<string> &files);
/** Purge a list of files. */ /** Purge a list of files. */
bool purgeFiles(const std::list<string> &files); bool purgeFiles(std::list<string> &files);
private: private:
RclConfig *m_config; RclConfig *m_config;
@ -97,10 +97,7 @@ class ConfIndexer {
bool m_dobeagle; bool m_dobeagle;
BeagleQueueIndexer *m_beagler; BeagleQueueIndexer *m_beagler;
DbIxStatusUpdater *m_updater; DbIxStatusUpdater *m_updater;
string m_reason; string m_reason;
list<string> m_tdl;
bool initTopDirs();
}; };
#endif /* _INDEXER_H_INCLUDED_ */ #endif /* _INDEXER_H_INCLUDED_ */

View File

@ -97,7 +97,7 @@ static bool makeIndexer(RclConfig *config)
// this case we're called repeatedly in the same process, and the // this case we're called repeatedly in the same process, and the
// confindexer is only created once by makeIndexer (but the db closed and // confindexer is only created once by makeIndexer (but the db closed and
// flushed every time) // flushed every time)
bool indexfiles(RclConfig *config, const list<string> &filenames) bool indexfiles(RclConfig *config, list<string> &filenames)
{ {
if (filenames.empty()) if (filenames.empty())
return true; return true;
@ -107,7 +107,7 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
} }
// Delete a list of files. Same comments about call contexts as indexfiles. // Delete a list of files. Same comments about call contexts as indexfiles.
bool purgefiles(RclConfig *config, const list<string> &filenames) bool purgefiles(RclConfig *config, list<string> &filenames)
{ {
if (filenames.empty()) if (filenames.empty())
return true; return true;

View File

@ -20,8 +20,8 @@
/** Helper methods in recollindex.cpp for initial checks/setup to index /** Helper methods in recollindex.cpp for initial checks/setup to index
* a list of files (either from the monitor or the command line) */ * a list of files (either from the monitor or the command line) */
extern bool indexfiles(RclConfig *config, const list<string> &filenames); extern bool indexfiles(RclConfig *config, list<string> &filenames);
extern bool purgefiles(RclConfig *config, const list<string> &filenames); extern bool purgefiles(RclConfig *config, list<string> &filenames);
extern bool createAuxDbs(RclConfig *config); extern bool createAuxDbs(RclConfig *config);
extern int stopindexing; extern int stopindexing;

View File

@ -1270,7 +1270,7 @@ bool Db::purge()
} }
/* Delete document(s) for given unique identifier (doc and descendents) */ /* Delete document(s) for given unique identifier (doc and descendents) */
bool Db::purgeFile(const string &udi) bool Db::purgeFile(const string &udi, bool *existed)
{ {
LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str())); LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str()));
if (m_ndb == 0 || !m_ndb->m_iswritable) if (m_ndb == 0 || !m_ndb->m_iswritable)
@ -1280,8 +1280,12 @@ bool Db::purgeFile(const string &udi)
string ermsg; string ermsg;
try { try {
Xapian::PostingIterator docid = db.postlist_begin(uniterm); Xapian::PostingIterator docid = db.postlist_begin(uniterm);
if (docid == db.postlist_end(uniterm)) if (docid == db.postlist_end(uniterm)) {
if (existed)
*existed = false;
return true; return true;
}
*existed = true;
LOGDEB(("purgeFile: delete docid %d\n", *docid)); LOGDEB(("purgeFile: delete docid %d\n", *docid));
db.delete_document(*docid); db.delete_document(*docid);
vector<Xapian::docid> docids; vector<Xapian::docid> docids;

View File

@ -121,7 +121,7 @@ class Db {
const Doc &doc); const Doc &doc);
/** Delete document(s) for given UDI, including subdocs */ /** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &udi); bool purgeFile(const string &udi, bool *existed = 0);
/** Remove documents that no longer exist in the file system. This /** Remove documents that no longer exist in the file system. This
* depends on the update map, which is built during * depends on the update map, which is built during