beaglequeue indexFiles

This commit is contained in:
dockes 2009-11-14 08:21:45 +00:00
parent bbba826c06
commit 6ef7b546f2
10 changed files with 204 additions and 94 deletions

View File

@ -173,7 +173,8 @@ BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
{
if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
m_queuedir = path_tildexpand("~/.beagle/ToIndex");
m_queuedir = path_tildexpand("~/.beagle/ToIndex/");
path_catslash(m_queuedir);
if (m_db && m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
string reason;
@ -336,6 +337,42 @@ bool BeagleQueueIndexer::index()
return true;
}
/**
 * Index the entries of a file list which live directly inside the queue
 * directory.
 *
 * Each entry which gets handed to processone() is removed from the list.
 * Entries which are skipped (empty path, parent directory different from
 * m_queuedir, empty or dot-prefixed file name, lstat() failure, not a
 * regular file) are left in place.
 *
 * @param files in/out list of file paths. Processed entries are erased.
 * @return false if no database is set, else true. The status returned by
 *         processone() is not taken into account.
 */
bool BeagleQueueIndexer::indexFiles(list<string>& files)
{
    if (!m_db) {
        LOGERR(("BeagleQueueIndexer::indexfiles no db??\n"));
        return false;
    }

    list<string>::iterator it = files.begin();
    while (it != files.end()) {
        // Step past the current entry right away: erasing a std::list
        // element only invalidates iterators to that element, so 'it'
        // stays usable whatever happens to 'cur' below. (Erasing and then
        // incrementing the same iterator is undefined behaviour.)
        list<string>::iterator cur = it++;

        if (cur->empty())
            continue;//??

        // Only consider files whose parent directory is the queue directory
        string father = path_getfather(*cur);
        if (father.compare(m_queuedir)) {
            LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n",
                    cur->c_str()));
            continue;
        }

        // Ignore entries with an empty or dot-prefixed simple name
        string fn = path_getsimple(*cur);
        if (fn.empty() || fn.at(0) == '.')
            continue;

        struct stat st;
        if (lstat(cur->c_str(), &st) != 0) {
            LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n",
                    cur->c_str()));
            continue;
        }
        if (!S_ISREG(st.st_mode)) {
            LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n",
                    cur->c_str()));
            continue;
        }

        processone(*cur, &st, FsTreeWalker::FtwRegular);
        // This one was handled here: take it off the caller's list
        files.erase(cur);
    }
    return true;
}
FsTreeWalker::Status
BeagleQueueIndexer::processone(const string &path,
const struct stat *stp,

View File

@ -51,6 +51,13 @@ public:
FsTreeWalker::Status
processone(const string &, const struct stat *, FsTreeWalker::CbFlag);
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(list<string>& files);
/** Purge a list of files: deliberately does nothing and reports success.
 * There is currently no way to do this, and we don't want to do anything
 * anyway, as this is mostly called by the monitor when *I* delete files
 * inside the queue dir.
 * @param files list of file paths; neither read nor modified here.
 * @return always true. */
bool purgeFiles(list<string>& files)
{
    (void)files;
    return true;
}
bool getFromCache(const string& udi, Rcl::Doc &doc, string& data,
string *hittype = 0);
private:

View File

@ -86,18 +86,19 @@ bool FsIndexer::init()
return false;
}
}
if (m_tdl.empty()) {
m_tdl = m_config->getTopdirs();
if (m_tdl.empty()) {
LOGERR(("FsIndexers: no topdirs list defined\n"));
return false;
}
}
return true;
}
// Recursively index each directory in the topdirs:
bool FsIndexer::index()
{
list<string> topdirs = m_config->getTopdirs();
if (topdirs.empty()) {
LOGERR(("FsIndexer::indexTrees: no valid topdirs in config\n"));
return false;
}
if (!init())
return false;
@ -108,8 +109,8 @@ bool FsIndexer::index()
m_walker.setSkippedPaths(m_config->getSkippedPaths());
for (list<string>::const_iterator it = topdirs.begin();
it != topdirs.end(); it++) {
for (list<string>::const_iterator it = m_tdl.begin();
it != m_tdl.end(); it++) {
LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(),
getDbDir().c_str()));
@ -151,60 +152,119 @@ bool FsIndexer::index()
return true;
}
/**
 * Tell whether a file path should be left out of indexing.
 *
 * The path is rejected (true is returned) if:
 *  - no non-empty entry of tdl is a prefix of it,
 *  - it matches one of the skppl patterns (fnmatch() with
 *    FNM_PATHNAME|FNM_LEADING_DIR),
 *  - any of its components, walking up from the file name towards the
 *    topdir, matches one of the skpnl patterns (fnmatch() with no flags).
 *
 * @param tdl   list of top directories
 * @param skpnl skipped names patterns, matched against single components
 * @param skppl skipped paths patterns, matched against the whole path
 * @param path  the file path to check
 * @return true if the file must be skipped, false if it can be indexed
 */
static bool matchesSkipped(const list<string>& tdl,
                           const list<string>& skpnl,
                           const list<string>& skppl,
                           const string& path)
{
    typedef list<string>::const_iterator StrIt;

    // Which topdir (if any) is this path under ? First prefix match wins.
    StrIt tdit = tdl.begin();
    while (tdit != tdl.end() && path.find(*tdit) != 0)
        ++tdit;
    const string topdir = (tdit != tdl.end()) ? *tdit : string();
    if (topdir.empty()) {
        LOGDEB(("FsIndexer::indexFiles: skipping [%s] (ntd)\n", path.c_str()));
        return true;
    }

    // Whole path against skippedPaths. FNM_LEADING_DIR is not POSIX: on a
    // system lacking it, every ancestor path up to the topdir would have
    // to be tested instead.
    for (StrIt pit = skppl.begin(); pit != skppl.end(); ++pit) {
        const int res = fnmatch(pit->c_str(), path.c_str(),
                                FNM_PATHNAME|FNM_LEADING_DIR);
        if (res != 0)
            continue;
        LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpp)\n",
                path.c_str()));
        return true;
    }

    // Nothing more to check if there are no skipped names
    if (skpnl.empty())
        return false;

    // Each path component, from the file name up to the topdir, against
    // skippedNames
    string cur = path;
    while (cur.length() >= topdir.length() && cur.length() > 1) {
        const string simple = path_getsimple(cur);
        for (StrIt nit = skpnl.begin(); nit != skpnl.end(); ++nit) {
            LOGDEB2(("Checking [%s] against [%s]\n",
                     simple.c_str(), nit->c_str()));
            if (fnmatch(nit->c_str(), simple.c_str(), 0) != 0)
                continue;
            LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpn)\n",
                    path.c_str()));
            return true;
        }

        const string::size_type prevlen = cur.length();
        cur = path_getfather(cur);
        // getfather normally yields a '/'-terminated path, for which
        // getsimple would return '': strip the trailing slash
        if (!cur.empty() && cur[cur.size() - 1] == '/')
            cur.erase(cur.size() - 1);
        // Prudence: if going up did not shorten the path, something is
        // seriously amiss, so skip the file rather than loop forever
        if (cur.length() >= prevlen)
            return true;
    }
    return false;
}
/**
* Index individual files, out of a full tree run. No database purging
*/
bool FsIndexer::indexFiles(const list<string> &filenames)
bool FsIndexer::indexFiles(list<string>& files)
{
if (!init())
return false;
list<string>::const_iterator it;
for (it = filenames.begin(); it != filenames.end(); it++) {
string dir = path_getfather(*it);
m_config->setKeyDir(dir);
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db->setAbstractParams(abslen, -1, -1);
for (list<string>::iterator it = files.begin();
it != files.end(); it++) {
struct stat stb;
if (lstat(it->c_str(), &stb) != 0) {
LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(),
strerror(errno)));
continue;
}
// If we get to indexing directory names one day, will need to test
// against dbdir here to avoid modification loops (with rclmon).
if (!S_ISREG(stb.st_mode)) {
LOGDEB2(("FsIndexer::indexFiles: %s: not a regular file\n",
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (nr)\n",
it->c_str()));
continue;
}
string dir = path_getfather(*it);
m_config->setKeyDir(dir);
static string lstdir;
static list<string> skpl;
static list<string> skpnl;
static list<string> skppl;
if (lstdir.compare(dir)) {
LOGDEB(("Recomputing list of skipped names\n"));
skpl = m_config->getSkippedNames();
skpnl = m_config->getSkippedNames();
skppl = m_config->getSkippedPaths();
lstdir = dir;
}
if (!skpl.empty()) {
list<string>::const_iterator skit;
string fn = path_getsimple(*it);
for (skit = skpl.begin(); skit != skpl.end(); skit++) {
if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
goto skipped;
}
}
}
// Check path against indexed areas and skipped names/paths
if (matchesSkipped(m_tdl, skpnl, skppl, *it))
continue;
int abslen;
if (m_config->getConfParam("idxabsmlen", &abslen))
m_db->setAbstractParams(abslen, -1, -1);
if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
FsTreeWalker::FtwOk) {
LOGERR(("FsIndexer::indexFiles: processone failed\n"));
return false;
}
skipped:
false; // Need a statement here to make compiler happy ??
files.erase(it);
}
return true;
@ -212,19 +272,25 @@ bool FsIndexer::indexFiles(const list<string> &filenames)
/** Purge docs for given files out of the database */
bool FsIndexer::purgeFiles(const list<string> &filenames)
bool FsIndexer::purgeFiles(list<string>& files)
{
if (!init())
return false;
list<string>::const_iterator it;
for (it = filenames.begin(); it != filenames.end(); it++) {
for (list<string>::iterator it = files.begin();
it != files.end(); it++) {
string udi;
make_udi(*it, "", udi);
if (!m_db->purgeFile(udi)) {
// rcldb::purgefile returns true if the udi was either not
// found or deleted, false only in case of actual error
bool existed;
if (!m_db->purgeFile(udi, &existed)) {
LOGERR(("FsIndexer::purgeFiles: Database error\n"));
return false;
}
// If we actually deleted something, take it off the list
if (existed) {
files.erase(it);
}
}
return true;

View File

@ -18,6 +18,11 @@
#define _fsindexer_h_included_
/* @(#$Id: $ (C) 2009 J.F.Dockes */
#include <list>
#ifndef NO_NAMESPACES
using std::list;
#endif
#include "fstreewalk.h"
#include "rcldb.h"
@ -58,10 +63,10 @@ class FsIndexer : public FsTreeWalkerCB {
bool index();
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<string> &files);
bool indexFiles(list<string> &files);
/** Purge a list of files. */
bool purgeFiles(const std::list<string> &files);
bool purgeFiles(list<string> &files);
/** Tree walker callback method */
FsTreeWalker::Status
@ -74,6 +79,7 @@ class FsIndexer : public FsTreeWalkerCB {
string m_tmpdir;
string m_reason;
DbIxStatusUpdater *m_updater;
list<string> m_tdl;
// The configuration can set attribute fields to be inherited by
// all files in a file system area. Ie: set "apptag = thunderbird"

View File

@ -26,6 +26,8 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.71 2008-12-17 08:01:40 dockes Exp
#include <unistd.h>
#include <errno.h>
#include <algorithm>
#include "debuglog.h"
#include "indexer.h"
#include "fsindexer.h"
@ -104,44 +106,14 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun)
return true;
}
bool ConfIndexer::initTopDirs()
bool ConfIndexer::indexFiles(std::list<string> &files)
{
if (m_tdl.empty()) {
m_tdl = m_config->getTopdirs();
if (m_tdl.empty()) {
m_reason = "Top directory list (topdirs param.) "
"not found in config or Directory list parse error";
return false;
}
}
return true;
}
bool ConfIndexer::indexFiles(const std::list<string> &files)
{
if (!initTopDirs())
return false;
list<string> myfiles;
for (list<string>::const_iterator it = files.begin();
it != files.end(); it++) {
string fn = path_canon(*it);
bool ok = false;
// Check that this file name belongs to one of our subtrees
for (list<string>::iterator dit = m_tdl.begin();
dit != m_tdl.end(); dit++) {
if (fn.find(*dit) == 0) {
myfiles.push_back(fn);
ok = true;
break;
}
}
if (!ok) {
m_reason += string("File ") + fn + string(" not in indexed area\n");
}
myfiles.push_back(path_canon(*it));
}
if (myfiles.empty())
return true;
myfiles.sort();
if (!m_db.open(Rcl::Db::DbUpd)) {
LOGERR(("ConfIndexer: indexFiles error opening database %s\n",
@ -149,9 +121,21 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
return false;
}
m_config->setKeyDir("");
bool ret = false;
if (!m_fsindexer)
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
bool ret = m_fsindexer->indexFiles(files);
if (m_fsindexer)
ret = m_fsindexer->indexFiles(files);
if (m_dobeagle && !myfiles.empty()) {
if (!m_beagler)
m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
if (m_beagler) {
ret = ret && m_beagler->indexFiles(myfiles);
} else {
ret = false;
}
}
// The close would be done in our destructor, but we want status here
if (!m_db.close()) {
@ -162,31 +146,40 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
return ret;
}
bool ConfIndexer::purgeFiles(const std::list<string> &files)
bool ConfIndexer::purgeFiles(std::list<string> &files)
{
if (!initTopDirs())
return false;
list<string> myfiles;
for (list<string>::const_iterator it = files.begin();
it != files.end(); it++) {
myfiles.push_back(path_canon(*it));
}
myfiles.sort();
if (!m_db.open(Rcl::Db::DbUpd)) {
LOGERR(("ConfIndexer: purgeFiles error opening database %s\n",
m_config->getDbDir().c_str()));
return false;
}
bool ret = false;
m_config->setKeyDir("");
if (!m_fsindexer)
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
bool ret = m_fsindexer->purgeFiles(files);
if (m_fsindexer)
ret = m_fsindexer->purgeFiles(myfiles);
if (m_dobeagle && !myfiles.empty()) {
if (!m_beagler)
m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
if (m_beagler) {
ret = ret && m_beagler->purgeFiles(myfiles);
} else {
ret = false;
}
}
// The close would be done in our destructor, but we want status here
if (!m_db.close()) {
LOGERR(("ConfIndexer::index: error closing database in %s\n",
LOGERR(("ConfIndexer::purgefiles: error closing database in %s\n",
m_config->getDbDir().c_str()));
return false;
}

View File

@ -85,10 +85,10 @@ class ConfIndexer {
static list<string> getStemmerNames();
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<string> &files);
bool indexFiles(std::list<string> &files);
/** Purge a list of files. */
bool purgeFiles(const std::list<string> &files);
bool purgeFiles(std::list<string> &files);
private:
RclConfig *m_config;
@ -97,10 +97,7 @@ class ConfIndexer {
bool m_dobeagle;
BeagleQueueIndexer *m_beagler;
DbIxStatusUpdater *m_updater;
string m_reason;
list<string> m_tdl;
bool initTopDirs();
string m_reason;
};
#endif /* _INDEXER_H_INCLUDED_ */

View File

@ -97,7 +97,7 @@ static bool makeIndexer(RclConfig *config)
// this case we're called repeatedly in the same process, and the
// confindexer is only created once by makeIndexer (but the db closed and
// flushed every time)
bool indexfiles(RclConfig *config, const list<string> &filenames)
bool indexfiles(RclConfig *config, list<string> &filenames)
{
if (filenames.empty())
return true;
@ -107,7 +107,7 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
}
// Delete a list of files. Same comments about call contexts as indexfiles.
bool purgefiles(RclConfig *config, const list<string> &filenames)
bool purgefiles(RclConfig *config, list<string> &filenames)
{
if (filenames.empty())
return true;

View File

@ -20,8 +20,8 @@
/** Helper methods in recollindex.cpp for initial checks/setup to index
* a list of files (either from the monitor or the command line) */
extern bool indexfiles(RclConfig *config, const list<string> &filenames);
extern bool purgefiles(RclConfig *config, const list<string> &filenames);
extern bool indexfiles(RclConfig *config, list<string> &filenames);
extern bool purgefiles(RclConfig *config, list<string> &filenames);
extern bool createAuxDbs(RclConfig *config);
extern int stopindexing;

View File

@ -1270,7 +1270,7 @@ bool Db::purge()
}
/* Delete document(s) for given unique identifier (doc and descendents) */
bool Db::purgeFile(const string &udi)
bool Db::purgeFile(const string &udi, bool *existed)
{
LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str()));
if (m_ndb == 0 || !m_ndb->m_iswritable)
@ -1280,8 +1280,12 @@ bool Db::purgeFile(const string &udi)
string ermsg;
try {
Xapian::PostingIterator docid = db.postlist_begin(uniterm);
if (docid == db.postlist_end(uniterm))
if (docid == db.postlist_end(uniterm)) {
if (existed)
*existed = false;
return true;
}
*existed = true;
LOGDEB(("purgeFile: delete docid %d\n", *docid));
db.delete_document(*docid);
vector<Xapian::docid> docids;

View File

@ -121,7 +121,7 @@ class Db {
const Doc &doc);
/** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &udi);
bool purgeFile(const string &udi, bool *existed = 0);
/** Remove documents that no longer exist in the file system. This
* depends on the update map, which is built during