Integrate BeagleQueueIndexer into the indexing code. Work remains on indexfiles() at least

dockes 2009-11-13 09:07:18 +00:00
parent d1e3f156ee
commit 4503971dd0
7 changed files with 357 additions and 154 deletions

View File

@@ -17,6 +17,10 @@
 #ifndef lint
 static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
 #endif
+#include "autoconfig.h"
+#include <sys/types.h>
 #include "autoconfig.h"
 #include "pathut.h"
 #include "debuglog.h"
@@ -27,9 +31,14 @@ static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
 #include "internfile.h"
 #include "wipedir.h"
 #include "circache.h"
+#include "indexer.h"
+#include "readfile.h"
+#include "conftree.h"
+#include "transcode.h"
 
 #include <vector>
 #include <fstream>
+#include <sstream>
 using namespace std;
 
 #include <sys/stat.h>
@@ -42,9 +51,7 @@ class BeagleDotFile {
 public:
     BeagleDotFile(RclConfig *conf, const string& fn)
         : m_conf(conf), m_fn(fn)
-    {
-    }
+    { }
 
     bool readLine(string& line)
     {
@@ -92,13 +99,20 @@ public:
             return false;
         doc.mimetype = line;
-        if (doc.mimetype.empty() &&
-            !stringlowercmp("bookmark", doc.meta[keybght]))
-            doc.mimetype = "text/plain";
+        // We set the bookmarks mtype as html, the text is empty
+        // anyway, so that the html viewer will be called on 'Open'
+        bool isbookmark = false;
+        if (!stringlowercmp("bookmark", doc.meta[keybght])) {
+            isbookmark = true;
+            doc.mimetype = "text/html";
+        }
 
         string confstr;
         string ss(" ");
-        // Read the rest: fields and keywords
+        // Read the rest: fields and keywords. We do a little
+        // massaging of the input lines, then use a ConfSimple to
+        // parse, and finally insert the key/value pairs into the doc
+        // meta[] array
        for (;;) {
            if (!readLine(line)) {
                // Eof hopefully
@@ -109,7 +123,6 @@ public:
            line = line.substr(2);
            confstr += line + "\n";
        }
        ConfSimple fields(confstr, 1);
        list<string> names = fields.getNames("");
        for (list<string>::iterator it = names.begin();
@@ -118,23 +131,50 @@ public:
            fields.get(*it, value, "");
            if (!value.compare("undefined") || !value.compare("null"))
                continue;
+            string *valuep = &value;
+            string cvalue;
+            if (isbookmark) {
+                // It appears that bookmarks are stored in the users'
+                // locale charset (not too sure). No idea what to do
+                // for other types, would have to check the plugin.
+                string charset = m_conf->getDefCharset(true);
+                transcode(value, cvalue, charset, "UTF-8");
+                valuep = &cvalue;
+            }
            string caname = m_conf->fieldCanon(*it);
-            doc.meta[caname].append(ss + value);
+            doc.meta[caname].append(ss + *valuep);
        }
+        // Finally build the confsimple that we will save to the
+        // cache, out of document fields. This could also be done in
+        // parallel with the doc.meta build above, but simpler this way.
+        for (map<string,string>::const_iterator it = doc.meta.begin();
+             it != doc.meta.end(); it++) {
+            m_fields.set((*it).first, (*it).second, "");
+        }
+        m_fields.set("url", doc.url, "");
+        m_fields.set("mimetype", doc.mimetype, "");
        return true;
    }
 
    RclConfig *m_conf;
+    ConfSimple m_fields;
    string m_fn;
    ifstream m_input;
 };
 
 const string badtmpdirname = "/no/such/dir/really/can/exist";
 
-BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf)
-    : m_config(cnf), m_db(cnf)
+BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
+                                       DbIxStatusUpdater *updfunc)
+    : m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc)
 {
    if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
        m_queuedir = path_tildexpand("~/.beagle/ToIndex");
    if (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
        string reason;
        if (!maketmpdir(m_tmpdir, reason)) {
@@ -143,12 +183,20 @@ BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf)
            m_tmpdir = badtmpdirname;
        }
    }
-    Rcl::Db::OpenMode mode = Rcl::Db::DbUpd;
-    if (!m_db.open(mode)) {
-        LOGERR(("BeagleQueueIndexer: error opening database %s\n",
-                m_config->getDbDir().c_str()));
-        return;
-    }
+    string ccdir;
+    m_config->getConfParam("webcachedir", ccdir);
+    if (ccdir.empty())
+        ccdir = "webcache";
+    ccdir = path_tildexpand(ccdir);
+    // If not an absolute path, compute relative to config dir
+    if (ccdir.at(0) != '/')
+        ccdir = path_cat(m_config->getConfDir(), ccdir);
+    int maxmbs = 20;
+    m_config->getConfParam("webcachemaxmbs", &maxmbs);
+    m_cache = new CirCache(ccdir);
+    m_cache->create(off_t(maxmbs)*1000*1024, true);
 }
 
 BeagleQueueIndexer::~BeagleQueueIndexer()
@@ -161,13 +209,106 @@ BeagleQueueIndexer::~BeagleQueueIndexer()
                m_tmpdir.c_str()));
        }
    }
-    m_db.close();
+    deleteZ(m_cache);
 }
 
-bool BeagleQueueIndexer::processqueue()
+bool BeagleQueueIndexer::indexFromCache(const string& udi)
+{
+    string dict, data;
+    // This is horribly inefficient and needs fixing either by saving
+    // the offsets during the forward scan, or using an auxiliary isam
+    // map
+    if (!m_cache->get(udi, dict, data))
+        return false;
+
+    ConfSimple cf(dict, 1);
+    string hittype;
+    if (!cf.get(keybght, hittype, "")) {
+        LOGERR(("BeagleIndexer::index: cc entry has no hit type\n"));
+        return false;
+    }
+
+    // Build a doc from saved metadata
+    Rcl::Doc dotdoc;
+    cf.get("url", dotdoc.url, "");
+    cf.get("mimetype", dotdoc.mimetype, "");
+    cf.get("fmtime", dotdoc.fmtime, "");
+    cf.get("fbytes", dotdoc.fbytes, "");
+    dotdoc.sig = "";
+    list<string> names = cf.getNames("");
+    for (list<string>::const_iterator it = names.begin();
+         it != names.end(); it++) {
+        cf.get(*it, dotdoc.meta[*it], "");
+    }
+
+    if (!stringlowercmp("bookmark", hittype)) {
+        // Just index the dotdoc
+        return m_db->addOrUpdate(udi, "", dotdoc);
+    } else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
+               (dotdoc.mimetype.compare("text/html") &&
+                dotdoc.mimetype.compare("text/plain"))) {
+        LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
+                dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
+        return true;
+    } else {
+        Rcl::Doc doc;
+        FileInterner interner(data, m_config, m_tmpdir,
+                              FileInterner::FIF_doUseInputMimetype,
+                              dotdoc.mimetype);
+        string ipath;
+        FileInterner::Status fis = interner.internfile(doc, ipath);
+        if (fis != FileInterner::FIDone) {
+            LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
+            return false;
+        }
+        doc.mimetype = dotdoc.mimetype;
+        doc.fmtime = dotdoc.fmtime;
+        doc.url = dotdoc.url;
+        doc.fbytes = dotdoc.fbytes;
+        doc.sig = "";
+        return m_db->addOrUpdate(udi, "", doc);
+    }
+}
+
+bool BeagleQueueIndexer::index()
 {
    LOGDEB(("BeagleQueueIndexer::processqueue: dir: [%s]\n",
            m_queuedir.c_str()));
+    m_config->setKeyDir(m_queuedir);
+
+    // First walk the cache to set the existence flags. We do not
+    // actually check uptodateness because all files in the cache are
+    // supposedly already indexed.
+    //TBD: change this as the cache needs reindexing after an index reset!
+    // Also, we need to read the cache backwards so that the newest
+    // version of each file gets indexed? Or find a way to index
+    // multiple versions ?
+    bool eof;
+    if (!m_cache->rewind(eof)) {
+        if (!eof)
+            return false;
+    }
+    vector<string> alludis;
+    alludis.reserve(20000);
+    while (m_cache->next(eof)) {
+        string dict;
+        m_cache->getcurrentdict(dict);
+        ConfSimple cf(dict, 1);
+        string udi;
+        if (!cf.get("udi", udi, ""))
+            continue;
+        alludis.push_back(udi);
+    }
+    for (vector<string>::reverse_iterator it = alludis.rbegin();
+         it != alludis.rend(); it++) {
+        if (m_db->needUpdate(*it, "")) {
+            indexFromCache(*it);
+        }
+    }
 
    FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
    walker.addSkippedName(".*");
@@ -181,12 +322,15 @@ BeagleQueueIndexer::processone(const string &path,
                               const struct stat *stp,
                               FsTreeWalker::CbFlag flg)
 {
+    bool dounlink = false;
    if (flg != FsTreeWalker::FtwRegular)
        return FsTreeWalker::FtwOk;
 
    string dotpath = path_cat(path_getfather(path),
                              string(".") + path_getsimple(path));
    LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str()));
    BeagleDotFile dotfile(m_config, dotpath);
    Rcl::Doc dotdoc;
    string udi, udipath;
@@ -205,12 +349,32 @@ BeagleQueueIndexer::processone(const string &path,
    // We only process bookmarks or text/html and text/plain files.
    if (!stringlowercmp("bookmark", dotdoc.meta[keybght])) {
+        // For bookmarks, we just index the doc that was built from the
+        // metadata.
+        if (dotdoc.fmtime.empty())
+            dotdoc.fmtime = ascdate;
+        char cbuf[100];
+        sprintf(cbuf, "%ld", (long)stp->st_size);
+        dotdoc.fbytes = cbuf;
+        // Document signature for up to date checks: none.
+        dotdoc.sig = "";
+        // doc fields not in meta, needing saving to the cache
+        dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
+        dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
+        if (!m_db->addOrUpdate(udi, "", dotdoc))
+            return FsTreeWalker::FtwError;
    } else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
               (dotdoc.mimetype.compare("text/html") &&
                dotdoc.mimetype.compare("text/plain"))) {
        LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
                dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
+        // Unlink them anyway
+        dounlink = true;
        goto out;
    } else {
        Rcl::Doc doc;
@@ -230,17 +394,34 @@ BeagleQueueIndexer::processone(const string &path,
        char cbuf[100];
        sprintf(cbuf, "%ld", (long)stp->st_size);
        doc.fbytes = cbuf;
-        // Document signature for up to date checks: none. The file is
-        // going to be deleted anyway. We always reindex what comes in
-        // the queue. It would probably be possible to extract some
-        // http data to avoid this.
+        // Document signature for up to date checks: none.
        doc.sig = "";
        doc.url = dotdoc.url;
+        // doc fields not in meta, needing saving to the cache
+        dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
+        dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
-        if (!m_db.addOrUpdate(udi, "", doc))
+        if (!m_db->addOrUpdate(udi, "", doc))
            return FsTreeWalker::FtwError;
    }
+    // Copy to cache
+    {
+        stringstream o;
+        dotfile.m_fields.write(o);
+        string fdata;
+        file_to_string(path, fdata);
+        if (!m_cache->put(udi, o.str(), fdata))
+            goto out;
+    }
+    dounlink = true;
 out:
-    // unlink(path.c_str());
-    // unlink(dotpath.c_str());
+    if (dounlink) {
+        unlink(path.c_str());
+        unlink(dotpath.c_str());
+    }
    return FsTreeWalker::FtwOk;
 }
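
Note: a standalone sketch of the field "massaging" done by BeagleDotFile above: each metadata line loses its two-character prefix, the remainder is accumulated into a small name = value block, parsed, "undefined"/"null" values are skipped, and the rest is appended into the doc meta[] map with a leading space. The parser below is a plain stand-in for ConfSimple and the sample input lines are invented for illustration; only the overall flow comes from the diff.

    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>
    #include <vector>
    using namespace std;

    int main()
    {
        // Invented sample of per-document metadata lines (the real queue
        // files are produced by the browser-side Beagle plugin).
        vector<string> lines;
        lines.push_back("t:title=Some page title");
        lines.push_back("k:author=undefined");
        lines.push_back("k:keywords=web test");

        // Strip the 2-char prefix and accumulate, as readLine()/substr(2)
        // and confstr do in the code above.
        string confstr;
        for (size_t i = 0; i < lines.size(); i++)
            confstr += lines[i].substr(2) + "\n";

        // Plain stand-in for ConfSimple: parse name=value lines.
        map<string, string> meta;
        istringstream in(confstr);
        string line;
        while (getline(in, line)) {
            string::size_type eq = line.find('=');
            if (eq == string::npos)
                continue;
            string name = line.substr(0, eq);
            string value = line.substr(eq + 1);
            if (value == "undefined" || value == "null")
                continue;                           // skipped, as in the diff
            meta[name].append(string(" ") + value); // note the leading space
        }

        for (map<string, string>::const_iterator it = meta.begin();
             it != meta.end(); it++)
            cout << it->first << " ->" << it->second << "\n";
        return 0;
    }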

View File

@@ -28,21 +28,30 @@
 #include "fstreewalk.h"
 #include "rcldb.h"
 
+class DbIxStatusUpdater;
+class CirCache;
+
 class BeagleQueueIndexer : public FsTreeWalkerCB {
 public:
-    BeagleQueueIndexer(RclConfig *cnf);
+    BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
+                       DbIxStatusUpdater *updfunc = 0);
    ~BeagleQueueIndexer();
-    bool processqueue();
+    bool index();
    FsTreeWalker::Status
    processone(const string &, const struct stat *, FsTreeWalker::CbFlag);
 private:
    RclConfig *m_config;
-    Rcl::Db m_db;
-    string m_queuedir;
-    string m_tmpdir;
+    Rcl::Db *m_db;
+    CirCache *m_cache;
+    string m_queuedir;
+    string m_tmpdir;
+    DbIxStatusUpdater *m_updater;
+    bool indexFromCache(const string& udi);
 };
 
 #endif /* _beaglequeue_h_included_ */
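
Note: CirCache is only forward-declared here; in the .cpp above it stores, per udi, a small metadata dictionary (the ConfSimple text written by processone()) next to the raw document bytes, and indexFromCache() later reads both back. Below is a toy, purely in-memory stand-in for that put/get pairing; the class is invented for illustration, only the put(udi, dict, data)/get(udi, dict, data) shape comes from the diff, and the real CirCache is a size-capped circular file on disk.

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    using namespace std;

    // Toy stand-in: one (metadata dict, raw data) pair per udi.
    class ToyCache {
    public:
        bool put(const string& udi, const string& dict, const string& data) {
            m_entries[udi] = make_pair(dict, data);
            return true;
        }
        bool get(const string& udi, string& dict, string& data) const {
            map<string, pair<string, string> >::const_iterator it =
                m_entries.find(udi);
            if (it == m_entries.end())
                return false;
            dict = it->second.first;
            data = it->second.second;
            return true;
        }
    private:
        map<string, pair<string, string> > m_entries;
    };

    int main()
    {
        ToyCache cache;
        // The dict is plain "name = value" text, much like what
        // ConfSimple::write() produces in processone() above.
        cache.put("someudi",
                  "url = http://example.com/\nmimetype = text/html\n",
                  "<html>page bytes</html>");
        string dict, data;
        if (cache.get("someudi", dict, data))
            cout << dict << "--\n" << data << "\n";
        return 0;
    }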

View File

@@ -90,7 +90,7 @@ bool FsIndexer::init()
 }
 
 // Recursively index each directory in the topdirs:
-bool FsIndexer::index(bool resetbefore)
+bool FsIndexer::index()
 {
    list<string> topdirs = m_config->getTopdirs();
    if (topdirs.empty()) {
@@ -376,7 +376,6 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
    // We'll change the signature to ensure that the indexing will
    // be retried every time.
-
    // Internal access path for multi-document files
    if (ipath.empty())
        hadNullIpath = true;
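
Note: FsIndexer::index() no longer takes a resetbefore argument; the reset decision is now made once, when ConfIndexer::index() picks the database open mode (see the indexer.cpp hunks below). A minimal illustration of that selection, with a stand-in enum for Rcl::Db::OpenMode; only the DbUpd/DbTrunc ternary mirrors the diff, the rest is invented.

    #include <iostream>

    enum OpenMode { DbUpd, DbTrunc };   // stand-in for Rcl::Db::OpenMode

    static OpenMode pickMode(bool resetbefore)
    {
        // Truncate (full rebuild) only when a reset was requested,
        // otherwise open for incremental update.
        return resetbefore ? DbTrunc : DbUpd;
    }

    int main()
    {
        std::cout << (pickMode(false) == DbUpd) << "\n";   // prints 1
        std::cout << (pickMode(true) == DbTrunc) << "\n";  // prints 1
        return 0;
    }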

View File

@@ -55,7 +55,7 @@ class FsIndexer : public FsTreeWalkerCB {
      * We create the temporary directory, open the database,
      * then call a file system walk for each top-level directory.
      */
-    bool index(bool resetbefore);
+    bool index();
 
    /** Index a list of files. No db cleaning or stemdb updating */
    bool indexFiles(const std::list<string> &files);

View File

@@ -28,37 +28,63 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.71 2008-12-17 08:01:40 dockes Exp
 #include "debuglog.h"
 #include "indexer.h"
+#include "fsindexer.h"
+#include "beaglequeue.h"
 
 #ifdef RCL_USE_ASPELL
 #include "rclaspell.h"
 #endif
 
+ConfIndexer::ConfIndexer(RclConfig *cnf, DbIxStatusUpdater *updfunc)
+    : m_config(cnf), m_db(cnf), m_fsindexer(0),
+      m_dobeagle(false), m_beagler(0),
+      m_updater(updfunc)
+{
+    m_config->getConfParam("processbeaglequeue", &m_dobeagle);
+}
+
 ConfIndexer::~ConfIndexer()
 {
     deleteZ(m_fsindexer);
+    deleteZ(m_beagler);
 }
 
-bool ConfIndexer::index(bool resetbefore)
+bool ConfIndexer::index(bool resetbefore, ixType typestorun)
 {
    Rcl::Db::OpenMode mode = resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
    if (!m_db.open(mode)) {
-        LOGERR(("ConfIndexer: error opening database %s\n",
-                m_config->getDbDir().c_str()));
+        LOGERR(("ConfIndexer: error opening database %s : %s\n",
+                m_config->getDbDir().c_str(), m_db.getReason().c_str()));
        return false;
    }
 
    m_config->setKeyDir("");
-    m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
-    bool ret = m_fsindexer->index(resetbefore);
-    deleteZ(m_fsindexer);
-
-    if (m_updater) {
-        m_updater->status.fn.erase();
-        m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
-        m_updater->update();
-    }
-
-    // Get rid of all database entries that don't exist in the
-    // filesystem anymore.
-    m_db.purge();
+    if (typestorun & IxTFs) {
+        deleteZ(m_fsindexer);
+        m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
+        if (!m_fsindexer || !m_fsindexer->index()) {
+            return false;
+        }
+    }
+
+    if (m_dobeagle && (typestorun & IxTBeagleQueue)) {
+        deleteZ(m_beagler);
+        m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
+        if (!m_beagler || !m_beagler->index()) {
+            return false;
+        }
+    }
+
+    if (typestorun == IxTAll) {
+        // Get rid of all database entries that don't exist in the
+        // filesystem anymore. Only if all *configured* indexers ran.
+        if (m_updater) {
+            m_updater->status.fn.erase();
+            m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
+            m_updater->update();
+        }
+        m_db.purge();
+    }
 
    if (m_updater) {
        m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
@@ -78,17 +104,55 @@ bool ConfIndexer::index(bool resetbefore)
    return true;
 }
 
+bool ConfIndexer::initTopDirs()
+{
+    if (m_tdl.empty()) {
+        m_tdl = m_config->getTopdirs();
+        if (m_tdl.empty()) {
+            m_reason = "Top directory list (topdirs param.) "
+                "not found in config or Directory list parse error";
+            return false;
+        }
+    }
+    return true;
+}
+
 bool ConfIndexer::indexFiles(const std::list<string> &files)
 {
+    if (!initTopDirs())
+        return false;
+
+    list<string> myfiles;
+    for (list<string>::const_iterator it = files.begin();
+         it != files.end(); it++) {
+        string fn = path_canon(*it);
+        bool ok = false;
+        // Check that this file name belongs to one of our subtrees
+        for (list<string>::iterator dit = m_tdl.begin();
+             dit != m_tdl.end(); dit++) {
+            if (fn.find(*dit) == 0) {
+                myfiles.push_back(fn);
+                ok = true;
+                break;
+            }
+        }
+        if (!ok) {
+            m_reason += string("File ") + fn + string(" not in indexed area\n");
+        }
+    }
+    if (myfiles.empty())
+        return true;
+
    if (!m_db.open(Rcl::Db::DbUpd)) {
        LOGERR(("ConfIndexer: indexFiles error opening database %s\n",
                m_config->getDbDir().c_str()));
        return false;
    }
    m_config->setKeyDir("");
-    m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
+    if (!m_fsindexer)
+        m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
    bool ret = m_fsindexer->indexFiles(files);
-    deleteZ(m_fsindexer);
    // The close would be done in our destructor, but we want status here
    if (!m_db.close()) {
        LOGERR(("ConfIndexer::index: error closing database in %s\n",
@@ -100,15 +164,26 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
 
 bool ConfIndexer::purgeFiles(const std::list<string> &files)
 {
+    if (!initTopDirs())
+        return false;
+
+    list<string> myfiles;
+    for (list<string>::const_iterator it = files.begin();
+         it != files.end(); it++) {
+        myfiles.push_back(path_canon(*it));
+    }
+
    if (!m_db.open(Rcl::Db::DbUpd)) {
        LOGERR(("ConfIndexer: purgeFiles error opening database %s\n",
                m_config->getDbDir().c_str()));
        return false;
    }
    m_config->setKeyDir("");
-    m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
+    if (!m_fsindexer)
+        m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
    bool ret = m_fsindexer->purgeFiles(files);
-    deleteZ(m_fsindexer);
    // The close would be done in our destructor, but we want status here
    if (!m_db.close()) {
        LOGERR(("ConfIndexer::index: error closing database in %s\n",
@@ -159,7 +234,7 @@ bool ConfIndexer::createStemDb(const string &lang)
 // module, either from a configuration variable or the NLS environment.
 bool ConfIndexer::createAspellDict()
 {
-    LOGDEB2(("FsIndexer::createAspellDict()\n"));
+    LOGDEB2(("ConfIndexer::createAspellDict()\n"));
 #ifdef RCL_USE_ASPELL
    // For the benefit of the real-time indexer, we only initialize
    // noaspell from the configuration once. It can then be set to
@@ -180,14 +255,14 @@ bool ConfIndexer::createAspellDict()
    Aspell aspell(m_config);
    string reason;
    if (!aspell.init(reason)) {
-        LOGERR(("FsIndexer::createAspellDict: aspell init failed: %s\n",
+        LOGERR(("ConfIndexer::createAspellDict: aspell init failed: %s\n",
                reason.c_str()));
        noaspell = true;
        return false;
    }
-    LOGDEB(("FsIndexer::createAspellDict: creating dictionary\n"));
+    LOGDEB(("ConfIndexer::createAspellDict: creating dictionary\n"));
    if (!aspell.buildDict(m_db, reason)) {
-        LOGERR(("FsIndexer::createAspellDict: aspell buildDict failed: %s\n",
+        LOGERR(("ConfIndexer::createAspellDict: aspell buildDict failed: %s\n",
                reason.c_str()));
        noaspell = true;
        return false;
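
Note: the membership test that ConfIndexer::indexFiles() now applies (moved here from recollindex, see the last file below) is a plain string-prefix check of the canonical file name against each topdir. A standalone sketch with made-up paths; since it is a simple prefix test, it relies on the paths having been canonicalized first, as path_canon() does in the diff.

    #include <iostream>
    #include <list>
    #include <string>
    using namespace std;

    // Same idea as "if (fn.find(*dit) == 0)" in indexFiles() above.
    static bool inIndexedArea(const string& fn, const list<string>& tdl)
    {
        for (list<string>::const_iterator dit = tdl.begin();
             dit != tdl.end(); dit++) {
            if (fn.find(*dit) == 0)
                return true;
        }
        return false;
    }

    int main()
    {
        list<string> tdl;
        tdl.push_back("/home/me/docs");
        cout << inIndexedArea("/home/me/docs/a.txt", tdl) << "\n"; // 1
        cout << inIndexedArea("/tmp/b.txt", tdl) << "\n";          // 0
        return 0;
    }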

View File

@@ -29,10 +29,10 @@ using std::map;
 #endif
 
 #include "rclconfig.h"
-#include "fsindexer.h"
+#include "rcldb.h"
 
-/* Forward decl for lower level indexing object */
-class DbIndexer;
+class FsIndexer;
+class BeagleQueueIndexer;
 
 class DbIxStatus {
 public:
@@ -55,27 +55,20 @@ class DbIxStatusUpdater {
 };
 
 /**
-   The top level indexing object. Processes the configuration, then invokes
-   file system walking to populate/update the database(s).
-   Fiction:
-   Multiple top-level directories can be listed in the
-   configuration. Each can be indexed to a different
-   database. Directories are first grouped by database, then an
-   internal class (DbIndexer) is used to process each group.
-   Fact: we've had one db per config forever. The multidb/config code has been
-   kept around for no good reason, this fiction only affects indexer.cpp
+ * The top level indexing object. Processes the configuration, then invokes
+ * file system walking or other to populate/update the database(s).
  */
 class ConfIndexer {
 public:
    enum runStatus {IndexerOk, IndexerError};
-    ConfIndexer(RclConfig *cnf, DbIxStatusUpdater *updfunc = 0)
-        : m_config(cnf), m_db(cnf), m_fsindexer(0), m_updater(updfunc)
-    {}
+    ConfIndexer(RclConfig *cnf, DbIxStatusUpdater *updfunc = 0);
    virtual ~ConfIndexer();
 
-    /** Worker function: doe the actual indexing */
-    bool index(bool resetbefore = false);
+    // Indexer types. Maybe we'll have something more dynamic one day
+    enum ixType {IxTNone, IxTFs=1, IxTBeagleQueue=2,
+                 IxTAll = IxTFs | IxTBeagleQueue};
+    /** Run indexers */
+    bool index(bool resetbefore, ixType typestorun);
 
    const string &getReason() {return m_reason;}
 
@@ -101,8 +94,13 @@ class ConfIndexer {
    RclConfig *m_config;
    Rcl::Db m_db;
    FsIndexer *m_fsindexer;
-    DbIxStatusUpdater *m_updater;
+    bool m_dobeagle;
+    BeagleQueueIndexer *m_beagler;
+    DbIxStatusUpdater *m_updater;
    string m_reason;
+    list<string> m_tdl;
+    bool initTopDirs();
 };
 
 #endif /* _INDEXER_H_INCLUDED_ */
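
Note: a small standalone illustration of how the new ixType flags combine. IxTAll is the bitwise OR of the individual indexer bits, so ConfIndexer::index() can test "typestorun & IxTFs" to decide whether to run the filesystem pass, and only purges when typestorun == IxTAll. The enum values are copied from the header above; the driver code is invented for illustration.

    #include <iostream>

    enum ixType {IxTNone, IxTFs=1, IxTBeagleQueue=2,
                 IxTAll = IxTFs | IxTBeagleQueue};

    int main()
    {
        ixType typestorun = IxTAll;
        std::cout << ((typestorun & IxTFs) != 0) << "\n"; // 1: run the fs pass
        std::cout << (typestorun == IxTAll) << "\n";      // 1: purge allowed

        typestorun = IxTBeagleQueue;
        std::cout << ((typestorun & IxTFs) != 0) << "\n"; // 0: fs pass skipped
        std::cout << (typestorun == IxTAll) << "\n";      // 0: no purge
        return 0;
    }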

View File

@@ -83,66 +83,27 @@ static bool makeIndexer(RclConfig *config)
 {
    if (!confindexer)
        confindexer = new ConfIndexer(config, &updater);
-    return confindexer ? true : false;
+    if (!confindexer) {
+        cerr << "Cannot create indexer" << endl;
+        exit(1);
+    }
+    return true;
 }
 
-// The list of top directories/files wont change during program run,
-// let's cache it:
-static list<string> o_tdl;
-
 // Index a list of files. We just check that they belong to one of the
 // topdirs subtrees, and call the indexer method.
 //
 // This is called either from the command line or from the monitor. In
 // this case we're called repeatedly in the same process, and the
-// confindexer is only created once by makeIndexer (but the db is
-// flushed anyway)
+// confindexer is only created once by makeIndexer (but the db closed and
+// flushed every time)
 bool indexfiles(RclConfig *config, const list<string> &filenames)
 {
    if (filenames.empty())
        return true;
-    if (o_tdl.empty()) {
-        o_tdl = config->getTopdirs();
-        if (o_tdl.empty()) {
-            fprintf(stderr, "Top directory list (topdirs param.) "
-                    "not found in config or Directory list parse error");
-            return false;
-        }
-    }
-    list<string> myfiles;
-    for (list<string>::const_iterator it = filenames.begin();
-         it != filenames.end(); it++) {
-        string fn = path_canon(*it);
-        bool ok = false;
-        // Check that this file name belongs to one of our subtrees
-        for (list<string>::iterator dit = o_tdl.begin();
-             dit != o_tdl.end(); dit++) {
-            if (fn.find(*dit) == 0) {
-                myfiles.push_back(fn);
-                ok = true;
-                break;
-            }
-        }
-        if (!ok) {
-            fprintf(stderr, "File %s not in indexed area\n", fn.c_str());
-        }
-    }
-    if (myfiles.empty())
-        return true;
-    // Note: we should sort the file names against the topdirs here
-    // and check for different databases. But we can for now only have
-    // one database per config, so we set the keydir from the first
-    // file (which is not really needed...), create the indexer/db and
-    // go:
-    config->setKeyDir(path_getfather(*myfiles.begin()));
    if (!makeIndexer(config))
        return false;
-    return confindexer->indexFiles(myfiles);
+    return confindexer->indexFiles(filenames);
 }
 
 // Delete a list of files. Same comments about call contexts as indexfiles.
@@ -150,32 +111,9 @@ bool purgefiles(RclConfig *config, const list<string> &filenames)
 {
    if (filenames.empty())
        return true;
-    if (o_tdl.empty()) {
-        o_tdl = config->getTopdirs();
-        if (o_tdl.empty()) {
-            fprintf(stderr, "Top directory list (topdirs param.) "
-                    "not found in config or Directory list parse error");
-            return false;
-        }
-    }
-    list<string> myfiles;
-    for (list<string>::const_iterator it = filenames.begin();
-         it != filenames.end(); it++) {
-        myfiles.push_back(path_canon(*it));
-    }
-    // Note: we should sort the file names against the topdirs here
-    // and check for different databases. But we can for now only have
-    // one database per config, so we set the keydir from the first
-    // file (which is not really needed...), create the indexer/db and
-    // go:
-    config->setKeyDir(path_getfather(*myfiles.begin()));
    if (!makeIndexer(config))
        return false;
-    return confindexer->purgeFiles(myfiles);
+    return confindexer->purgeFiles(filenames);
 }
 
 // Create stemming and spelling databases
@@ -343,12 +281,14 @@ int main(int argc, const char **argv)
                filenames.push_back(*argv++);
            }
        }
+        bool status;
        if (op_flags & OPT_i)
-            exit(!indexfiles(config, filenames));
+            status = indexfiles(config, filenames);
        else
-            exit(!purgefiles(config, filenames));
+            status = purgefiles(config, filenames);
+        if (!confindexer->getReason().empty())
+            cerr << confindexer->getReason() << endl;
+        exit(status ? 0 : 1);
    } else if (op_flags & OPT_l) {
        if (argc != 0)
            Usage();
@@ -400,14 +340,15 @@ int main(int argc, const char **argv)
        exit(!confindexer->createAspellDict());
 #endif // ASPELL
    } else if (op_flags & OPT_b) {
-        BeagleQueueIndexer beagler(config);
-        bool status = beagler.processqueue();
-        return !status;
+        cerr << "Not yet" << endl;
+        return 1;
    } else {
        confindexer = new ConfIndexer(config, &updater);
-        bool status = confindexer->index(rezero);
+        bool status = confindexer->index(rezero, ConfIndexer::IxTAll);
        if (!status)
            cerr << "Indexing failed" << endl;
+        if (!confindexer->getReason().empty())
+            cerr << confindexer->getReason() << endl;
        return !status;
    }
 }
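
Note: the new error-reporting flow in the -i/-e paths is: the indexer accumulates human-readable messages in m_reason instead of printing from library code, and main() prints getReason() once and exits with status ? 0 : 1. A standalone sketch of that pattern; MiniIndexer and the "/home/me/docs" topdir are invented stand-ins for ConfIndexer and the configured topdirs.

    #include <iostream>
    #include <list>
    #include <string>
    using namespace std;

    // Invented stand-in: only demonstrates "accumulate messages,
    // report once at exit", not the real indexing logic.
    class MiniIndexer {
    public:
        bool indexFiles(const list<string>& files) {
            for (list<string>::const_iterator it = files.begin();
                 it != files.end(); it++) {
                if (it->find("/home/me/docs") != 0)
                    m_reason += string("File ") + *it +
                        string(" not in indexed area\n");
            }
            return true;   // rejected files are reported, not fatal
        }
        const string& getReason() const { return m_reason; }
    private:
        string m_reason;
    };

    int main()
    {
        MiniIndexer indexer;
        list<string> files;
        files.push_back("/home/me/docs/a.txt");
        files.push_back("/tmp/outside.txt");   // only produces a message

        bool status = indexer.indexFiles(files);
        if (!indexer.getReason().empty())
            cerr << indexer.getReason();
        return status ? 0 : 1;                 // exit(status ? 0 : 1) above
    }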