Separated out the cache access part from beaglequeueindexer. This avoids having to link the pure query programs with indexing code.

This commit is contained in:
Jean-Francois Dockes 2010-02-05 12:46:41 +01:00
parent de63a00540
commit b87a23bfca
10 changed files with 146 additions and 76 deletions

View File

@ -0,0 +1,60 @@
#include "autoconfig.h"
#include "beaglequeuecache.h"
#include "circache.h"
#include "debuglog.h"
#include "rclconfig.h"
#include "pathut.h"
#include "rcldoc.h"
BeagleQueueCache::BeagleQueueCache(RclConfig *cnf)
{
string ccdir;
cnf->getConfParam("webcachedir", ccdir);
if (ccdir.empty())
ccdir = "webcache";
ccdir = path_tildexpand(ccdir);
// If not an absolute path, compute relative to config dir
if (ccdir.at(0) != '/')
ccdir = path_cat(cnf->getConfDir(), ccdir);
int maxmbs = 40;
cnf->getConfParam("webcachemaxmbs", &maxmbs);
m_cache = new CirCache(ccdir);
m_cache->create(off_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE);
}
BeagleQueueCache::~BeagleQueueCache()
{
delete m_cache;
}
// Read document from cache. Return the metadata as an Rcl::Doc
// @param htt Beagle Hit Type
bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc,
string& data, string *htt)
{
string dict;
if (!m_cache->get(udi, dict, data))
return false;
ConfSimple cf(dict, 1);
if (htt)
cf.get(Rcl::Doc::keybght, *htt, "");
// Build a doc from saved metadata
cf.get("url", dotdoc.url, "");
cf.get("mimetype", dotdoc.mimetype, "");
cf.get("fmtime", dotdoc.fmtime, "");
cf.get("fbytes", dotdoc.fbytes, "");
dotdoc.sig = "";
list<string> names = cf.getNames("");
for (list<string>::const_iterator it = names.begin();
it != names.end(); it++) {
cf.get(*it, dotdoc.meta[*it], "");
}
dotdoc.meta[Rcl::Doc::keyudi] = udi;
return true;
}

View File

@ -0,0 +1,34 @@
#ifndef _beaglequeuecache_h_included_
#define _beaglequeuecache_h_included_
/* @(#$Id: $ (C) 2009 J.F.Dockes */
#include <string>
using std::string;
class RclConfig;
namespace Rcl {
class Db;
class Doc;
}
class CirCache;
/**
 * Manage the CirCache for the Beagle Queue indexer. Separated from the main
 * indexer code because it's also used for querying (getting the data for a
 * preview).
 *
 * The object owns the CirCache it allocates and deletes it in its
 * destructor, so copying is disabled: a copy would delete the same
 * CirCache twice.
 */
class BeagleQueueCache {
public:
    /** Set up the cache from the "webcachedir" and "webcachemaxmbs"
     *  configuration parameters. */
    BeagleQueueCache(RclConfig *config);
    ~BeagleQueueCache();

    /**
     * Read the entry stored under udi.
     * @param udi key of the entry
     * @param doc output: document built from the stored metadata
     * @param data output: data stored with the entry
     * @param hittype if not null, receives the Beagle hit type
     * @return false if the entry could not be read from the cache
     */
    bool getFromCache(const string& udi, Rcl::Doc &doc, string& data,
                      string *hittype = 0);

    // We could write proxies for all the circache ops, but why bother?
    // The returned pointer stays owned by this object.
    CirCache *cc() {return m_cache;}

private:
    CirCache *m_cache;

    // Declared but not implemented: this class is not copyable
    BeagleQueueCache(const BeagleQueueCache&);
    BeagleQueueCache& operator=(const BeagleQueueCache&);
};
#endif /* _beaglequeuecache_h_included_ */

View File

@ -27,11 +27,12 @@ static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
#include "debuglog.h"
#include "fstreewalk.h"
#include "beaglequeue.h"
#include "beaglequeuecache.h"
#include "circache.h"
#include "smallut.h"
#include "fileudi.h"
#include "internfile.h"
#include "wipedir.h"
#include "circache.h"
#include "indexer.h"
#include "readfile.h"
#include "conftree.h"
@ -45,8 +46,6 @@ using namespace std;
#include <sys/stat.h>
const string keybght("beagleHitType");
// Beagle creates a file named .xxx (where xxx is the name for the main file
// in the queue), to hold external metadata (http or created by Beagle).
@ -100,7 +99,7 @@ public:
doc.url = line;
if (!readLine(line))
return false;
doc.meta[keybght] = line;
doc.meta[Rcl::Doc::keybght] = line;
if (!readLine(line))
return false;
doc.mimetype = line;
@ -108,7 +107,7 @@ public:
// We set the bookmarks mtype as html (the text is empty
// anyway), so that the html viewer will be called on 'Open'
bool isbookmark = false;
if (!stringlowercmp("bookmark", doc.meta[keybght])) {
if (!stringlowercmp("bookmark", doc.meta[Rcl::Doc::keybght])) {
isbookmark = true;
doc.mimetype = "text/html";
}
@ -197,20 +196,7 @@ BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
m_tmpdir = badtmpdirname;
}
}
string ccdir;
m_config->getConfParam("webcachedir", ccdir);
if (ccdir.empty())
ccdir = "webcache";
ccdir = path_tildexpand(ccdir);
// If not an absolute path, compute relative to config dir
if (ccdir.at(0) != '/')
ccdir = path_cat(m_config->getConfDir(), ccdir);
int maxmbs = 40;
m_config->getConfParam("webcachemaxmbs", &maxmbs);
m_cache = new CirCache(ccdir);
m_cache->create(off_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE);
m_cache = new BeagleQueueCache(cnf);
}
BeagleQueueIndexer::~BeagleQueueIndexer()
@ -226,36 +212,6 @@ BeagleQueueIndexer::~BeagleQueueIndexer()
deleteZ(m_cache);
}
// Read document from cache. Return the metadata as an Rcl::Doc
// @param htt Beagle Hit Type
bool BeagleQueueIndexer::getFromCache(const string& udi, Rcl::Doc &dotdoc,
string& data, string *htt)
{
string dict;
if (!m_cache->get(udi, dict, data))
return false;
ConfSimple cf(dict, 1);
if (htt)
cf.get(keybght, *htt, "");
// Build a doc from saved metadata
cf.get("url", dotdoc.url, "");
cf.get("mimetype", dotdoc.mimetype, "");
cf.get("fmtime", dotdoc.fmtime, "");
cf.get("fbytes", dotdoc.fbytes, "");
dotdoc.sig = "";
list<string> names = cf.getNames("");
for (list<string>::const_iterator it = names.begin();
it != names.end(); it++) {
cf.get(*it, dotdoc.meta[*it], "");
}
dotdoc.meta[Rcl::Doc::keyudi] = udi;
return true;
}
// Index document stored in the cache.
bool BeagleQueueIndexer::indexFromCache(const string& udi)
{
@ -268,7 +224,7 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
string data;
string hittype;
if (!getFromCache(udi, dotdoc, data, &hittype))
if (!m_cache || !m_cache->getFromCache(udi, dotdoc, data, &hittype))
return false;
if (hittype.empty()) {
@ -280,11 +236,11 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
// Just index the dotdoc
dotdoc.meta[Rcl::Doc::keybcknd] = "BGL";
return m_db->addOrUpdate(udi, "", dotdoc);
} else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
} else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) ||
(dotdoc.mimetype.compare("text/html") &&
dotdoc.mimetype.compare("text/plain"))) {
LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str()));
return true;
} else {
Rcl::Doc doc;
@ -320,20 +276,24 @@ bool BeagleQueueIndexer::index()
return false;
LOGDEB(("BeagleQueueIndexer::processqueue: [%s]\n", m_queuedir.c_str()));
m_config->setKeyDir(m_queuedir);
if (!m_cache || !m_cache->cc()) {
LOGERR(("BeagleQueueIndexer: cache initialization failed\n"));
return false;
}
CirCache *cc = m_cache->cc();
// First check/index files found in the cache. If the index was reset,
// this actually does work, else it sets the existence flags (avoid
// purging). We don't do this when called from indexFiles
if (!m_nocacheindex) {
bool eof;
if (!m_cache->rewind(eof)) {
if (!cc->rewind(eof)) {
// rewind can return eof if the cache is empty
if (!eof)
return false;
}
while (m_cache->next(eof)) {
while (cc->next(eof)) {
string udi;
if (!m_cache->getCurrentUdi(udi)) {
if (!cc->getCurrentUdi(udi)) {
LOGERR(("BeagleQueueIndexer:: cache file damaged\n"));
break;
}
@ -438,7 +398,7 @@ BeagleQueueIndexer::processone(const string &path,
// Have to use the hit type for the udi, because the same url can exist
// as a bookmark or a page.
udipath = path_cat(dotdoc.meta[keybght], url_gpath(dotdoc.url));
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
make_udi(udipath, "", udi);
LOGDEB(("BeagleQueueIndexer: prc1: udi [%s]\n", udi.c_str()));
@ -446,7 +406,7 @@ BeagleQueueIndexer::processone(const string &path,
sprintf(ascdate, "%ld", long(stp->st_mtime));
// We only process bookmarks or text/html and text/plain files.
if (!stringlowercmp("bookmark", dotdoc.meta[keybght])) {
if (!stringlowercmp("bookmark", dotdoc.meta[Rcl::Doc::keybght])) {
// For bookmarks, we just index the doc that was built from the
// metadata.
if (dotdoc.fmtime.empty())
@ -463,11 +423,11 @@ BeagleQueueIndexer::processone(const string &path,
if (!m_db->addOrUpdate(udi, "", dotdoc))
return FsTreeWalker::FtwError;
} else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
} else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) ||
(dotdoc.mimetype.compare("text/html") &&
dotdoc.mimetype.compare("text/plain"))) {
LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str()));
// Unlink them anyway
dounlink = true;
goto out;
@ -516,9 +476,13 @@ BeagleQueueIndexer::processone(const string &path,
dotfile.m_fields.set("udi", udi, "");
string fdata;
file_to_string(path, fdata);
if (!m_cache->put(udi, &dotfile.m_fields, fdata, 0)) {
if (!m_cache || !m_cache->cc()) {
LOGERR(("BeagleQueueIndexer: cache initialization failed\n"));
goto out;
}
if (!m_cache->cc()->put(udi, &dotfile.m_fields, fdata, 0)) {
LOGERR(("BeagleQueueIndexer::prc1: cache_put failed; %s\n",
m_cache->getReason().c_str()));
m_cache->cc()->getReason().c_str()));
goto out;
}
}

View File

@ -33,6 +33,7 @@
class DbIxStatusUpdater;
class CirCache;
class RclConfig;
class BeagleQueueCache;
namespace Rcl {
class Db;
}
@ -63,7 +64,7 @@ public:
private:
RclConfig *m_config;
Rcl::Db *m_db;
CirCache *m_cache;
BeagleQueueCache *m_cache;
string m_queuedir;
string m_tmpdir;
DbIxStatusUpdater *m_updater;

View File

@ -45,7 +45,7 @@ using namespace std;
#include "rclconfig.h"
#include "mh_html.h"
#include "fileudi.h"
#include "beaglequeue.h"
#include "beaglequeuecache.h"
#include "cancelcheck.h"
#ifdef RCL_USE_XATTR
@ -390,14 +390,10 @@ FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf,
}
init(fn, &st, cnf, td, flags, &idoc.mimetype);
} else if (!backend.compare("BGL")) {
// Retrieve from our webcache (beagle data). There should
// probably be a separate object type for readonly cache
// access (distinct from the one used for indexing).
// Anyway, we're not called in the same thread as indexing ops, and
// even, at worse, this would duplicate the memory used. The beagler
// object is created at the first call of this routine and deleted
// when the program exits.
static BeagleQueueIndexer beagler(cnf);
// Retrieve from our webcache (beagle data). The beagler
// object is created at the first call of this routine and
// deleted when the program exits.
static BeagleQueueCache beagler(cnf);
string data;
Rcl::Doc dotdoc;
map<string,string>::const_iterator it =

View File

@ -494,6 +494,7 @@ bool MimeHandlerMbox::next_document()
#include <string>
using namespace std;
#include "rclconfig.h"
#include "rclinit.h"
#include "mh_mbox.h"
@ -508,6 +509,11 @@ Usage(void)
fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
exit(1);
}
static RclConfig *config;
// Return the file-static 'config' pointer. It is only assigned in main(),
// from the result of recollinit(), so this returns null before that point.
RclConfig *RclConfig::getMainConfig()
{
return config;
}
int main(int argc, char **argv)
{
@ -530,8 +536,8 @@ int main(int argc, char **argv)
Usage();
string filename = *argv++;argc--;
string reason;
RclConfig *conf = recollinit(RclInitFlags(0), 0, 0, reason, 0);
if (conf == 0) {
config = recollinit(RclInitFlags(0), 0, 0, reason, 0);
if (config == 0) {
cerr << "init failed " << reason << endl;
exit(1);
}

View File

@ -6,8 +6,8 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
OBJS = rclaspell.o beaglequeuecache.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o
@ -17,6 +17,8 @@ unac.o : $(depth)/unac/unac.c $(depth)/unac/unac.h $
$(CC) $(ALL_CXXFLAGS) -c $(depth)/unac/unac.c
rclaspell.o : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../aspell/rclaspell.cpp
beaglequeuecache.o : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../common/beaglequeuecache.cpp
rclconfig.o : ../common/rclconfig.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../common/rclconfig.cpp
rclinit.o : ../common/rclinit.cpp $(depth)/mk/localdefs
@ -158,6 +160,9 @@ distclean: clean
rclaspell.dep.stamp : ../aspell/rclaspell.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../aspell/rclaspell.cpp > rclaspell.dep
touch rclaspell.dep.stamp
beaglequeuecache.dep.stamp : ../common/beaglequeuecache.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../common/beaglequeuecache.cpp > beaglequeuecache.dep
touch beaglequeuecache.dep.stamp
rclconfig.dep.stamp : ../common/rclconfig.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../common/rclconfig.cpp > rclconfig.dep
touch rclconfig.dep.stamp
@ -327,6 +332,7 @@ x11mon.dep.stamp : ../utils/x11mon.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../utils/x11mon.cpp > x11mon.dep
touch x11mon.dep.stamp
include rclaspell.dep
include beaglequeuecache.dep
include rclconfig.dep
include rclinit.dep
include textsplit.dep

View File

@ -5,6 +5,7 @@ depth=${depth:-..}
SRC_CPP="\
${depth}/aspell/rclaspell.cpp \
${depth}/common/beaglequeuecache.cpp \
${depth}/common/rclconfig.cpp \
${depth}/common/rclinit.cpp \
${depth}/common/textsplit.cpp \

View File

@ -43,6 +43,7 @@ namespace Rcl {
const string Doc::keybcknd("rclbes");
const string Doc::keyudi("rcludi");
const string Doc::keyapptg("rclaptg");
const string Doc::keybght("beagleHitType");
void Doc::dump(bool dotext) const
{

View File

@ -164,6 +164,7 @@ class Doc {
// udi back from index. Only set by Rcl::Query::getdoc().
static const string keyudi;
static const string keyapptg; // apptag. Set from localfields (fsindexer)
static const string keybght; // beagle hit type ("beagleHitType")
};