separated rcldb and rclquery

This commit is contained in:
dockes 2008-06-13 18:22:47 +00:00
parent e5e8249ad3
commit 0e7a78d688
20 changed files with 707 additions and 529 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.7 2007-11-09 15:46:17 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.8 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
@ -109,18 +109,20 @@ void RecollProtocol::get(const KURL & url)
RefCntr<Rcl::SearchData> sdata(new Rcl::SearchData(Rcl::SCLT_OR));
sdata->addClause(new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
(const char *)u8));
if (!m_rcldb->setQuery(sdata, Rcl::Db::QO_STEM, "english")) {
Rcl::Query *query = new Rcl::Query(m_rcldb);
if (!query->setQuery(sdata, Rcl::Db::QO_STEM, "english")) {
m_reason = "Internal Error: setQuery failed";
outputError(m_reason.c_str());
finished();
delete query;
return;
}
if (m_docsource)
delete m_docsource;
m_docsource = new DocSequenceDb(m_rcldb, "Query results", sdata);
m_docsource = new DocSequenceDb(RefCntr<Rcl::Query>(query),
"Query results", sdata);
QByteArray output;
QTextStream os(output, IO_WriteOnly );

View File

@ -8,8 +8,8 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o
@ -71,6 +71,8 @@ pathhash.o : ../rcldb/pathhash.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp
rcldb.o : ../rcldb/rcldb.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
rclquery.o : ../rcldb/rclquery.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp
searchdata.o : ../rcldb/searchdata.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp
stemdb.o : ../rcldb/stemdb.cpp
@ -194,6 +196,9 @@ pathhash.dep.stamp : ../rcldb/pathhash.cpp
rcldb.dep.stamp : ../rcldb/rcldb.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
touch rcldb.dep.stamp
rclquery.dep.stamp : ../rcldb/rclquery.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep
touch rclquery.dep.stamp
searchdata.dep.stamp : ../rcldb/searchdata.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/searchdata.cpp > searchdata.dep
touch searchdata.dep.stamp
@ -275,6 +280,7 @@ include wasastringtoquery.dep
include wasatorcl.dep
include pathhash.dep
include rcldb.dep
include rclquery.dep
include searchdata.dep
include stemdb.dep
include stoplist.dep

View File

@ -31,6 +31,7 @@ ${depth}/query/wasastringtoquery.cpp \
${depth}/query/wasatorcl.cpp \
${depth}/rcldb/pathhash.cpp \
${depth}/rcldb/rcldb.cpp \
${depth}/rcldb/rclquery.cpp \
${depth}/rcldb/searchdata.cpp \
${depth}/rcldb/stemdb.cpp \
${depth}/rcldb/stoplist.cpp \

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.2 2008-05-27 10:45:59 dockes Exp $ (C) 2007 J.F.Dockes";
static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.3 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes";
#endif
#include <Python.h>
@ -11,6 +11,7 @@ using namespace std;
#include "rclinit.h"
#include "rclconfig.h"
#include "rcldb.h"
#include "rclquery.h"
#include "pathut.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
@ -31,7 +32,7 @@ recollq_question(PyObject *self, PyObject *args)
string reason;
string dbdir = config->getDbDir();
rcldb.open(dbdir, config->getStopfile(),
Rcl::Db::DbRO, Rcl::Db::QO_STEM);
Rcl::Db::DbRO);
Rcl::SearchData *sd = wasaStringToRcl(qs, reason);
if (!sd) {
@ -40,8 +41,9 @@ recollq_question(PyObject *self, PyObject *args)
}
RefCntr<Rcl::SearchData> rq(sd);
rcldb.setQuery(rq, Rcl::Db::QO_STEM);
int cnt = rcldb.getResCnt();
RefCntr<Rcl::Query> query(new Rcl::Query(&rcldb));
query->setQuery(rq, Rcl::Query::QO_STEM);
int cnt = query->getResCnt();
cout << "Recoll query: " << rq->getDescription() << endl;
if (cnt <= limit)
cout << cnt << " results" << endl;
@ -51,7 +53,7 @@ recollq_question(PyObject *self, PyObject *args)
for (int i = 0; i < limit; i++) {
int pc;
Rcl::Doc doc;
if (!rcldb.getDoc(i, doc, &pc))
if (!query->getDoc(i, doc, &pc))
break;
char cpc[20];
sprintf(cpc, "%d", pc);

View File

@ -3,8 +3,6 @@ from distutils.core import setup, Extension
module1 = Extension('recollq',
define_macros = [('MAJOR_VERSION', '1'),
('MINOR_VERSION', '0'),
('HAVE_MKDTEMP', '1'),
('HAVE_VASPRINTF', '1'),
('UNAC_VERSION', '"1.0.7"'),
('STATFS_INCLUDE', '"sys/mount.h"'),
('RECOLL_DATADIR',
@ -27,6 +25,7 @@ module1 = Extension('recollq',
'../query/wasastringtoquery.cpp',
'../query/wasatorcl.cpp',
'../rcldb/rcldb.cpp',
'../rcldb/rclquery.cpp',
'../rcldb/searchdata.cpp',
'../rcldb/stemdb.cpp',
'../rcldb/pathhash.cpp',

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: main.cpp,v 1.66 2008-02-19 08:02:20 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: main.cpp,v 1.67 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -100,9 +100,6 @@ bool maybeOpenDb(string &reason, bool force)
return false;
}
int qopts = Rcl::Db::QO_NONE;
if (prefs.queryStemLang.length() > 0)
qopts |= Rcl::Db::QO_STEM;
if (force)
rcldb->close();
rcldb->rmQueryDb("");
@ -112,7 +109,7 @@ bool maybeOpenDb(string &reason, bool force)
rcldb->addQueryDb(*it);
}
if (!rcldb->isopen() && !rcldb->open(dbdir, rclconfig->getStopfile(),
Rcl::Db::DbRO, qopts)) {
Rcl::Db::DbRO)) {
reason = "Could not open database in " +
dbdir + " wait for indexing to complete?";
return false;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.48 2008-02-19 08:02:01 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.49 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -440,23 +440,25 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
int qopts = 0;
if (!prefs.queryStemLang.length() == 0)
qopts |= Rcl::Db::QO_STEM;
qopts |= Rcl::Query::QO_STEM;
QApplication::setOverrideCursor(QCursor(Qt::WaitCursor));
string stemLang = (const char *)prefs.queryStemLang.ascii();
if (stemLang == "ALL") {
rclconfig->getConfParam("indexstemminglanguages", stemLang);
}
Rcl::Query *query = new Rcl::Query(rcldb);
if (!rcldb->setQuery(sdata, qopts, stemLang)) {
if (!query || !query->setQuery(sdata, qopts, stemLang)) {
QMessageBox::warning(0, "Recoll", tr("Cant start query: ") +
QString::fromAscii(rcldb->getReason().c_str()));
QString::fromAscii(query->getReason().c_str()));
QApplication::restoreOverrideCursor();
return;
}
curPreview = 0;
DocSequenceDb *src =
new DocSequenceDb(rcldb, string(tr("Query results").utf8()), sdata);
new DocSequenceDb(RefCntr<Rcl::Query>(query),
string(tr("Query results").utf8()), sdata);
m_docSource = RefCntr<DocSequence>(src);
m_searchData = sdata;
setDocSequence();
@ -921,7 +923,8 @@ void RclMain::docExpand(int docnum)
if (!resList->getDoc(docnum, doc))
return;
list<string> terms;
terms = rcldb->expand(doc);
if (!m_docSource.isNull())
terms = m_docSource->expand(doc);
if (terms.empty())
return;
// Do we keep the original query. I think we'd better not.

View File

@ -16,7 +16,7 @@
*/
#ifndef _DOCSEQ_H_INCLUDED_
#define _DOCSEQ_H_INCLUDED_
/* @(#$Id: docseq.h,v 1.12 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: docseq.h,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
#include <vector>
@ -89,7 +89,7 @@ class DocSequence {
vector<int>& gslks) const {
terms.clear(); groups.clear(); gslks.clear(); return true;
}
virtual list<string> expand(Rcl::Doc &) {list<string> e; return e;}
private:
string m_title;
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.4 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -23,28 +23,53 @@ static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp
#include "docseqdb.h"
#include "rcldb.h"
DocSequenceDb::DocSequenceDb(RefCntr<Rcl::Query> q, const string &t,
RefCntr<Rcl::SearchData> sdata)
: DocSequence(t), m_q(q), m_sdata(sdata), m_rescnt(-1)
{
}
DocSequenceDb::~DocSequenceDb()
{
}
// Retrieve the search terms and term groups for this sequence.
// The call is forwarded unchanged to the SearchData object this sequence
// was built with; the three output containers are filled by it and its
// return value is passed back to the caller.
bool DocSequenceDb::getTerms(vector<string>& terms,
vector<vector<string> >& groups,
vector<int>& gslks) const
{
// getptr() is used rather than operator-> because this method is const
// (presumably operator-> is not available on a const RefCntr -- confirm)
return m_sdata.getptr()->getTerms(terms, groups, gslks);
}
string DocSequenceDb::getDescription()
{
return m_sdata->getDescription();
}
bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, int *percent, string *sh)
{
if (sh) sh->erase();
return m_db ? m_db->getDoc(num, doc, percent) : false;
return m_q->getDoc(num, doc, percent);
}
int DocSequenceDb::getResCnt()
{
if (!m_db)
return -1;
if (m_rescnt < 0) {
m_rescnt= m_db->getResCnt();
m_rescnt= m_q->getResCnt();
}
return m_rescnt;
}
string DocSequenceDb::getAbstract(Rcl::Doc &doc)
{
if (!m_db)
if (!m_q->whatDb())
return doc.meta["abstract"];
string abstract;
m_db->makeDocAbstract(doc, abstract);
m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), abstract);
return abstract.empty() ? doc.meta["abstract"] : abstract;
}
// Query expansion for a result document: forwarded to the Rcl::Query
// object which produced this sequence, whose result is returned as is.
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
return m_q->expand(doc);
}

View File

@ -16,33 +16,30 @@
*/
#ifndef _DOCSEQDB_H_INCLUDED_
#define _DOCSEQDB_H_INCLUDED_
/* @(#$Id: docseqdb.h,v 1.2 2007-01-19 15:22:50 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: docseqdb.h,v 1.3 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include "docseq.h"
#include "refcntr.h"
#include "searchdata.h"
#include "rclquery.h"
/** A DocSequence from a Db query (there should be one active for this
to make sense) */
class DocSequenceDb : public DocSequence {
public:
DocSequenceDb(Rcl::Db *d, const string &t, RefCntr<Rcl::SearchData> sdata)
: DocSequence(t), m_db(d), m_sdata(sdata), m_rescnt(-1)
{}
virtual ~DocSequenceDb() {}
DocSequenceDb(RefCntr<Rcl::Query> q, const string &t,
RefCntr<Rcl::SearchData> sdata);
virtual ~DocSequenceDb();
virtual bool getDoc(int num, Rcl::Doc &doc, int *percent, string * = 0);
virtual int getResCnt();
virtual bool getTerms(vector<string>& terms,
vector<vector<string> >& groups,
vector<int>& gslks) const {
return m_sdata.getptr()->getTerms(terms, groups, gslks);
}
vector<int>& gslks) const;
virtual string getAbstract(Rcl::Doc &doc);
virtual string getDescription() {return m_sdata->getDescription();}
virtual string getDescription();
virtual list<string> expand(Rcl::Doc &doc);
private:
Rcl::Db *m_db;
RefCntr<Rcl::Query> m_q;
RefCntr<Rcl::SearchData> m_sdata;
int m_rescnt;
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.12 2007-12-13 06:58:21 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -32,6 +32,7 @@ static char rcsid[] = "@(#$Id: recollq.cpp,v 1.12 2007-12-13 06:58:21 dockes Exp
using namespace std;
#include "rcldb.h"
#include "rclquery.h"
#include "rclconfig.h"
#include "pathut.h"
#include "rclinit.h"
@ -132,8 +133,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
exit(1);
}
dbdir = rclconfig->getDbDir();
rcldb.open(dbdir, rclconfig->getStopfile(),
Rcl::Db::DbRO, Rcl::Db::QO_STEM);
rcldb.open(dbdir, rclconfig->getStopfile(), Rcl::Db::DbRO);
Rcl::SearchData *sd = 0;
@ -166,8 +166,9 @@ int recollq(RclConfig **cfp, int argc, char **argv)
}
RefCntr<Rcl::SearchData> rq(sd);
rcldb.setQuery(rq, Rcl::Db::QO_STEM);
int cnt = rcldb.getResCnt();
Rcl::Query query(&rcldb);
query.setQuery(rq, Rcl::Query::QO_STEM);
int cnt = query.getResCnt();
if (!(op_flags & OPT_b)) {
cout << "Recoll query: " << rq->getDescription() << endl;
if (cnt <= limit)
@ -180,7 +181,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
for (int i = 0; i < limit; i++) {
int pc;
Rcl::Doc doc;
if (!rcldb.getDoc(i, doc, &pc))
if (!query.getDoc(i, doc, &pc))
break;
if (op_flags & OPT_b) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.132 2008-05-20 10:09:54 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.133 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -36,6 +36,7 @@ using namespace std;
#include "rclconfig.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "stemdb.h"
#include "textsplit.h"
#include "transcode.h"
@ -47,8 +48,9 @@ using namespace std;
#include "pathhash.h"
#include "utf8iter.h"
#include "searchdata.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "xapian.h"
#ifndef MAX
#define MAX(A,B) (A>B?A:B)
@ -88,125 +90,8 @@ namespace Rcl {
const static string rclSyntAbs = "?!#@";
const static string emptystring;
// A class for data and methods that would have to expose
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
// 2 different ones for indexing or query as there is not much in
// common.
class Native {
public:
Db *m_db;
bool m_isopen;
bool m_iswritable;
// Indexing
Xapian::WritableDatabase wdb;
// Querying
Xapian::Database db;
Xapian::Query query; // query descriptor: terms and subqueries
// joined by operators (or/and etc...)
// Filtering results on location. There are 2 possible approaches
// for this:
// - Set a "MatchDecider" to be used by Xapian during the query
// - Filter the results out of Xapian (this also uses a
// Xapian::MatchDecider object, but applied to the results by Recoll.
//
// The result filtering approach was the first implemented.
//
// The efficiency of both methods depends on the searches, so the code
// for both has been kept. A nice point for the Xapian approach is that
// the result count estimates are correct (they are wrong with
// the postfilter approach). It is also faster in some worst case scenarios
// so this is now the default (but the post-filtering is faster in many common
// cases).
//
// Which is used is decided in SetQuery(), by setting either of
// the two following members. This in turn is controlled by a
// preprocessor directive.
#define XAPIAN_FILTERING 1
Xapian::MatchDecider *decider; // Xapian does the filtering
Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
Xapian::Enquire *enquire; // Open query descriptor.
Xapian::MSet mset; // Partial result set
// Term frequencies for current query. See makeAbstract, setQuery
map<string, double> m_termfreqs;
Native(Db *db)
: m_db(db),
m_isopen(false), m_iswritable(false), decider(0), postfilter(0),
enquire(0)
{ }
~Native() {
delete decider;
delete postfilter;
delete enquire;
}
string makeAbstract(Xapian::docid id, const list<string>& terms);
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
/** Compute list of subdocuments for a given path (given by hash)
* We look for all Q terms beginning with the path/hash
* As suggested by James Aylett, a better method would be to add
* a single term (ie: XP/path/to/file) to all subdocs, then finding
* them would be a simple matter of retrieving the posting list for the
* term. There would still be a need for the current Qterm though, as a
* unique term for replace_document, and for retrieving by
* path/ipath (history)
*/
bool subDocs(const string &hash, vector<Xapian::docid>& docids);
};
/**
 * Xapian match decider accepting only the documents whose url falls
 * under a given top directory. The static counter m_cnt is incremented
 * for every document examined and is only used in the debug traces.
 */
class FilterMatcher : public Xapian::MatchDecider {
public:
    FilterMatcher(const string &topdir) : m_topdir(topdir) {}

    virtual ~FilterMatcher() {}

    // The return type of the decider operator depends on the Xapian version
#if XAPIAN_MAJOR_VERSION < 1
    virtual int
#else
    virtual bool
#endif
    operator()(const Xapian::Document &xdoc) const
    {
        ++m_cnt;

        // The document data record is parsed as a configuration-style
        // set of name/value pairs, out of which we only need the url
        string record = xdoc.get_data();
        ConfSimple fields(&record);
        string url;
        fields.get(string("url"), url);
        LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
                 m_topdir.c_str(), url.c_str()));

        // The only filtering for now is on file path (subtree): the top
        // directory must appear at offset 7 in the url
        // (presumably just after a "file://" prefix -- confirm).
        const bool inSubtree = (url.find(m_topdir, 7) == 7);
        if (inSubtree) {
            LOGDEB2(("FilterMatcher: MATCH %d\n", m_cnt));
        } else {
            LOGDEB2(("FilterMatcher: NO MATCH %d\n", m_cnt));
        }
        return inSubtree;
    }

    static int m_cnt;

private:
    string m_topdir;
};
int FilterMatcher::m_cnt;
/* See comment in class declaration */
bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
bool Db::Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
{
docids.clear();
string qterm = "Q"+ hash + "|";
@ -250,7 +135,7 @@ bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
}
// Turn data record from db into document fields
bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
{
LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
ConfSimple parms(&data);
@ -306,26 +191,29 @@ static list<string> noPrefixList(const list<string>& in)
// Build a document abstract by extracting text chunks around the query terms
// This uses the db termlists, not the original document.
string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
{
Chrono chron;
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
list<string> iterms;
query->getQueryTerms(iterms);
list<string> terms = noPrefixList(iterms);
if (terms.empty()) {
return "";
}
// Retrieve db-wide frequencies for the query terms
if (m_termfreqs.empty()) {
if (query->m_nq->termfreqs.empty()) {
double doccnt = db.get_doccount();
if (doccnt == 0) doccnt = 1;
for (list<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) {
m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
query->m_nq->termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
m_termfreqs[*qit]));
query->m_nq->termfreqs[*qit]));
}
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
}
@ -343,7 +231,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
Xapian::TermIterator term = db.termlist_begin(docid);
term.skip_to(*qit);
if (term != db.termlist_end(docid) && *term == *qit) {
double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
double q = (term.get_wdf() / doclen) * query->m_nq->termfreqs[*qit];
q = -log10(q);
if (q < 3) {
q = 0.05;
@ -556,7 +444,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
/* Rcl::Db methods ///////////////////////////////// */
Db::Db()
: m_ndb(0), m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
: m_ndb(0), m_idxAbsTruncLen(250), m_synthAbsLen(250),
m_synthAbsWordCtxLen(4), m_flushMb(-1),
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0),
m_maxFsOccupPc(0), m_mode(Db::DbRO)
@ -586,28 +474,9 @@ Db::~Db()
return res;
}
// Generic Xapian exception catching code. We do this quite often,
// and I have no idea how to do this except for a macro
#define XCATCHERROR(MSG) \
catch (const Xapian::Error &e) { \
MSG = e.get_msg(); \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (const string &s) { \
MSG = s; \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (const char *s) { \
MSG = s; \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (...) { \
MSG = "Caught unknown xapian exception"; \
}
bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops)
bool Db::open(const string& dir, const string &stops, OpenMode mode,
bool keep_updated)
{
bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
qops &= ~QO_KEEP_UPDATED;
if (m_ndb == 0)
return false;
LOGDEB(("Db::open: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen,
@ -724,7 +593,7 @@ bool Db::reOpen()
if (m_ndb && m_ndb->m_isopen) {
if (!close())
return false;
if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) {
if (!open(m_basedir, "", m_mode, true)) {
return false;
}
}
@ -1467,64 +1336,6 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
return true;
}
// Prepare query out of user search data.
//
// Translates the SearchData into a native Xapian query, (re)creates the
// Enquire object for it, resets the per-query state and stores a textual
// description of the query back into sdata.
//
// @param sdata    user search data; its description is set on success
// @param opts     query option bits; stemlang is only used if QO_STEM is set
// @param stemlang stemming language handed to the query translation
// @return true on success. On failure false is returned and m_reason
//         receives the explanation (from the translation or from Xapian).
bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
                  const string& stemlang)
{
    if (!m_ndb) {
        LOGERR(("Db::setQuery: no db!\n"));
        return false;
    }
    m_reason.erase();
    LOGDEB(("Db::setQuery:\n"));

    // Install the subtree filter if the search data has a top directory.
    // The preprocessor switch selects which of the two filter members
    // receives it.
    m_filterTopDir = sdata->getTopdir();
    deleteZ(m_ndb->decider);
    deleteZ(m_ndb->postfilter);
    if (!m_filterTopDir.empty()) {
#if XAPIAN_FILTERING
        m_ndb->decider =
#else
        m_ndb->postfilter =
#endif
            new FilterMatcher(m_filterTopDir);
    }

    // Forget the state left over from the previous query
    m_dbindices.clear();
    m_qOpts = opts;
    m_ndb->m_termfreqs.clear();
    FilterMatcher::m_cnt = 0;

    Xapian::Query xq;
    if (!sdata->toNativeQuery(*this, &xq,
                              (opts & Db::QO_STEM) ? stemlang : "")) {
        m_reason += sdata->getReason();
        return false;
    }
    m_ndb->query = xq;

    string ermsg;
    string d;
    try {
        // Zero the pointer while deleting: if the allocation below throws,
        // the Native destructor must not delete the old object again.
        deleteZ(m_ndb->enquire);
        m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
        m_ndb->enquire->set_query(m_ndb->query);
        m_ndb->mset = Xapian::MSet();
        // Get the query description and trim the "Xapian::Query"
        d = m_ndb->query.get_description();
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
        LOGERR(("Db::SetQuery: xapian error %s\n", ermsg.c_str()));
        // Callers display the reason string: do not leave it empty
        m_reason += ermsg;
        return false;
    }

    if (d.find("Xapian::Query") == 0)
        d.erase(0, strlen("Xapian::Query"));
    if (!m_filterTopDir.empty()) {
        d += string(" [dir: ") + m_filterTopDir + "]";
    }
    sdata->setDescription(d);
    LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
    return true;
}
class TermMatchCmpByWcf {
public:
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
@ -1735,195 +1546,15 @@ bool Db::stemDiffers(const string& lang, const string& word,
return true;
}
// List the terms of the current native query.
//
// @param terms output list; it is emptied first, then receives one entry
//              per term of the stored Xapian query
// @return false if there is no native db object or if Xapian raised an
//         error while walking the terms, else true
bool Db::getQueryTerms(list<string>& terms)
{
    if (m_ndb == 0)
        return false;

    terms.clear();

    string xerror;
    try {
        Xapian::TermIterator cur = m_ndb->query.get_terms_begin();
        while (cur != m_ndb->query.get_terms_end()) {
            terms.push_back(*cur);
            cur++;
        }
    } XCATCHERROR(xerror);

    if (xerror.empty())
        return true;

    LOGERR(("getQueryTerms: xapian error: %s\n", xerror.c_str()));
    return false;
}
bool Db::getMatchTerms(const Doc& doc, list<string>& terms)
{
if (!m_ndb || !m_ndb->enquire) {
LOGERR(("Db::getMatchTerms: no query opened\n"));
return -1;
}
terms.clear();
Xapian::TermIterator it;
Xapian::docid id = Xapian::docid(doc.xdocid);
string ermsg;
try {
for (it=m_ndb->enquire->get_matching_terms_begin(id);
it != m_ndb->enquire->get_matching_terms_end(id); it++) {
terms.push_back(*it);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
return false;
}
return true;
}
// Mset size
static const int qquantum = 30;
int Db::getResCnt()
{
if (!m_ndb || !m_ndb->enquire) {
LOGERR(("Db::getResCnt: no query opened\n"));
return -1;
}
string ermsg;
if (m_ndb->mset.size() <= 0) {
try {
m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
0, m_ndb->decider);
} catch (const Xapian::DatabaseModifiedError &error) {
m_ndb->db.reopen();
m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
0, m_ndb->decider);
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str()));
return -1;
}
}
int ret = -1;
try {
ret = m_ndb->mset.get_matches_lower_bound();
} catch (...) {}
return ret;
}
// Get document at rank exti in the current query (exti is the index in
// the whole result set, as in the enquire class). We check if the current
// mset has the doc, else ask for another one. Msets are fetched qquantum
// documents at a time. Don't know if the whole thing makes sense at all
// but it seems to work.
//
// If there is a postquery filter (ie: file names), we have to
// maintain a correspondence from the sequential external index
// sequence to the internal Xapian hole-y one (the holes being the
// documents that don't match the filter).
//
// @param exti    sequential (external) rank of the wanted document
// @param doc     receives the document fields, set by dbDataToRclDoc()
// @param percent if not null, receives the value computed by
//                mset.convert_to_percent() for the entry
// @return false if no query is open or if Xapian returns an empty mset,
//         else the return value of dbDataToRclDoc().
// NOTE: a Xapian::Error raised by the first get_mset() attempt, other than
// DatabaseModifiedError (which triggers one reopen and retry), ends in
// abort().
bool Db::getDoc(int exti, Doc &doc, int *percent)
{
LOGDEB1(("Db::getDoc: exti %d\n", exti));
if (!m_ndb || !m_ndb->enquire) {
LOGERR(("Db::getDoc: no query opened\n"));
return false;
}
// Index of the wanted entry in Xapian's own (unfiltered) numbering
int xapi;
if (m_ndb->postfilter) {
// There is a postquery filter, does this fall in already known area ?
if (exti >= (int)m_dbindices.size()) {
// Have to fetch xapian docs and filter until we get
// enough or fail
m_dbindices.reserve(exti+1);
// First xapian doc we fetch is the one after last stored
int first = m_dbindices.size() > 0 ? m_dbindices.back() + 1 : 0;
// Loop until we get enough docs
while (exti >= (int)m_dbindices.size()) {
LOGDEB(("Db::getDoc: fetching %d starting at %d\n",
qquantum, first));
try {
m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum);
} catch (const Xapian::DatabaseModifiedError &error) {
// Index changed under us: reopen and retry once
m_ndb->db.reopen();
m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum);
} catch (const Xapian::Error & error) {
LOGERR(("enquire->get_mset: exception: %s\n",
error.get_msg().c_str()));
abort();
}
if (m_ndb->mset.empty()) {
LOGDEB(("Db::getDoc: got empty mset\n"));
return false;
}
first = m_ndb->mset.get_firstitem();
// Record the xapian index of every entry accepted by the filter
for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) {
LOGDEB(("Db::getDoc: [%d]\n", i));
Xapian::Document xdoc = m_ndb->mset[i].get_document();
if ((*m_ndb->postfilter)(xdoc)) {
m_dbindices.push_back(first + i);
}
}
// Next window starts right after the one just processed
first = first + m_ndb->mset.size();
}
}
xapi = m_dbindices[exti];
} else {
// No post-filtering: external and xapian indices are the same
xapi = exti;
}
// From here on, we work with a xapian enquire item number. Fetch it
int first = m_ndb->mset.get_firstitem();
int last = first + m_ndb->mset.size() -1;
// Fetch a new window if the wanted entry is outside the current one
if (!(xapi >= first && xapi <= last)) {
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
try {
m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
0, m_ndb->decider);
} catch (const Xapian::DatabaseModifiedError &error) {
// Index changed under us: reopen and retry once
m_ndb->db.reopen();
m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
0, m_ndb->decider);
} catch (const Xapian::Error & error) {
LOGERR(("enquire->get_mset: exception: %s\n",
error.get_msg().c_str()));
abort();
}
if (m_ndb->mset.empty())
return false;
first = m_ndb->mset.get_firstitem();
last = first + m_ndb->mset.size() -1;
}
LOGDEB1(("Db::getDoc: Qry [%s] win [%d-%d] Estimated results: %d",
m_ndb->query.get_description().c_str(),
first, last,
m_ndb->mset.get_matches_lower_bound()));
// The mset is indexed relative to the start of the current window
Xapian::Document xdoc = m_ndb->mset[xapi-first].get_document();
Xapian::docid docid = *(m_ndb->mset[xapi-first]);
if (percent)
*percent = m_ndb->mset.convert_to_percent(m_ndb->mset[xapi-first]);
// Parse xapian document's data and populate doc fields
string data = xdoc.get_data();
return m_ndb->dbDataToRclDoc(docid, data, doc);
}
bool Db::makeDocAbstract(Doc &doc, string& abstract)
bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
{
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
if (!m_ndb || !m_ndb->enquire) {
LOGERR(("Db::makeDocAbstract: no query opened\n"));
if (!m_ndb) {
LOGERR(("Db::makeDocAbstract: no db\n"));
return false;
}
list<string> terms;
getQueryTerms(terms);
abstract = m_ndb->makeAbstract(doc.xdocid, terms);
abstract = m_ndb->makeAbstract(doc.xdocid, query);
return true;
}
@ -1969,45 +1600,6 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
return false;
}
// Query expansion: ask Xapian for terms related to a result document.
//
// The document is used as a one-element relevance set for the current
// query. At most 20 candidate terms are requested, the original query
// terms are not excluded, and at most 10 terms are returned. Terms which
// are empty or begin with an upper-case ASCII letter (the special terms)
// are skipped.
//
// If the index was modified under us, the database is reopened and the
// operation retried once, as done elsewhere in this file.
//
// @param doc result document; its xdocid selects the Xapian document
// @return the expansion terms; empty if no query is open or on error
list<string> Db::expand(const Doc &doc)
{
    list<string> res;
    if (!m_ndb || !m_ndb->enquire) {
        LOGERR(("Db::expand: no query opened\n"));
        return res;
    }

    string ermsg;
    for (int tries = 0; tries < 2; tries++) {
        // Start each attempt from a clean state so that a retry can not
        // return terms collected by the failed attempt.
        res.clear();
        ermsg.erase();
        try {
            if (tries > 0) {
                // Previous attempt hit DatabaseModifiedError: refresh our
                // view of the index before retrying.
                m_ndb->db.reopen();
            }
            Xapian::RSet rset;
            rset.add_document(Xapian::docid(doc.xdocid));
            // We don't exclude the original query terms.
            Xapian::ESet eset = m_ndb->enquire->get_eset(20, rset, false);
            LOGDEB(("ESet terms:\n"));
            // We filter out the special terms
            for (Xapian::ESetIterator it = eset.begin();
                 it != eset.end(); it++) {
                LOGDEB((" [%s]\n", (*it).c_str()));
                if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
                    continue;
                res.push_back(*it);
                if (res.size() >= 10)
                    break;
            }
        } catch (const Xapian::DatabaseModifiedError &error) {
            // Remember the message in case this was the last attempt
            ermsg = error.get_msg();
            if (ermsg.empty())
                ermsg = "Empty error message";
            continue;
        } XCATCHERROR(ermsg);
        break;
    }

    if (!ermsg.empty()) {
        LOGERR(("Db::expand: xapian error %s\n", ermsg.c_str()));
        res.clear();
    }
    return res;
}
#ifndef NO_NAMESPACES
}
#endif

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.54 2007-07-10 09:23:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.55 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -52,8 +52,8 @@ namespace Rcl {
#endif
class SearchData;
class Native;
class TermIter;
class Query;
class TermMatchEntry {
public:
@ -71,17 +71,17 @@ public:
*/
class Db {
public:
// A place for things we don't want visible here.
class Native;
friend class Native;
/* General stuff (valid for query or update) ****************************/
Db();
~Db();
enum OpenMode {DbRO, DbUpd, DbTrunc};
// KEEP_UPDATED is internal use by reOpen() only
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8};
bool open(const string &dbdir, const string &stoplistfn,
OpenMode mode, int qops = QO_NONE);
OpenMode mode, bool keep_updated = false);
bool close();
bool isopen();
@ -130,11 +130,12 @@ class Db {
/** Return total docs in db */
int docCnt();
// Parse query string and initialize query
bool setQuery(RefCntr<SearchData> q, int opts = QO_NONE,
const string& stemlang = "english");
bool getQueryTerms(list<string>& terms);
bool getMatchTerms(const Doc& doc, list<string>& terms);
/** Add extra database for querying */
bool addQueryDb(const string &dir);
/** Remove extra database. if dir == "", remove all. */
bool rmQueryDb(const string &dir);
/** Tell if directory seems to hold xapian db */
static bool testDbDir(const string &dir);
/** Return a list of index terms that match the input string
* Expansion is performed with either wildcard or regexp processing
@ -143,33 +144,12 @@ class Db {
bool termMatch(MatchType typ, const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1);
/** Add extra database for querying */
bool addQueryDb(const string &dir);
/** Remove extra database. if dir == "", remove all. */
bool rmQueryDb(const string &dir);
/** Tell if directory seems to hold xapian db */
static bool testDbDir(const string &dir);
/** Get document at rank i in current query.
This is probably vastly inferior to the type of interface in
Xapian, but we have to start with something simple to
experiment with the GUI. i is sequential from 0 to some value.
*/
bool getDoc(int i, Doc &doc, int *percent = 0);
/* Build synthetic abstract out of query terms and term position data */
bool makeDocAbstract(Doc &doc, string& abstract);
bool makeDocAbstract(Doc &doc, Query *query, string& abstract);
/** Get document for given filename and ipath */
bool getDoc(const string &fn, const string &ipath, Doc &doc, int *percent);
/** Expand query */
list<string> expand(const Doc &doc);
/** Get results count for current query */
int getResCnt();
/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();
@ -189,22 +169,16 @@ class Db {
/** Filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names);
/** This has to be public for access by embedded Query::Native */
Native *m_ndb;
private:
// Internal form of close, can be called during destruction
bool i_close(bool final);
string m_filterTopDir; // Current query filter on subtree top directory
vector<int> m_dbindices; // In case there is a postq filter: sequence of
// db indices that match
string m_reason; // Error explanation
// A place for things we don't want visible here.
friend class Native;
Native *m_ndb;
unsigned int m_qOpts;
/* Parameters cached out of the configuration files */
// This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying
@ -215,7 +189,6 @@ private:
// This is how many words (context size) we keep around query terms
// when building the abstract
int m_synthAbsWordCtxLen;
// Flush threshold. Megabytes of text indexed before we flush.
int m_flushMb;
// Text bytes indexed since beginning
@ -224,7 +197,6 @@ private:
long long m_flushtxtsz;
// Text bytes at last fsoccup check
long long m_occtxtsz;
// Maximum file system occupation percentage
int m_maxFsOccupPc;

67
src/rcldb/rcldb_p.h Normal file
View File

@ -0,0 +1,67 @@
#ifndef _rcldb_p_h_included_
#define _rcldb_p_h_included_
#include "xapian.h"
namespace Rcl {
/* @(#$Id: rcldb_p.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes */
// Generic Xapian exception catching code. This supplies the handler list
// of a try block and is used as: try { ... } XCATCHERROR(ermsg);
// MSG must be an assignable string object. It is set to a non-empty error
// description when an exception is caught, and is left untouched when
// nothing is thrown, so callers test !MSG.empty() after the block.
// Handlers are tried in order: Xapian::Error, string, const char *, then
// anything else. This is a macro because catch clauses cannot be supplied
// by a function.
#define XCATCHERROR(MSG) \
catch (const Xapian::Error &e) { \
MSG = e.get_msg(); \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (const string &s) { \
MSG = s; \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (const char *s) { \
MSG = s; \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (...) { \
MSG = "Caught unknown xapian exception"; \
}
class Query;
// A class for data and methods that would have to expose
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
// 2 different ones for indexing or query as there is not much in
// common.
class Db::Native {
public:
// The Rcl::Db this object was created for. Not owned: the destructor
// does not delete it.
Db *m_db;
// State flags, both false after construction
bool m_isopen;
bool m_iswritable;
// Indexing
Xapian::WritableDatabase wdb;
// Querying
Xapian::Database db;
// Record the Db pointer and clear the state flags. The Xapian database
// members are default-constructed here.
Native(Db *db)
: m_db(db), m_isopen(false), m_iswritable(false)
{ }
// Nothing to release explicitly
~Native() {
}
// Build an abstract string for document id, for the given query.
// NOTE(review): implemented outside this header; presumably the backend
// of Db::makeDocAbstract() -- confirm.
string makeAbstract(Xapian::docid id, Query *query);
// Populate doc from the data string stored for docid.
// NOTE(review): implemented outside this header; the format of data is
// not visible here.
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
/** Compute list of subdocuments for a given path (given by hash)
* We look for all Q terms beginning with the path/hash
* As suggested by James Aylett, a better method would be to add
* a single term (ie: XP/path/to/file) to all subdocs, then finding
* them would be a simple matter of retrieving the posting list for the
* term. There would still be a need for the current Qterm though, as a
* unique term for replace_document, and for retrieving by
* path/ipath (history)
*/
bool subDocs(const string &hash, vector<Xapian::docid>& docids);
};
}
#endif /* _rcldb_p_h_included_ */

354
src/rcldb/rclquery.cpp Normal file
View File

@ -0,0 +1,354 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2008 J.F.Dockes";
#endif
#include <list>
#include <vector>
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "debuglog.h"
#include "conftree.h"
#include "smallut.h"
#include "searchdata.h"
#ifndef NO_NAMESPACES
namespace Rcl {
#endif
/**
 * Match decider which only accepts documents located under a given top
 * directory. It is used either directly by Xapian during the match or
 * applied by Recoll to the results, see Query::setQuery().
 */
class FilterMatcher : public Xapian::MatchDecider {
public:
    /** @param topdir path that the document url must begin with, once the
     *         leading url scheme part has been skipped. */
    FilterMatcher(const string &topdir)
        : m_topdir(topdir)
    {}
    virtual ~FilterMatcher() {}

    /**
     * Tell if the document should be kept.
     * @param xdoc Xapian document, its data record holds the "url" field.
     * @return true if the url path begins with the top directory.
     */
    virtual
#if XAPIAN_MAJOR_VERSION < 1
    int
#else
    bool
#endif
    operator()(const Xapian::Document &xdoc) const
    {
        // Number of characters skipped at the beginning of the url before
        // comparing with the top directory (presumably the length of a
        // "file://" prefix -- the url format is not defined in this file).
        const string::size_type pathoffs = 7;

        // Parse xapian document's data and extract the url
        string data = xdoc.get_data();
        ConfSimple parms(&data);

        // The only filtering for now is on file path (subtree)
        string url;
        parms.get(string("url"), url);
        LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
                 m_topdir.c_str(), url.c_str()));

        // Anchored comparison: unlike find(), this does not go on scanning
        // the rest of the url when the prefix does not match. The size
        // check is needed because compare() throws if the offset is beyond
        // the end of the string.
        if (url.size() < pathoffs)
            return false;
        return url.compare(pathoffs, m_topdir.size(), m_topdir) == 0;
    }

private:
    string m_topdir;
};
// Create a query object bound to db. The Xapian specific state lives in
// the Native object, which is owned by this object and released by the
// destructor. No query is set yet: setQuery() must be called before
// results can be fetched.
Query::Query(Db *db)
    : m_nq(new Native(this)), m_db(db), m_qOpts(QO_NONE)
{
    // m_qOpts is initialised so that it holds a defined value even before
    // the first setQuery() call.
}
// Release the Native object allocated by the constructor.
Query::~Query()
{
deleteZ(m_nq);
}
// Return a copy of the stored error explanation (m_reason).
string Query::getReason() const
{
return m_reason;
}
// Return the database pointer which was passed to the constructor.
Db *Query::whatDb()
{
return m_db;
}
//#define ISNULL(X) (X).isNull()
#define ISNULL(X) !(X)
// Prepare query out of user search data.
//
// sdata is translated to a Xapian query, a new Xapian::Enquire is created
// for it, and the textual query description is stored back into sdata.
// opts is a mask of QueryOpts values: stemlang is only used if QO_STEM is
// set. Any previous result set and filter are discarded.
//
// Returns false on error, in which case getReason() returns an
// explanation and no query is open (the result fetching methods will
// fail until a new successful call).
bool Query::setQuery(RefCntr<SearchData> sdata, int opts,
                     const string& stemlang)
{
    if (!m_db || ISNULL(m_nq)) {
        LOGERR(("Query::setQuery: not initialised!\n"));
        return false;
    }
    m_reason.erase();
    LOGDEB(("Query::setQuery:\n"));

    // Set up the directory filter. Depending on XAPIAN_FILTERING, it is
    // either handed to Xapian or applied by us to the results (see getDoc)
    m_filterTopDir = sdata->getTopdir();
    deleteZ(m_nq->decider);
    deleteZ(m_nq->postfilter);
    if (!m_filterTopDir.empty()) {
#if XAPIAN_FILTERING
        m_nq->decider =
#else
        m_nq->postfilter =
#endif
            new FilterMatcher(m_filterTopDir);
    }
    m_nq->m_dbindices.clear();
    m_qOpts = opts;
    m_nq->termfreqs.clear();

    Xapian::Query xq;
    if (!sdata->toNativeQuery(*m_db, &xq,
                              (opts & QO_STEM) ? stemlang : "")) {
        m_reason += sdata->getReason();
        return false;
    }
    m_nq->query = xq;

    string ermsg;
    string d;
    try {
        // The pointer is zeroed before allocating: if the constructor
        // throws, the Native destructor must not delete the old object
        // again.
        delete m_nq->enquire;
        m_nq->enquire = 0;
        m_nq->enquire = new Xapian::Enquire(m_db->m_ndb->db);
        m_nq->enquire->set_query(m_nq->query);
        m_nq->mset = Xapian::MSet();
        // Get the query description and trim the "Xapian::Query"
        d = m_nq->query.get_description();
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
        LOGERR(("Query::SetQuery: xapian error %s\n", ermsg.c_str()));
        m_reason = ermsg;
        // Don't leave a half initialised enquire around: the other methods
        // use a non-null enquire as meaning that a query is open.
        delete m_nq->enquire;
        m_nq->enquire = 0;
        return false;
    }

    if (d.find("Xapian::Query") == 0)
        d.erase(0, strlen("Xapian::Query"));
    if (!m_filterTopDir.empty()) {
        d += string(" [dir: ") + m_filterTopDir + "]";
    }
    sdata->setDescription(d);
    LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
    return true;
}
// Store the terms of the current Xapian query into terms (which is
// emptied first). Returns false if the object is not initialised or if
// Xapian reports an error while walking the term list.
bool Query::getQueryTerms(list<string>& terms)
{
    if (ISNULL(m_nq))
        return false;

    terms.clear();

    string ermsg;
    try {
        Xapian::TermIterator cur = m_nq->query.get_terms_begin();
        while (cur != m_nq->query.get_terms_end()) {
            terms.push_back(*cur);
            cur++;
        }
    } XCATCHERROR(ermsg);

    if (ermsg.empty())
        return true;

    LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
    return false;
}
bool Query::getMatchTerms(const Doc& doc, list<string>& terms)
{
if (ISNULL(m_nq) || !m_nq->enquire) {
LOGERR(("Query::getMatchTerms: no query opened\n"));
return -1;
}
terms.clear();
Xapian::TermIterator it;
Xapian::docid id = Xapian::docid(doc.xdocid);
string ermsg;
try {
for (it=m_nq->enquire->get_matching_terms_begin(id);
it != m_nq->enquire->get_matching_terms_end(id); it++) {
terms.push_back(*it);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
return false;
}
return true;
}
// Mset size
static const int qquantum = 30;
int Query::getResCnt()
{
if (ISNULL(m_nq) || !m_nq->enquire) {
LOGERR(("Query::getResCnt: no query opened\n"));
return -1;
}
string ermsg;
if (m_nq->mset.size() <= 0) {
try {
m_nq->mset = m_nq->enquire->get_mset(0, qquantum,
0, m_nq->decider);
} catch (const Xapian::DatabaseModifiedError &error) {
m_db->m_ndb->db.reopen();
m_nq->mset = m_nq->enquire->get_mset(0, qquantum,
0, m_nq->decider);
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str()));
return -1;
}
}
int ret = -1;
try {
ret = m_nq->mset.get_matches_lower_bound();
} catch (...) {}
return ret;
}
// Get document at rank i in query (i is the index in the whole result
// set, as in the enquire class. We check if the current mset has the
// doc, else ask for an other one. We use msets of 10 documents. Don't
// know if the whole thing makes sense at all but it seems to work.
//
// If there is a postquery filter (ie: file names), we have to
// maintain a correspondance from the sequential external index
// sequence to the internal Xapian hole-y one (the holes being the documents
// that dont match the filter).
bool Query::getDoc(int exti, Doc &doc, int *percent)
{
LOGDEB1(("Query::getDoc: exti %d\n", exti));
if (ISNULL(m_nq) || !m_nq->enquire) {
LOGERR(("Query::getDoc: no query opened\n"));
return false;
}
int xapi;
if (m_nq->postfilter) {
// There is a postquery filter, does this fall in already known area ?
if (exti >= (int)m_nq->m_dbindices.size()) {
// Have to fetch xapian docs and filter until we get
// enough or fail
m_nq->m_dbindices.reserve(exti+1);
// First xapian doc we fetch is the one after last stored
int first = m_nq->m_dbindices.size() > 0 ?
m_nq->m_dbindices.back() + 1 : 0;
// Loop until we get enough docs
while (exti >= (int)m_nq->m_dbindices.size()) {
LOGDEB(("Query::getDoc: fetching %d starting at %d\n",
qquantum, first));
try {
m_nq->mset = m_nq->enquire->get_mset(first, qquantum);
} catch (const Xapian::DatabaseModifiedError &error) {
m_db->m_ndb->db.reopen();
m_nq->mset = m_nq->enquire->get_mset(first, qquantum);
} catch (const Xapian::Error & error) {
LOGERR(("enquire->get_mset: exception: %s\n",
error.get_msg().c_str()));
abort();
}
if (m_nq->mset.empty()) {
LOGDEB(("Query::getDoc: got empty mset\n"));
return false;
}
first = m_nq->mset.get_firstitem();
for (unsigned int i = 0; i < m_nq->mset.size() ; i++) {
LOGDEB(("Query::getDoc: [%d]\n", i));
Xapian::Document xdoc = m_nq->mset[i].get_document();
if ((*m_nq->postfilter)(xdoc)) {
m_nq->m_dbindices.push_back(first + i);
}
}
first = first + m_nq->mset.size();
}
}
xapi = m_nq->m_dbindices[exti];
} else {
xapi = exti;
}
// From there on, we work with a xapian enquire item number. Fetch it
int first = m_nq->mset.get_firstitem();
int last = first + m_nq->mset.size() -1;
if (!(xapi >= first && xapi <= last)) {
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
try {
m_nq->mset = m_nq->enquire->get_mset(xapi, qquantum,
0, m_nq->decider);
} catch (const Xapian::DatabaseModifiedError &error) {
m_db->m_ndb->db.reopen();
m_nq->mset = m_nq->enquire->get_mset(xapi, qquantum,
0, m_nq->decider);
} catch (const Xapian::Error & error) {
LOGERR(("enquire->get_mset: exception: %s\n",
error.get_msg().c_str()));
abort();
}
if (m_nq->mset.empty())
return false;
first = m_nq->mset.get_firstitem();
last = first + m_nq->mset.size() -1;
}
LOGDEB1(("Query::getDoc: Qry [%s] win [%d-%d] Estimated results: %d",
m_nq->query.get_description().c_str(),
first, last,
m_nq->mset.get_matches_lower_bound()));
Xapian::Document xdoc = m_nq->mset[xapi-first].get_document();
Xapian::docid docid = *(m_nq->mset[xapi-first]);
if (percent)
*percent = m_nq->mset.convert_to_percent(m_nq->mset[xapi-first]);
// Parse xapian document's data and populate doc fields
string data = xdoc.get_data();
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc);
}
// Compute expansion terms for the current query, using doc (identified by
// doc.xdocid) as the only relevant document.
// Returns at most 10 terms. Empty terms and terms beginning with an
// upper-case ASCII letter (the special terms) are left out.
// The list is empty if no query is open or if Xapian reports an error.
list<string> Query::expand(const Doc &doc)
{
    list<string> res;
    if (ISNULL(m_nq) || !m_nq->enquire) {
        LOGERR(("Query::expand: no query opened\n"));
        return res;
    }

    string ermsg;
    // If the index was modified under us, reopen the database before the
    // second try, as the other result fetching methods do: retrying on
    // the same database revision would just fail again.
    bool reopen = false;
    for (int tries = 0; tries < 2; tries++) {
        ermsg.erase();
        try {
            if (reopen)
                m_db->m_ndb->db.reopen();
            Xapian::RSet rset;
            rset.add_document(Xapian::docid(doc.xdocid));
            // We don't exclude the original query terms.
            Xapian::ESet eset = m_nq->enquire->get_eset(20, rset, false);
            LOGDEB(("ESet terms:\n"));
            // We filter out the special terms
            for (Xapian::ESetIterator it = eset.begin();
                 it != eset.end(); it++) {
                LOGDEB((" [%s]\n", (*it).c_str()));
                if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
                    continue;
                res.push_back(*it);
                if (res.size() >= 10)
                    break;
            }
        } catch (const Xapian::DatabaseModifiedError &error) {
            // Remember the message: if this was the last try the failure
            // is reported below instead of being silently dropped.
            ermsg = error.get_msg();
            if (ermsg.empty())
                ermsg = "Empty error message";
            res.clear();
            reopen = true;
            continue;
        } XCATCHERROR(ermsg);
        break;
    }

    if (!ermsg.empty()) {
        LOGERR(("Query::expand: xapian error %s\n", ermsg.c_str()));
        res.clear();
    }
    return res;
}
}

92
src/rcldb/rclquery.h Normal file
View File

@ -0,0 +1,92 @@
#ifndef _rclquery_h_included_
#define _rclquery_h_included_
/* @(#$Id: rclquery.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2008 J.F.Dockes */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <string>
#include <list>
#include <vector>
#ifndef NO_NAMESPACES
using std::string;
using std::list;
using std::vector;
#endif
#include "refcntr.h"
#ifndef NO_NAMESPACES
namespace Rcl {
#endif
class SearchData;
class Db;
class Doc;
/**
* An Rcl::Query is a question (SearchData) applied to a
* database. Handles access to the results. Somewhat equivalent to a
* cursor in an rdb.
*
* Typical use: construct with the database, call setQuery(), then fetch
* the results with getResCnt() and getDoc(). Objects of this class can't
* be copied.
*/
class Query {
public:
/** Option bits for the opts parameter of setQuery() */
enum QueryOpts {QO_NONE=0, QO_STEM = 1};
/** Create a query object for db. No query is set at this point */
Query(Db *db);
~Query();
/** Get explanation about last error */
string getReason() const;
/** Translate the search data and initialize the query.
* stemlang is only used if QO_STEM is set in opts.
* @return false on error. */
bool setQuery(RefCntr<SearchData> q, int opts = QO_NONE,
const string& stemlang = "english");
/** Get the list of terms of the current query. @return false on error */
bool getQueryTerms(list<string>& terms);
/** Get the list of query terms which match doc. @return false on error */
bool getMatchTerms(const Doc& doc, list<string>& terms);
/** Get document at rank i in current query. If percent is not null, it
* receives the relevance percentage. @return false if the document
* can't be fetched. */
bool getDoc(int i, Doc &doc, int *percent = 0);
/** Expand query: get a list of expansion terms computed from doc */
list<string> expand(const Doc &doc);
/** Get results count for current query, -1 on error */
int getResCnt();
/** Get the database pointer that was passed to the constructor */
Db *whatDb();
/** make this public for access from embedded Db::Native */
class Native;
Native *m_nq;
private:
string m_filterTopDir; // Current query filter on subtree top directory
string m_reason; // Error explanation
Db *m_db;
// Options mask from the last setQuery() call
unsigned int m_qOpts;
/* Copy constructor and assignment are private and forbidden */
Query(const Query &) {}
Query & operator=(const Query &) {return *this;};
};
#ifndef NO_NAMESPACES
}
#endif // NO_NAMESPACES
#endif /* _rclquery_h_included_ */

66
src/rcldb/rclquery_p.h Normal file
View File

@ -0,0 +1,66 @@
#ifndef _rclquery_p_h_included_
#define _rclquery_p_h_included_
/* @(#$Id: rclquery_p.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes */
#include <map>
#include <vector>
using std::map;
using std::vector;
#include <xapian.h>
#include "rclquery.h"
namespace Rcl {
/**
 * Xapian specific part of an Rcl::Query: query descriptor, enquire object
 * and current result window. The decider, postfilter and enquire objects
 * are owned by this object and deleted by the destructor.
 */
class Query::Native {
public:
    Xapian::Query query;     // query descriptor: terms and subqueries
                             // joined by operators (or/and etc...)
    vector<int> m_dbindices; // In case there is a postq filter: sequence of
                             // db indices that match

    // Filtering results on location. There are 2 possible approaches
    // for this:
    //   - Set a "MatchDecider" to be used by Xapian during the query
    //   - Filter the results out of Xapian (this also uses a
    //     Xapian::MatchDecider object, but applied to the results by Recoll.
    //
    // The result filtering approach was the first implemented.
    //
    // The efficiency of both methods depend on the searches, so the code
    // for both has been kept. A nice point for the Xapian approach is that
    // the result count estimate are correct (they are wrong with
    // the postfilter approach). It is also faster in some worst case
    // scenarios so this is now the default (but the post-filtering is
    // faster in many common cases).
    //
    // Which is used is decided in SetQuery(), by setting either of
    // the two following members. This in turn is controlled by a
    // preprocessor directive.
#define XAPIAN_FILTERING 1
    Xapian::MatchDecider *decider;    // Xapian does the filtering
    Xapian::MatchDecider *postfilter; // Result filtering done by Recoll

    Xapian::Enquire *enquire; // Open query descriptor.
    Xapian::MSet mset;        // Partial result set
    Query *m_q;               // The Rcl::Query we belong to (not owned)

    // Term frequencies for current query. See makeAbstract, setQuery
    map<string, double> termfreqs;

    Native(Query *q)
        : decider(0), postfilter(0), enquire(0), m_q(q)
    { }

    ~Native() {
        delete decider;
        delete postfilter;
        delete enquire;
    }

private:
    // Copy construction and assignment are forbidden (declared, not
    // defined): a copy would share the owned pointers and they would be
    // deleted twice.
    Native(const Native &);
    Native& operator=(const Native &);
};
}
#endif /* _rclquery_p_h_included_ */

View File

@ -16,7 +16,7 @@
*/
#ifndef _SEARCHDATA_H_INCLUDED_
#define _SEARCHDATA_H_INCLUDED_
/* @(#$Id: searchdata.h,v 1.13 2008-05-08 10:00:20 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: searchdata.h,v 1.14 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* Structures to hold data coming almost directly from the gui
@ -47,7 +47,7 @@ class SearchDataClause;
/**
Data structure representing a Recoll user query, for translation
into a Xapian query tree.
into a Xapian query tree. This could probably be better called a 'question'.
This is a list of search clauses combined through either OR or AND.

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.19 2008-05-27 06:18:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.20 2008-06-13 18:22:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -36,6 +36,7 @@ using std::list;
using std::stack;
#endif /* NO_NAMESPACES */
#include "autoconfig.h"
#include "pathut.h"
#include <sys/types.h>

View File

@ -42,6 +42,7 @@ public:
// Member access through the stored pointer. rep is returned unchecked.
X *operator->() {return rep;}
// Return the value pointed to by pcount, or 0 if pcount is null.
int getcnt() const {return pcount ? *pcount : 0;}
// Return the stored pointer for read-only access (may be null).
const X *getptr() const {return rep;}
// Tell if the stored pointer is null.
bool isNull() const {return rep == 0;}
};