diff --git a/src/kde/kioslave/recoll/kio_recoll.cpp b/src/kde/kioslave/recoll/kio_recoll.cpp index 6daaa9b8..b5a46045 100644 --- a/src/kde/kioslave/recoll/kio_recoll.cpp +++ b/src/kde/kioslave/recoll/kio_recoll.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.7 2007-11-09 15:46:17 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.8 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -109,18 +109,20 @@ void RecollProtocol::get(const KURL & url) RefCntr sdata(new Rcl::SearchData(Rcl::SCLT_OR)); sdata->addClause(new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, (const char *)u8)); - - if (!m_rcldb->setQuery(sdata, Rcl::Db::QO_STEM, "english")) { + Rcl::Query *query = new Rcl::Query(m_rcldb); + if (!query->setQuery(sdata, Rcl::Db::QO_STEM, "english")) { m_reason = "Internal Error: setQuery failed"; outputError(m_reason.c_str()); finished(); + delete query; return; } if (m_docsource) delete m_docsource; - m_docsource = new DocSequenceDb(m_rcldb, "Query results", sdata); + m_docsource = new DocSequenceDb(RefCntr(query), + "Query results", sdata); QByteArray output; QTextStream os(output, IO_WriteOnly ); diff --git a/src/lib/Makefile b/src/lib/Makefile index a39b4061..a2da2aab 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -8,8 +8,8 @@ LIBS = librcl.a all: $(LIBS) -OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o -DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp 
indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp +OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o +DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp 
mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp librcl.a : $(DEPS) $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o @@ -71,6 +71,8 @@ pathhash.o : ../rcldb/pathhash.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp rcldb.o : ../rcldb/rcldb.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp +rclquery.o : ../rcldb/rclquery.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp searchdata.o : ../rcldb/searchdata.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp stemdb.o : ../rcldb/stemdb.cpp @@ -194,6 +196,9 @@ pathhash.dep.stamp : ../rcldb/pathhash.cpp rcldb.dep.stamp : ../rcldb/rcldb.cpp $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep touch rcldb.dep.stamp +rclquery.dep.stamp : ../rcldb/rclquery.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep + touch rclquery.dep.stamp searchdata.dep.stamp : ../rcldb/searchdata.cpp $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/searchdata.cpp > searchdata.dep touch searchdata.dep.stamp @@ -275,6 +280,7 @@ include wasastringtoquery.dep include wasatorcl.dep include pathhash.dep include rcldb.dep +include rclquery.dep include searchdata.dep include stemdb.dep include stoplist.dep diff --git a/src/lib/mkMake b/src/lib/mkMake index fbfd3bc3..6c9e2af9 100755 --- a/src/lib/mkMake +++ b/src/lib/mkMake @@ -31,6 +31,7 @@ ${depth}/query/wasastringtoquery.cpp \ ${depth}/query/wasatorcl.cpp \ ${depth}/rcldb/pathhash.cpp \ ${depth}/rcldb/rcldb.cpp \ +${depth}/rcldb/rclquery.cpp \ ${depth}/rcldb/searchdata.cpp \ ${depth}/rcldb/stemdb.cpp \ ${depth}/rcldb/stoplist.cpp \ diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index acbbe3d9..6c35c0af 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.2 2008-05-27 10:45:59 dockes Exp $ (C) 2007 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.3 2008-06-13 18:22:46 
dockes Exp $ (C) 2007 J.F.Dockes"; #endif #include @@ -11,6 +11,7 @@ using namespace std; #include "rclinit.h" #include "rclconfig.h" #include "rcldb.h" +#include "rclquery.h" #include "pathut.h" #include "wasastringtoquery.h" #include "wasatorcl.h" @@ -31,7 +32,7 @@ recollq_question(PyObject *self, PyObject *args) string reason; string dbdir = config->getDbDir(); rcldb.open(dbdir, config->getStopfile(), - Rcl::Db::DbRO, Rcl::Db::QO_STEM); + Rcl::Db::DbRO); Rcl::SearchData *sd = wasaStringToRcl(qs, reason); if (!sd) { @@ -40,8 +41,9 @@ recollq_question(PyObject *self, PyObject *args) } RefCntr rq(sd); - rcldb.setQuery(rq, Rcl::Db::QO_STEM); - int cnt = rcldb.getResCnt(); + RefCntr query(new Rcl::Query(&rcldb)); + query->setQuery(rq, Rcl::Query::QO_STEM); + int cnt = query->getResCnt(); cout << "Recoll query: " << rq->getDescription() << endl; if (cnt <= limit) cout << cnt << " results" << endl; @@ -51,7 +53,7 @@ recollq_question(PyObject *self, PyObject *args) for (int i = 0; i < limit; i++) { int pc; Rcl::Doc doc; - if (!rcldb.getDoc(i, doc, &pc)) + if (!query->getDoc(i, doc, &pc)) break; char cpc[20]; sprintf(cpc, "%d", pc); diff --git a/src/python/recoll/setup.py b/src/python/recoll/setup.py index 0c04c6af..1d659887 100644 --- a/src/python/recoll/setup.py +++ b/src/python/recoll/setup.py @@ -3,8 +3,6 @@ from distutils.core import setup, Extension module1 = Extension('recollq', define_macros = [('MAJOR_VERSION', '1'), ('MINOR_VERSION', '0'), - ('HAVE_MKDTEMP', '1'), - ('HAVE_VASPRINTF', '1'), ('UNAC_VERSION', '"1.0.7"'), ('STATFS_INCLUDE', '"sys/mount.h"'), ('RECOLL_DATADIR', @@ -27,6 +25,7 @@ module1 = Extension('recollq', '../query/wasastringtoquery.cpp', '../query/wasatorcl.cpp', '../rcldb/rcldb.cpp', + '../rcldb/rclquery.cpp', '../rcldb/searchdata.cpp', '../rcldb/stemdb.cpp', '../rcldb/pathhash.cpp', diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp index 064fc125..bba98ead 100644 --- a/src/qtgui/main.cpp +++ b/src/qtgui/main.cpp @@ -1,5 +1,5 @@ #ifndef 
lint -static char rcsid[] = "@(#$Id: main.cpp,v 1.66 2008-02-19 08:02:20 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: main.cpp,v 1.67 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -100,9 +100,6 @@ bool maybeOpenDb(string &reason, bool force) return false; } - int qopts = Rcl::Db::QO_NONE; - if (prefs.queryStemLang.length() > 0) - qopts |= Rcl::Db::QO_STEM; if (force) rcldb->close(); rcldb->rmQueryDb(""); @@ -112,7 +109,7 @@ bool maybeOpenDb(string &reason, bool force) rcldb->addQueryDb(*it); } if (!rcldb->isopen() && !rcldb->open(dbdir, rclconfig->getStopfile(), - Rcl::Db::DbRO, qopts)) { + Rcl::Db::DbRO)) { reason = "Could not open database in " + dbdir + " wait for indexing to complete?"; return false; diff --git a/src/qtgui/rclmain_w.cpp b/src/qtgui/rclmain_w.cpp index bde214fd..bee1beb7 100644 --- a/src/qtgui/rclmain_w.cpp +++ b/src/qtgui/rclmain_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.48 2008-02-19 08:02:01 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.49 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -440,23 +440,25 @@ void RclMain::startSearch(RefCntr sdata) int qopts = 0; if (!prefs.queryStemLang.length() == 0) - qopts |= Rcl::Db::QO_STEM; + qopts |= Rcl::Query::QO_STEM; QApplication::setOverrideCursor(QCursor(Qt::WaitCursor)); string stemLang = (const char *)prefs.queryStemLang.ascii(); if (stemLang == "ALL") { rclconfig->getConfParam("indexstemminglanguages", stemLang); } + Rcl::Query *query = new Rcl::Query(rcldb); - if (!rcldb->setQuery(sdata, qopts, stemLang)) { + if (!query || !query->setQuery(sdata, qopts, stemLang)) { QMessageBox::warning(0, "Recoll", tr("Cant start query: ") + - QString::fromAscii(rcldb->getReason().c_str())); + 
QString::fromAscii(query->getReason().c_str())); QApplication::restoreOverrideCursor(); return; } curPreview = 0; DocSequenceDb *src = - new DocSequenceDb(rcldb, string(tr("Query results").utf8()), sdata); + new DocSequenceDb(RefCntr(query), + string(tr("Query results").utf8()), sdata); m_docSource = RefCntr(src); m_searchData = sdata; setDocSequence(); @@ -921,7 +923,8 @@ void RclMain::docExpand(int docnum) if (!resList->getDoc(docnum, doc)) return; list terms; - terms = rcldb->expand(doc); + if (!m_docSource.isNull()) + terms = m_docSource->expand(doc); if (terms.empty()) return; // Do we keep the original query. I think we'd better not. diff --git a/src/query/docseq.h b/src/query/docseq.h index 121fc7dd..0311293a 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -16,7 +16,7 @@ */ #ifndef _DOCSEQ_H_INCLUDED_ #define _DOCSEQ_H_INCLUDED_ -/* @(#$Id: docseq.h,v 1.12 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: docseq.h,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #include @@ -89,7 +89,7 @@ class DocSequence { vector& gslks) const { terms.clear(); groups.clear(); gslks.clear(); return true; } - + virtual list expand(Rcl::Doc &) {list e; return e;} private: string m_title; }; diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp index 1e45b95d..426667bc 100644 --- a/src/query/docseqdb.cpp +++ b/src/query/docseqdb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.4 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -23,28 +23,53 @@ static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp #include "docseqdb.h" #include "rcldb.h" +DocSequenceDb::DocSequenceDb(RefCntr q, const string &t, + RefCntr sdata) + : DocSequence(t), m_q(q), m_sdata(sdata), 
m_rescnt(-1) +{ +} + +DocSequenceDb::~DocSequenceDb() +{ +} + +bool DocSequenceDb::getTerms(vector& terms, + vector >& groups, + vector& gslks) const +{ + return m_sdata.getptr()->getTerms(terms, groups, gslks); +} + +string DocSequenceDb::getDescription() +{ + return m_sdata->getDescription(); +} + bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, int *percent, string *sh) { if (sh) sh->erase(); - return m_db ? m_db->getDoc(num, doc, percent) : false; + return m_q->getDoc(num, doc, percent); } int DocSequenceDb::getResCnt() { - if (!m_db) - return -1; if (m_rescnt < 0) { - m_rescnt= m_db->getResCnt(); + m_rescnt= m_q->getResCnt(); } return m_rescnt; } string DocSequenceDb::getAbstract(Rcl::Doc &doc) { - if (!m_db) + if (!m_q->whatDb()) return doc.meta["abstract"]; string abstract; - m_db->makeDocAbstract(doc, abstract); + m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), abstract); return abstract.empty() ? doc.meta["abstract"] : abstract; } +list DocSequenceDb::expand(Rcl::Doc &doc) +{ + return m_q->expand(doc); +} + diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h index 203713eb..a0c0be4e 100644 --- a/src/query/docseqdb.h +++ b/src/query/docseqdb.h @@ -16,33 +16,30 @@ */ #ifndef _DOCSEQDB_H_INCLUDED_ #define _DOCSEQDB_H_INCLUDED_ -/* @(#$Id: docseqdb.h,v 1.2 2007-01-19 15:22:50 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: docseqdb.h,v 1.3 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */ #include "docseq.h" #include "refcntr.h" #include "searchdata.h" +#include "rclquery.h" /** A DocSequence from a Db query (there should be one active for this to make sense) */ class DocSequenceDb : public DocSequence { public: - DocSequenceDb(Rcl::Db *d, const string &t, RefCntr sdata) - : DocSequence(t), m_db(d), m_sdata(sdata), m_rescnt(-1) - {} - virtual ~DocSequenceDb() {} + DocSequenceDb(RefCntr q, const string &t, + RefCntr sdata); + virtual ~DocSequenceDb(); virtual bool getDoc(int num, Rcl::Doc &doc, int *percent, string * = 0); virtual int 
getResCnt(); virtual bool getTerms(vector& terms, vector >& groups, - vector& gslks) const { - return m_sdata.getptr()->getTerms(terms, groups, gslks); - } - + vector& gslks) const; virtual string getAbstract(Rcl::Doc &doc); - virtual string getDescription() {return m_sdata->getDescription();} - + virtual string getDescription(); + virtual list expand(Rcl::Doc &doc); private: - Rcl::Db *m_db; + RefCntr m_q; RefCntr m_sdata; int m_rescnt; }; diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index d52bfb04..22ca6de3 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollq.cpp,v 1.12 2007-12-13 06:58:21 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollq.cpp,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -32,6 +32,7 @@ static char rcsid[] = "@(#$Id: recollq.cpp,v 1.12 2007-12-13 06:58:21 dockes Exp using namespace std; #include "rcldb.h" +#include "rclquery.h" #include "rclconfig.h" #include "pathut.h" #include "rclinit.h" @@ -132,8 +133,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) exit(1); } dbdir = rclconfig->getDbDir(); - rcldb.open(dbdir, rclconfig->getStopfile(), - Rcl::Db::DbRO, Rcl::Db::QO_STEM); + rcldb.open(dbdir, rclconfig->getStopfile(), Rcl::Db::DbRO); Rcl::SearchData *sd = 0; @@ -166,8 +166,9 @@ int recollq(RclConfig **cfp, int argc, char **argv) } RefCntr rq(sd); - rcldb.setQuery(rq, Rcl::Db::QO_STEM); - int cnt = rcldb.getResCnt(); + Rcl::Query query(&rcldb); + query.setQuery(rq, Rcl::Query::QO_STEM); + int cnt = query.getResCnt(); if (!(op_flags & OPT_b)) { cout << "Recoll query: " << rq->getDescription() << endl; if (cnt <= limit) @@ -180,7 +181,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) for (int i = 0; i < limit; i++) { int pc; Rcl::Doc doc; - if (!rcldb.getDoc(i, doc, &pc)) + if (!query.getDoc(i, doc, &pc)) break; if 
(op_flags & OPT_b) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 72ec324b..18c9790f 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.132 2008-05-20 10:09:54 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.133 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -36,6 +36,7 @@ using namespace std; #include "rclconfig.h" #include "rcldb.h" +#include "rcldb_p.h" #include "stemdb.h" #include "textsplit.h" #include "transcode.h" @@ -47,8 +48,9 @@ using namespace std; #include "pathhash.h" #include "utf8iter.h" #include "searchdata.h" +#include "rclquery.h" +#include "rclquery_p.h" -#include "xapian.h" #ifndef MAX #define MAX(A,B) (A>B?A:B) @@ -88,125 +90,8 @@ namespace Rcl { const static string rclSyntAbs = "?!#@"; const static string emptystring; -// A class for data and methods that would have to expose -// Xapian-specific stuff if they were in Rcl::Db. There could actually be -// 2 different ones for indexing or query as there is not much in -// common. -class Native { - public: - Db *m_db; - bool m_isopen; - bool m_iswritable; - - // Indexing - Xapian::WritableDatabase wdb; - - // Querying - Xapian::Database db; - Xapian::Query query; // query descriptor: terms and subqueries - // joined by operators (or/and etc...) - - // Filtering results on location. There are 2 possible approaches - // for this: - // - Set a "MatchDecider" to be used by Xapian during the query - // - Filter the results out of Xapian (this also uses a - // Xapian::MatchDecider object, but applied to the results by Recoll. - // - // The result filtering approach was the first implemented. - // - // The efficiency of both methods depend on the searches, so the code - // for both has been kept. 
A nice point for the Xapian approach is that - // the result count estimate are correct (they are wrong with - // the postfilter approach). It is also faster in some worst case scenarios - // so this now the default (but the post-filtering is faster in many common - // cases). - // - // Which is used is decided in SetQuery(), by setting either of - // the two following members. This in turn is controlled by a - // preprocessor directive. - -#define XAPIAN_FILTERING 1 - - Xapian::MatchDecider *decider; // Xapian does the filtering - Xapian::MatchDecider *postfilter; // Result filtering done by Recoll - - Xapian::Enquire *enquire; // Open query descriptor. - Xapian::MSet mset; // Partial result set - - // Term frequencies for current query. See makeAbstract, setQuery - map m_termfreqs; - - Native(Db *db) - : m_db(db), - m_isopen(false), m_iswritable(false), decider(0), postfilter(0), - enquire(0) - { } - - ~Native() { - delete decider; - delete postfilter; - delete enquire; - } - - string makeAbstract(Xapian::docid id, const list& terms); - - bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc); - - /** Compute list of subdocuments for a given path (given by hash) - * We look for all Q terms beginning with the path/hash - * As suggested by James Aylett, a better method would be to add - * a single term (ie: XP/path/to/file) to all subdocs, then finding - * them would be a simple matter of retrieving the posting list for the - * term. 
There would still be a need for the current Qterm though, as a - * unique term for replace_document, and for retrieving by - * path/ipath (history) - */ - bool subDocs(const string &hash, vector& docids); - -}; - -class FilterMatcher : public Xapian::MatchDecider { -public: - FilterMatcher(const string &topdir) - : m_topdir(topdir) - {} - virtual ~FilterMatcher() {} - - virtual -#if XAPIAN_MAJOR_VERSION < 1 - int -#else - bool -#endif - operator()(const Xapian::Document &xdoc) const - { - m_cnt++; - // Parse xapian document's data and populate doc fields - string data = xdoc.get_data(); - ConfSimple parms(&data); - - // The only filtering for now is on file path (subtree) - string url; - parms.get(string("url"), url); - LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n", - m_topdir.c_str(), url.c_str())); - if (url.find(m_topdir, 7) == 7) { - LOGDEB2(("FilterMatcher: MATCH %d\n", m_cnt)); - return true; - } else { - LOGDEB2(("FilterMatcher: NO MATCH %d\n", m_cnt)); - return false; - } - } - static int m_cnt; - -private: - string m_topdir; -}; -int FilterMatcher::m_cnt; - /* See comment in class declaration */ -bool Native::subDocs(const string &hash, vector& docids) +bool Db::Native::subDocs(const string &hash, vector& docids) { docids.clear(); string qterm = "Q"+ hash + "|"; @@ -250,7 +135,7 @@ bool Native::subDocs(const string &hash, vector& docids) } // Turn data record from db into document fields -bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc) +bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc) { LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str())); ConfSimple parms(&data); @@ -306,26 +191,29 @@ static list noPrefixList(const list& in) // Build a document abstract by extracting text chunks around the query terms // This uses the db termlists, not the original document. 
-string Native::makeAbstract(Xapian::docid docid, const list& iterms) +string Db::Native::makeAbstract(Xapian::docid docid, Query *query) { Chrono chron; LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen)); + list iterms; + query->getQueryTerms(iterms); + list terms = noPrefixList(iterms); if (terms.empty()) { return ""; } // Retrieve db-wide frequencies for the query terms - if (m_termfreqs.empty()) { + if (query->m_nq->termfreqs.empty()) { double doccnt = db.get_doccount(); if (doccnt == 0) doccnt = 1; for (list::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { - m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt; + query->m_nq->termfreqs[*qit] = db.get_termfreq(*qit) / doccnt; LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), - m_termfreqs[*qit])); + query->m_nq->termfreqs[*qit])); } LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms())); } @@ -343,7 +231,7 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) Xapian::TermIterator term = db.termlist_begin(docid); term.skip_to(*qit); if (term != db.termlist_end(docid) && *term == *qit) { - double q = (term.get_wdf() / doclen) * m_termfreqs[*qit]; + double q = (term.get_wdf() / doclen) * query->m_nq->termfreqs[*qit]; q = -log10(q); if (q < 3) { q = 0.05; @@ -556,7 +444,7 @@ string Native::makeAbstract(Xapian::docid docid, const list& iterms) /* Rcl::Db methods ///////////////////////////////// */ Db::Db() - : m_ndb(0), m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250), + : m_ndb(0), m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4), m_flushMb(-1), m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_maxFsOccupPc(0), m_mode(Db::DbRO) @@ -586,28 +474,9 @@ Db::~Db() return res; } -// Generic Xapian exception catching code. 
We do this quite often, -// and I have no idea how to do this except for a macro -#define XCATCHERROR(MSG) \ - catch (const Xapian::Error &e) { \ - MSG = e.get_msg(); \ - if (MSG.empty()) MSG = "Empty error message"; \ - } catch (const string &s) { \ - MSG = s; \ - if (MSG.empty()) MSG = "Empty error message"; \ - } catch (const char *s) { \ - MSG = s; \ - if (MSG.empty()) MSG = "Empty error message"; \ - } catch (...) { \ - MSG = "Caught unknown xapian exception"; \ - } - - -bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops) +bool Db::open(const string& dir, const string &stops, OpenMode mode, + bool keep_updated) { - bool keep_updated = (qops & QO_KEEP_UPDATED) != 0; - qops &= ~QO_KEEP_UPDATED; - if (m_ndb == 0) return false; LOGDEB(("Db::open: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen, @@ -724,7 +593,7 @@ bool Db::reOpen() if (m_ndb && m_ndb->m_isopen) { if (!close()) return false; - if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) { + if (!open(m_basedir, "", m_mode, true)) { return false; } } @@ -1467,64 +1336,6 @@ bool Db::filenameWildExp(const string& fnexp, list& names) return true; } -// Prepare query out of user search data -bool Db::setQuery(RefCntr sdata, int opts, - const string& stemlang) -{ - if (!m_ndb) { - LOGERR(("Db::setQuery: no db!\n")); - return false; - } - m_reason.erase(); - LOGDEB(("Db::setQuery:\n")); - - m_filterTopDir = sdata->getTopdir(); - deleteZ(m_ndb->decider); - deleteZ(m_ndb->postfilter); - if (!m_filterTopDir.empty()) { -#if XAPIAN_FILTERING - m_ndb->decider = -#else - m_ndb->postfilter = -#endif - new FilterMatcher(m_filterTopDir); - } - m_dbindices.clear(); - m_qOpts = opts; - m_ndb->m_termfreqs.clear(); - FilterMatcher::m_cnt = 0; - Xapian::Query xq; - if (!sdata->toNativeQuery(*this, &xq, - (opts & Db::QO_STEM) ? 
stemlang : "")) { - m_reason += sdata->getReason(); - return false; - } - m_ndb->query = xq; - string ermsg; - string d; - try { - delete m_ndb->enquire; - m_ndb->enquire = new Xapian::Enquire(m_ndb->db); - m_ndb->enquire->set_query(m_ndb->query); - m_ndb->mset = Xapian::MSet(); - // Get the query description and trim the "Xapian::Query" - d = m_ndb->query.get_description(); - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGDEB(("Db::SetQuery: xapian error %s\n", ermsg.c_str())); - return false; - } - - if (d.find("Xapian::Query") == 0) - d.erase(0, strlen("Xapian::Query")); - if (!m_filterTopDir.empty()) { - d += string(" [dir: ") + m_filterTopDir + "]"; - } - sdata->setDescription(d); - LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str())); - return true; -} - class TermMatchCmpByWcf { public: int operator()(const TermMatchEntry& l, const TermMatchEntry& r) { @@ -1735,195 +1546,15 @@ bool Db::stemDiffers(const string& lang, const string& word, return true; } -bool Db::getQueryTerms(list& terms) -{ - if (!m_ndb) - return false; - terms.clear(); - Xapian::TermIterator it; - string ermsg; - try { - for (it = m_ndb->query.get_terms_begin(); - it != m_ndb->query.get_terms_end(); it++) { - terms.push_back(*it); - } - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str())); - return false; - } - return true; -} - -bool Db::getMatchTerms(const Doc& doc, list& terms) -{ - if (!m_ndb || !m_ndb->enquire) { - LOGERR(("Db::getMatchTerms: no query opened\n")); - return -1; - } - - terms.clear(); - Xapian::TermIterator it; - Xapian::docid id = Xapian::docid(doc.xdocid); - string ermsg; - try { - for (it=m_ndb->enquire->get_matching_terms_begin(id); - it != m_ndb->enquire->get_matching_terms_end(id); it++) { - terms.push_back(*it); - } - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str())); - return false; - } - - return true; -} - -// Mset size -static 
const int qquantum = 30; - -int Db::getResCnt() -{ - if (!m_ndb || !m_ndb->enquire) { - LOGERR(("Db::getResCnt: no query opened\n")); - return -1; - } - string ermsg; - if (m_ndb->mset.size() <= 0) { - try { - m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum, - 0, m_ndb->decider); - } catch (const Xapian::DatabaseModifiedError &error) { - m_ndb->db.reopen(); - m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum, - 0, m_ndb->decider); - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str())); - return -1; - } - } - int ret = -1; - try { - ret = m_ndb->mset.get_matches_lower_bound(); - } catch (...) {} - return ret; -} - - -// Get document at rank i in query (i is the index in the whole result -// set, as in the enquire class. We check if the current mset has the -// doc, else ask for an other one. We use msets of 10 documents. Don't -// know if the whole thing makes sense at all but it seems to work. -// -// If there is a postquery filter (ie: file names), we have to -// maintain a correspondance from the sequential external index -// sequence to the internal Xapian hole-y one (the holes being the documents -// that dont match the filter). -bool Db::getDoc(int exti, Doc &doc, int *percent) -{ - LOGDEB1(("Db::getDoc: exti %d\n", exti)); - if (!m_ndb || !m_ndb->enquire) { - LOGERR(("Db::getDoc: no query opened\n")); - return false; - } - - int xapi; - if (m_ndb->postfilter) { - // There is a postquery filter, does this fall in already known area ? - if (exti >= (int)m_dbindices.size()) { - // Have to fetch xapian docs and filter until we get - // enough or fail - m_dbindices.reserve(exti+1); - // First xapian doc we fetch is the one after last stored - int first = m_dbindices.size() > 0 ? 
m_dbindices.back() + 1 : 0; - // Loop until we get enough docs - while (exti >= (int)m_dbindices.size()) { - LOGDEB(("Db::getDoc: fetching %d starting at %d\n", - qquantum, first)); - try { - m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum); - } catch (const Xapian::DatabaseModifiedError &error) { - m_ndb->db.reopen(); - m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum); - } catch (const Xapian::Error & error) { - LOGERR(("enquire->get_mset: exception: %s\n", - error.get_msg().c_str())); - abort(); - } - - if (m_ndb->mset.empty()) { - LOGDEB(("Db::getDoc: got empty mset\n")); - return false; - } - first = m_ndb->mset.get_firstitem(); - for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) { - LOGDEB(("Db::getDoc: [%d]\n", i)); - Xapian::Document xdoc = m_ndb->mset[i].get_document(); - if ((*m_ndb->postfilter)(xdoc)) { - m_dbindices.push_back(first + i); - } - } - first = first + m_ndb->mset.size(); - } - } - xapi = m_dbindices[exti]; - } else { - xapi = exti; - } - - // From there on, we work with a xapian enquire item number. 
Fetch it - int first = m_ndb->mset.get_firstitem(); - int last = first + m_ndb->mset.size() -1; - - if (!(xapi >= first && xapi <= last)) { - LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); - try { - m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum, - 0, m_ndb->decider); - } catch (const Xapian::DatabaseModifiedError &error) { - m_ndb->db.reopen(); - m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum, - 0, m_ndb->decider); - - } catch (const Xapian::Error & error) { - LOGERR(("enquire->get_mset: exception: %s\n", - error.get_msg().c_str())); - abort(); - } - if (m_ndb->mset.empty()) - return false; - first = m_ndb->mset.get_firstitem(); - last = first + m_ndb->mset.size() -1; - } - - LOGDEB1(("Db::getDoc: Qry [%s] win [%d-%d] Estimated results: %d", - m_ndb->query.get_description().c_str(), - first, last, - m_ndb->mset.get_matches_lower_bound())); - - Xapian::Document xdoc = m_ndb->mset[xapi-first].get_document(); - Xapian::docid docid = *(m_ndb->mset[xapi-first]); - if (percent) - *percent = m_ndb->mset.convert_to_percent(m_ndb->mset[xapi-first]); - - // Parse xapian document's data and populate doc fields - string data = xdoc.get_data(); - return m_ndb->dbDataToRclDoc(docid, data, doc); -} - -bool Db::makeDocAbstract(Doc &doc, string& abstract) +bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract) { LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti)); - if (!m_ndb || !m_ndb->enquire) { - LOGERR(("Db::makeDocAbstract: no query opened\n")); + if (!m_ndb) { + LOGERR(("Db::makeDocAbstract: no db\n")); return false; } - list terms; - getQueryTerms(terms); - abstract = m_ndb->makeAbstract(doc.xdocid, terms); + abstract = m_ndb->makeAbstract(doc.xdocid, query); return true; } @@ -1969,45 +1600,6 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc) return false; } -list Db::expand(const Doc &doc) -{ - list res; - if (!m_ndb || !m_ndb->enquire) { - LOGERR(("Db::expand: no query opened\n")); - return res; - } - 
string ermsg; - for (int tries = 0; tries < 2; tries++) { - try { - Xapian::RSet rset; - rset.add_document(Xapian::docid(doc.xdocid)); - // We don't exclude the original query terms. - Xapian::ESet eset = m_ndb->enquire->get_eset(20, rset, false); - LOGDEB(("ESet terms:\n")); - // We filter out the special terms - for (Xapian::ESetIterator it = eset.begin(); - it != eset.end(); it++) { - LOGDEB((" [%s]\n", (*it).c_str())); - if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z')) - continue; - res.push_back(*it); - if (res.size() >= 10) - break; - } - } catch (const Xapian::DatabaseModifiedError &error) { - continue; - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR(("Db::expand: xapian error %s\n", ermsg.c_str())); - res.clear(); - } - break; - } - - return res; -} - - #ifndef NO_NAMESPACES } #endif diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 2da51564..dc74fbfa 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.54 2007-07-10 09:23:28 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.55 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -52,8 +52,8 @@ namespace Rcl { #endif class SearchData; -class Native; class TermIter; +class Query; class TermMatchEntry { public: @@ -71,17 +71,17 @@ public: */ class Db { public: + // A place for things we don't want visible here. 
+ class Native; + friend class Native; /* General stuff (valid for query or update) ****************************/ Db(); ~Db(); enum OpenMode {DbRO, DbUpd, DbTrunc}; - // KEEP_UPDATED is internal use by reOpen() only - enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8}; - bool open(const string &dbdir, const string &stoplistfn, - OpenMode mode, int qops = QO_NONE); + OpenMode mode, bool keep_updated = false); bool close(); bool isopen(); @@ -130,11 +130,12 @@ class Db { /** Return total docs in db */ int docCnt(); - // Parse query string and initialize query - bool setQuery(RefCntr q, int opts = QO_NONE, - const string& stemlang = "english"); - bool getQueryTerms(list& terms); - bool getMatchTerms(const Doc& doc, list& terms); + /** Add extra database for querying */ + bool addQueryDb(const string &dir); + /** Remove extra database. if dir == "", remove all. */ + bool rmQueryDb(const string &dir); + /** Tell if directory seems to hold xapian db */ + static bool testDbDir(const string &dir); /** Return a list of index terms that match the input string * Expansion is performed either with either wildcard or regexp processing @@ -143,33 +144,12 @@ class Db { bool termMatch(MatchType typ, const string &lang, const string &s, list& result, int max = -1); - /** Add extra database for querying */ - bool addQueryDb(const string &dir); - /** Remove extra database. if dir == "", remove all. */ - bool rmQueryDb(const string &dir); - /** Tell if directory seems to hold xapian db */ - static bool testDbDir(const string &dir); - - /** Get document at rank i in current query. - - This is probably vastly inferior to the type of interface in - Xapian, but we have to start with something simple to - experiment with the GUI. i is sequential from 0 to some value. 
- */ - bool getDoc(int i, Doc &doc, int *percent = 0); - /* Build synthetic abstract out of query terms and term position data */ - bool makeDocAbstract(Doc &doc, string& abstract); + bool makeDocAbstract(Doc &doc, Query *query, string& abstract); /** Get document for given filename and ipath */ bool getDoc(const string &fn, const string &ipath, Doc &doc, int *percent); - /** Expand query */ - list expand(const Doc &doc); - - /** Get results count for current query */ - int getResCnt(); - /** Get a list of existing stemming databases */ std::list getStemLangs(); @@ -189,22 +169,16 @@ class Db { /** Filename wildcard expansion */ bool filenameWildExp(const string& exp, list& names); + /** This has to be public for access by embedded Query::Native */ + Native *m_ndb; + private: // Internal form of close, can be called during destruction bool i_close(bool final); - string m_filterTopDir; // Current query filter on subtree top directory - vector m_dbindices; // In case there is a postq filter: sequence of - // db indices that match - string m_reason; // Error explanation - // A place for things we don't want visible here. - friend class Native; - Native *m_ndb; - - unsigned int m_qOpts; - + /* Parameters cached out of the configuration files */ // This is how long an abstract we keep or build from beginning of // text when indexing. It only has an influence on the size of the // db as we are free to shorten it again when displaying @@ -215,7 +189,6 @@ private: // This is how many words (context size) we keep around query terms // when building the abstract int m_synthAbsWordCtxLen; - // Flush threshold. Megabytes of text indexed before we flush. 
int m_flushMb; // Text bytes indexed since beginning @@ -224,7 +197,6 @@ private: long long m_flushtxtsz; // Text bytes at last fsoccup check long long m_occtxtsz; - // Maximum file system occupation percentage int m_maxFsOccupPc; diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h new file mode 100644 index 00000000..1be6d022 --- /dev/null +++ b/src/rcldb/rcldb_p.h @@ -0,0 +1,67 @@ +#ifndef _rcldb_p_h_included_ +#define _rcldb_p_h_included_ + +#include "xapian.h" + +namespace Rcl { +/* @(#$Id: rcldb_p.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes */ + +// Generic Xapian exception catching code. We do this quite often, +// and I have no idea how to do this except for a macro +#define XCATCHERROR(MSG) \ + catch (const Xapian::Error &e) { \ + MSG = e.get_msg(); \ + if (MSG.empty()) MSG = "Empty error message"; \ + } catch (const string &s) { \ + MSG = s; \ + if (MSG.empty()) MSG = "Empty error message"; \ + } catch (const char *s) { \ + MSG = s; \ + if (MSG.empty()) MSG = "Empty error message"; \ + } catch (...) { \ + MSG = "Caught unknown xapian exception"; \ + } + +class Query; + +// A class for data and methods that would have to expose +// Xapian-specific stuff if they were in Rcl::Db. There could actually be +// 2 different ones for indexing or query as there is not much in +// common. 
+class Db::Native { + public: + Db *m_db; + bool m_isopen; + bool m_iswritable; + + // Indexing + Xapian::WritableDatabase wdb; + + // Querying + Xapian::Database db; + + Native(Db *db) + : m_db(db), m_isopen(false), m_iswritable(false) + { } + + ~Native() { + } + + string makeAbstract(Xapian::docid id, Query *query); + + bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc); + + /** Compute list of subdocuments for a given path (given by hash) + * We look for all Q terms beginning with the path/hash + * As suggested by James Aylett, a better method would be to add + * a single term (ie: XP/path/to/file) to all subdocs, then finding + * them would be a simple matter of retrieving the posting list for the + * term. There would still be a need for the current Qterm though, as a + * unique term for replace_document, and for retrieving by + * path/ipath (history) + */ + bool subDocs(const string &hash, vector& docids); + +}; +} +#endif /* _rcldb_p_h_included_ */ diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp new file mode 100644 index 00000000..4c5aee41 --- /dev/null +++ b/src/rcldb/rclquery.cpp @@ -0,0 +1,354 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2008 J.F.Dockes"; +#endif + +#include +#include + +#include "rcldb.h" +#include "rcldb_p.h" +#include "rclquery.h" +#include "rclquery_p.h" +#include "debuglog.h" +#include "conftree.h" +#include "smallut.h" +#include "searchdata.h" + +#ifndef NO_NAMESPACES +namespace Rcl { +#endif +class FilterMatcher : public Xapian::MatchDecider { +public: + FilterMatcher(const string &topdir) + : m_topdir(topdir) + {} + virtual ~FilterMatcher() {} + + virtual +#if XAPIAN_MAJOR_VERSION < 1 + int +#else + bool +#endif + operator()(const Xapian::Document &xdoc) const + { + // Parse xapian document's data and populate doc fields + string data = xdoc.get_data(); + ConfSimple parms(&data); + + // The only filtering for now is on file path (subtree) 
+ string url; + parms.get(string("url"), url); + LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n", + m_topdir.c_str(), url.c_str())); + if (url.find(m_topdir, 7) == 7) { + return true; + } else { + return false; + } + } + +private: + string m_topdir; +}; + +Query::Query(Db *db) + : m_nq(new Native(this)), m_db(db) +{ +} + +Query::~Query() +{ + deleteZ(m_nq); +} + +string Query::getReason() const +{ + return m_reason; +} + +Db *Query::whatDb() +{ + return m_db; +} + +//#define ISNULL(X) (X).isNull() +#define ISNULL(X) !(X) + +// Prepare query out of user search data +bool Query::setQuery(RefCntr sdata, int opts, + const string& stemlang) +{ + if (!m_db || ISNULL(m_nq)) { + LOGERR(("Query::setQuery: not initialised!\n")); + return false; + } + m_reason.erase(); + LOGDEB(("Query::setQuery:\n")); + + m_filterTopDir = sdata->getTopdir(); + deleteZ(m_nq->decider); + deleteZ(m_nq->postfilter); + if (!m_filterTopDir.empty()) { +#if XAPIAN_FILTERING + m_nq->decider = +#else + m_nq->postfilter = +#endif + new FilterMatcher(m_filterTopDir); + } + m_nq->m_dbindices.clear(); + m_qOpts = opts; + m_nq->termfreqs.clear(); + Xapian::Query xq; + if (!sdata->toNativeQuery(*m_db, &xq, + (opts & QO_STEM) ? 
stemlang : "")) { + m_reason += sdata->getReason(); + return false; + } + m_nq->query = xq; + string ermsg; + string d; + try { + delete m_nq->enquire; + m_nq->enquire = new Xapian::Enquire(m_db->m_ndb->db); + m_nq->enquire->set_query(m_nq->query); + m_nq->mset = Xapian::MSet(); + // Get the query description and trim the "Xapian::Query" + d = m_nq->query.get_description(); + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGDEB(("Query::SetQuery: xapian error %s\n", ermsg.c_str())); + return false; + } + + if (d.find("Xapian::Query") == 0) + d.erase(0, strlen("Xapian::Query")); + if (!m_filterTopDir.empty()) { + d += string(" [dir: ") + m_filterTopDir + "]"; + } + sdata->setDescription(d); + LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str())); + return true; +} + + +bool Query::getQueryTerms(list& terms) +{ + if (ISNULL(m_nq)) + return false; + + terms.clear(); + Xapian::TermIterator it; + string ermsg; + try { + for (it = m_nq->query.get_terms_begin(); + it != m_nq->query.get_terms_end(); it++) { + terms.push_back(*it); + } + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str())); + return false; + } + return true; +} + +bool Query::getMatchTerms(const Doc& doc, list& terms) +{ + if (ISNULL(m_nq) || !m_nq->enquire) { + LOGERR(("Query::getMatchTerms: no query opened\n")); + return -1; + } + + terms.clear(); + Xapian::TermIterator it; + Xapian::docid id = Xapian::docid(doc.xdocid); + string ermsg; + try { + for (it=m_nq->enquire->get_matching_terms_begin(id); + it != m_nq->enquire->get_matching_terms_end(id); it++) { + terms.push_back(*it); + } + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str())); + return false; + } + + return true; +} + +// Mset size +static const int qquantum = 30; + +int Query::getResCnt() +{ + if (ISNULL(m_nq) || !m_nq->enquire) { + LOGERR(("Query::getResCnt: no query opened\n")); + return -1; + } + string ermsg; + 
if (m_nq->mset.size() <= 0) { + try { + m_nq->mset = m_nq->enquire->get_mset(0, qquantum, + 0, m_nq->decider); + } catch (const Xapian::DatabaseModifiedError &error) { + m_db->m_ndb->db.reopen(); + m_nq->mset = m_nq->enquire->get_mset(0, qquantum, + 0, m_nq->decider); + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str())); + return -1; + } + } + int ret = -1; + try { + ret = m_nq->mset.get_matches_lower_bound(); + } catch (...) {} + return ret; +} + + +// Get document at rank i in query (i is the index in the whole result +// set, as in the enquire class. We check if the current mset has the +// doc, else ask for an other one. We use msets of 10 documents. Don't +// know if the whole thing makes sense at all but it seems to work. +// +// If there is a postquery filter (ie: file names), we have to +// maintain a correspondance from the sequential external index +// sequence to the internal Xapian hole-y one (the holes being the documents +// that dont match the filter). +bool Query::getDoc(int exti, Doc &doc, int *percent) +{ + LOGDEB1(("Query::getDoc: exti %d\n", exti)); + if (ISNULL(m_nq) || !m_nq->enquire) { + LOGERR(("Query::getDoc: no query opened\n")); + return false; + } + + int xapi; + if (m_nq->postfilter) { + // There is a postquery filter, does this fall in already known area ? + if (exti >= (int)m_nq->m_dbindices.size()) { + // Have to fetch xapian docs and filter until we get + // enough or fail + m_nq->m_dbindices.reserve(exti+1); + // First xapian doc we fetch is the one after last stored + int first = m_nq->m_dbindices.size() > 0 ? 
+ m_nq->m_dbindices.back() + 1 : 0; + // Loop until we get enough docs + while (exti >= (int)m_nq->m_dbindices.size()) { + LOGDEB(("Query::getDoc: fetching %d starting at %d\n", + qquantum, first)); + try { + m_nq->mset = m_nq->enquire->get_mset(first, qquantum); + } catch (const Xapian::DatabaseModifiedError &error) { + m_db->m_ndb->db.reopen(); + m_nq->mset = m_nq->enquire->get_mset(first, qquantum); + } catch (const Xapian::Error & error) { + LOGERR(("enquire->get_mset: exception: %s\n", + error.get_msg().c_str())); + abort(); + } + + if (m_nq->mset.empty()) { + LOGDEB(("Query::getDoc: got empty mset\n")); + return false; + } + first = m_nq->mset.get_firstitem(); + for (unsigned int i = 0; i < m_nq->mset.size() ; i++) { + LOGDEB(("Query::getDoc: [%d]\n", i)); + Xapian::Document xdoc = m_nq->mset[i].get_document(); + if ((*m_nq->postfilter)(xdoc)) { + m_nq->m_dbindices.push_back(first + i); + } + } + first = first + m_nq->mset.size(); + } + } + xapi = m_nq->m_dbindices[exti]; + } else { + xapi = exti; + } + + // From there on, we work with a xapian enquire item number. 
Fetch it + int first = m_nq->mset.get_firstitem(); + int last = first + m_nq->mset.size() -1; + + if (!(xapi >= first && xapi <= last)) { + LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); + try { + m_nq->mset = m_nq->enquire->get_mset(xapi, qquantum, + 0, m_nq->decider); + } catch (const Xapian::DatabaseModifiedError &error) { + m_db->m_ndb->db.reopen(); + m_nq->mset = m_nq->enquire->get_mset(xapi, qquantum, + 0, m_nq->decider); + + } catch (const Xapian::Error & error) { + LOGERR(("enquire->get_mset: exception: %s\n", + error.get_msg().c_str())); + abort(); + } + if (m_nq->mset.empty()) + return false; + first = m_nq->mset.get_firstitem(); + last = first + m_nq->mset.size() -1; + } + + LOGDEB1(("Query::getDoc: Qry [%s] win [%d-%d] Estimated results: %d", + m_nq->query.get_description().c_str(), + first, last, + m_nq->mset.get_matches_lower_bound())); + + Xapian::Document xdoc = m_nq->mset[xapi-first].get_document(); + Xapian::docid docid = *(m_nq->mset[xapi-first]); + if (percent) + *percent = m_nq->mset.convert_to_percent(m_nq->mset[xapi-first]); + + // Parse xapian document's data and populate doc fields + string data = xdoc.get_data(); + return m_db->m_ndb->dbDataToRclDoc(docid, data, doc); +} + +list Query::expand(const Doc &doc) +{ + list res; + if (ISNULL(m_nq) || !m_nq->enquire) { + LOGERR(("Query::expand: no query opened\n")); + return res; + } + string ermsg; + for (int tries = 0; tries < 2; tries++) { + try { + Xapian::RSet rset; + rset.add_document(Xapian::docid(doc.xdocid)); + // We don't exclude the original query terms. 
+ Xapian::ESet eset = m_nq->enquire->get_eset(20, rset, false); + LOGDEB(("ESet terms:\n")); + // We filter out the special terms + for (Xapian::ESetIterator it = eset.begin(); + it != eset.end(); it++) { + LOGDEB((" [%s]\n", (*it).c_str())); + if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z')) + continue; + res.push_back(*it); + if (res.size() >= 10) + break; + } + } catch (const Xapian::DatabaseModifiedError &error) { + continue; + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("Query::expand: xapian error %s\n", ermsg.c_str())); + res.clear(); + } + break; + } + + return res; +} + +} diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h new file mode 100644 index 00000000..bb801c7b --- /dev/null +++ b/src/rcldb/rclquery.h @@ -0,0 +1,92 @@ +#ifndef _rclquery_h_included_ +#define _rclquery_h_included_ +/* @(#$Id: rclquery.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2008 J.F.Dockes */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include +#include +#include + +#ifndef NO_NAMESPACES +using std::string; +using std::list; +using std::vector; +#endif + +#include "refcntr.h" + +#ifndef NO_NAMESPACES +namespace Rcl { +#endif + +class SearchData; +class Db; +class Doc; + +/** + * An Rcl::Query is a question (SearchData) applied to a + * database. 
Handles access to the results. Somewhat equivalent to a + * cursor in an rdb. + */ +class Query { + public: + enum QueryOpts {QO_NONE=0, QO_STEM = 1}; + + Query(Db *db); + + ~Query(); + + /** Get explanation about last error */ + string getReason() const; + + /** Parse query string and initialize query */ + bool setQuery(RefCntr q, int opts = QO_NONE, + const string& stemlang = "english"); + bool getQueryTerms(list& terms); /* Fill terms with the terms of the current query; false on error */ + bool getMatchTerms(const Doc& doc, list& terms); /* Fill terms with the query terms matching doc; false on error */ + + /** Get document at rank i in current query. */ + bool getDoc(int i, Doc &doc, int *percent = 0); + + /** Expand query */ + list expand(const Doc &doc); + + /** Get results count (Xapian lower bound) for current query, -1 on error */ + int getResCnt(); + /** Return the Db this query was created on */ + Db *whatDb(); + + /** make this public for access from embedded Db::Native */ + class Native; + Native *m_nq; + +private: + string m_filterTopDir; // Current query filter on subtree top directory + string m_reason; // Error explanation + Db *m_db; + unsigned int m_qOpts; + /* Copy constructor and assignment are private and forbidden */ + Query(const Query &) {} + Query & operator=(const Query &) {return *this;}; +}; + +#ifndef NO_NAMESPACES +} +#endif // NO_NAMESPACES + + +#endif /* _rclquery_h_included_ */ diff --git a/src/rcldb/rclquery_p.h b/src/rcldb/rclquery_p.h new file mode 100644 index 00000000..7f666829 --- /dev/null +++ b/src/rcldb/rclquery_p.h @@ -0,0 +1,66 @@ +#ifndef _rclquery_p_h_included_ +#define _rclquery_p_h_included_ +/* @(#$Id: rclquery_p.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes */ + +#include +#include + +using std::map; +using std::vector; + +#include +#include "rclquery.h" + +namespace Rcl { + +class Query::Native { +public: + Xapian::Query query; // query descriptor: terms and subqueries + // joined by operators (or/and etc...) + + vector m_dbindices; // In case there is a postq filter: sequence of + // db indices that match + + // Filtering results on location. 
There are 2 possible approaches + // for this: + // - Set a "MatchDecider" to be used by Xapian during the query + // - Filter the results out of Xapian (this also uses a + // Xapian::MatchDecider object, but applied to the results by Recoll). + // + // The result filtering approach was the first implemented. + // + // The efficiency of both methods depends on the searches, so the code + // for both has been kept. A nice point for the Xapian approach is that + // the result count estimates are correct (they are wrong with + // the postfilter approach). It is also faster in some worst case scenarios + // so this is now the default (but the post-filtering is faster in many common + // cases). + // + // Which one is used is decided in setQuery(), by setting either of + // the two following members. This in turn is controlled by a + // preprocessor directive. + +#define XAPIAN_FILTERING 1 + + Xapian::MatchDecider *decider; // Xapian does the filtering + Xapian::MatchDecider *postfilter; // Result filtering done by Recoll + + Xapian::Enquire *enquire; // Open query descriptor. + Xapian::MSet mset; // Partial result set + Query *m_q; + // Term frequencies for current query. 
See makeAbstract, setQuery + map termfreqs; + + Native(Query *q) + : decider(0), postfilter(0), enquire(0), m_q(q) + { } + + ~Native() { + delete decider; + delete postfilter; + delete enquire; + } +}; + +} +#endif /* _rclquery_p_h_included_ */ diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index b9101c4e..3d1a7d5b 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -16,7 +16,7 @@ */ #ifndef _SEARCHDATA_H_INCLUDED_ #define _SEARCHDATA_H_INCLUDED_ -/* @(#$Id: searchdata.h,v 1.13 2008-05-08 10:00:20 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: searchdata.h,v 1.14 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */ /** * Structures to hold data coming almost directly from the gui @@ -47,7 +47,7 @@ class SearchDataClause; /** Data structure representing a Recoll user query, for translation - into a Xapian query tree. + into a Xapian query tree. This could probably better called a 'question'. This is a list of search clauses combined through either OR or AND. diff --git a/src/utils/pathut.cpp b/src/utils/pathut.cpp index 856d4c34..63cf0e0c 100644 --- a/src/utils/pathut.cpp +++ b/src/utils/pathut.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pathut.cpp,v 1.19 2008-05-27 06:18:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pathut.cpp,v 1.20 2008-06-13 18:22:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -36,6 +36,7 @@ using std::list; using std::stack; #endif /* NO_NAMESPACES */ +#include "autoconfig.h" #include "pathut.h" #include diff --git a/src/utils/refcntr.h b/src/utils/refcntr.h index abe10174..bc9c53ec 100644 --- a/src/utils/refcntr.h +++ b/src/utils/refcntr.h @@ -42,6 +42,7 @@ public: X *operator->() {return rep;} int getcnt() const {return pcount ? *pcount : 0;} const X *getptr() const {return rep;} + bool isNull() const {return rep == 0;} };