use a Xapian MatchDecider to filter on dir path

This commit is contained in:
dockes 2007-10-24 15:38:53 +00:00
parent f56d3849dd
commit 2b5593887f

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.124 2007-10-24 08:42:59 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.125 2007-10-24 15:38:53 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -104,7 +104,31 @@ class Native {
Xapian::Database db; Xapian::Database db;
Xapian::Query query; // query descriptor: terms and subqueries Xapian::Query query; // query descriptor: terms and subqueries
// joined by operators (or/and etc...) // joined by operators (or/and etc...)
Xapian::MatchDecider *decider;
// Filtering results on location. There are 2 possible approaches
// for this:
// - Set a "MatchDecider" to be used by Xapian during the query
// - Filter the results out of Xapian (this also uses a
// Xapian::MatchDecider object, but applied to the results by Recoll).
//
// The result filtering approach was the first implemented.
//
// The efficiency of both methods depends on the searches, so the code
// for both has been kept. A nice point for the Xapian approach is that
// the result count estimates are correct (they are wrong with
// the postfilter approach). It is also faster in some worst-case scenarios,
// so this is now the default (but the post-filtering is faster in many common
// cases).
//
// Which one is used is decided in setQuery(), by setting either of
// the two following members. This in turn is controlled by a
// preprocessor directive.
#define XAPIAN_FILTERING 1
Xapian::MatchDecider *decider; // Xapian does the filtering
Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
Xapian::Enquire *enquire; // Open query descriptor. Xapian::Enquire *enquire; // Open query descriptor.
Xapian::MSet mset; // Partial result set Xapian::MSet mset; // Partial result set
@ -113,11 +137,13 @@ class Native {
Native(Db *db) Native(Db *db)
: m_db(db), : m_db(db),
m_isopen(false), m_iswritable(false), decider(0), enquire(0) m_isopen(false), m_iswritable(false), decider(0), postfilter(0),
enquire(0)
{ } { }
~Native() { ~Native() {
delete decider; delete decider;
delete postfilter;
delete enquire; delete enquire;
} }
@ -145,7 +171,9 @@ public:
{} {}
virtual ~FilterMatcher() {} virtual ~FilterMatcher() {}
virtual bool operator()(const Xapian::Document &xdoc) const { virtual bool operator()(const Xapian::Document &xdoc) const
{
m_cnt++;
// Parse xapian document's data and populate doc fields // Parse xapian document's data and populate doc fields
string data = xdoc.get_data(); string data = xdoc.get_data();
ConfSimple parms(&data); ConfSimple parms(&data);
@ -156,17 +184,19 @@ public:
LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n", LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
m_topdir.c_str(), url.c_str())); m_topdir.c_str(), url.c_str()));
if (url.find(m_topdir, 7) == 7) { if (url.find(m_topdir, 7) == 7) {
LOGDEB(("FilterMatcher: MATCH\n")); LOGDEB2(("FilterMatcher: MATCH %d\n", m_cnt));
return true; return true;
} else { } else {
LOGDEB(("FilterMatcher: NO MATCH\n")); LOGDEB2(("FilterMatcher: NO MATCH %d\n", m_cnt));
return false; return false;
} }
} }
static int m_cnt;
private: private:
string m_topdir; string m_topdir;
}; };
int FilterMatcher::m_cnt;
/* See comment in class declaration */ /* See comment in class declaration */
bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids) bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
@ -664,8 +694,7 @@ bool Db::i_close(bool final)
if (w) if (w)
LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n")); LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n"));
// Used to do a flush here. Can't see why it should be necessary. // Used to do a flush here. Can't see why it should be necessary.
delete m_ndb; deleteZ(m_ndb);
m_ndb = 0;
if (w) if (w)
LOGDEB(("Rcl::Db:close() xapian close done.\n")); LOGDEB(("Rcl::Db:close() xapian close done.\n"));
if (final) { if (final) {
@ -1440,14 +1469,20 @@ bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
LOGDEB(("Db::setQuery:\n")); LOGDEB(("Db::setQuery:\n"));
m_filterTopDir = sdata->getTopdir(); m_filterTopDir = sdata->getTopdir();
delete m_ndb->decider; deleteZ(m_ndb->decider);
m_ndb->decider = 0; deleteZ(m_ndb->postfilter);
if (!m_filterTopDir.empty()) if (!m_filterTopDir.empty()) {
m_ndb->decider = new FilterMatcher(m_filterTopDir); #if XAPIAN_FILTERING
m_ndb->decider =
#else
m_ndb->postfilter =
#endif
new FilterMatcher(m_filterTopDir);
}
m_dbindices.clear(); m_dbindices.clear();
m_qOpts = opts; m_qOpts = opts;
m_ndb->m_termfreqs.clear(); m_ndb->m_termfreqs.clear();
FilterMatcher::m_cnt = 0;
Xapian::Query xq; Xapian::Query xq;
if (!sdata->toNativeQuery(*this, &xq, if (!sdata->toNativeQuery(*this, &xq,
(opts & Db::QO_STEM) ? stemlang : "")) { (opts & Db::QO_STEM) ? stemlang : "")) {
@ -1745,10 +1780,12 @@ int Db::getResCnt()
string ermsg; string ermsg;
if (m_ndb->mset.size() <= 0) { if (m_ndb->mset.size() <= 0) {
try { try {
m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum); m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
0, m_ndb->decider);
} catch (const Xapian::DatabaseModifiedError &error) { } catch (const Xapian::DatabaseModifiedError &error) {
m_ndb->db.reopen(); m_ndb->db.reopen();
m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum); m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
0, m_ndb->decider);
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str())); LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str()));
@ -1781,7 +1818,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
} }
int xapi; int xapi;
if (m_ndb->decider) { if (m_ndb->postfilter) {
// There is a postquery filter, does this fall in already known area ? // There is a postquery filter, does this fall in already known area ?
if (exti >= (int)m_dbindices.size()) { if (exti >= (int)m_dbindices.size()) {
// Have to fetch xapian docs and filter until we get // Have to fetch xapian docs and filter until we get
@ -1812,7 +1849,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) { for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) {
LOGDEB(("Db::getDoc: [%d]\n", i)); LOGDEB(("Db::getDoc: [%d]\n", i));
Xapian::Document xdoc = m_ndb->mset[i].get_document(); Xapian::Document xdoc = m_ndb->mset[i].get_document();
if ((*m_ndb->decider)(xdoc)) { if ((*m_ndb->postfilter)(xdoc)) {
m_dbindices.push_back(first + i); m_dbindices.push_back(first + i);
} }
} }
@ -1831,10 +1868,13 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
if (!(xapi >= first && xapi <= last)) { if (!(xapi >= first && xapi <= last)) {
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
try { try {
m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum); m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
0, m_ndb->decider);
} catch (const Xapian::DatabaseModifiedError &error) { } catch (const Xapian::DatabaseModifiedError &error) {
m_ndb->db.reopen(); m_ndb->db.reopen();
m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum); m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
0, m_ndb->decider);
} catch (const Xapian::Error & error) { } catch (const Xapian::Error & error) {
LOGERR(("enquire->get_mset: exception: %s\n", LOGERR(("enquire->get_mset: exception: %s\n",
error.get_msg().c_str())); error.get_msg().c_str()));