Use a Xapian phrase search on the split path for filtering on directory location (much faster than the current method)

This commit is contained in:
Jean-Francois Dockes 2010-12-16 15:53:40 +01:00
parent 6ebc4b4fad
commit 21c6025ba7
7 changed files with 61 additions and 143 deletions

View File

@ -74,6 +74,8 @@ static const unsigned int baseTextPosition = 100000;
namespace Rcl { namespace Rcl {
#endif #endif
const string pathelt_prefix = "XP";
string version_string(){ string version_string(){
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") + return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
string(Xapian::version_string()); string(Xapian::version_string());
@ -921,7 +923,24 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
splitter.basepos += splitter.curpos + 100; splitter.basepos += splitter.curpos + 100;
} }
} }
// Split and index the path from the url for path-based filtering
{
string path = url_gpath(doc.url);
vector<string> vpath;
stringToTokens(path, vpath, "/");
splitter.curpos = 0;
newdocument.add_posting(pathelt_prefix,
splitter.basepos + splitter.curpos++);
for (vector<string>::const_iterator it = vpath.begin();
it != vpath.end(); it++){
newdocument.add_posting(pathelt_prefix + *it,
splitter.basepos + splitter.curpos++);
}
}
splitter.basepos += splitter.curpos + 100;
// Index textual metadata. These are all indexed as text with // Index textual metadata. These are all indexed as text with
// positions, as we may want to do phrase searches with them (this // positions, as we may want to do phrase searches with them (this
// makes no sense for keywords by the way). // makes no sense for keywords by the way).
@ -938,8 +957,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
continue; continue;
} }
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
meta_it->first.c_str(), pfx.c_str(), meta_it->first.c_str(), pfx.c_str(),
meta_it->second.c_str())); meta_it->second.c_str()));
splitter.setprefix(pfx); // Subject splitter.setprefix(pfx); // Subject
if (!splitter.text_to_words(meta_it->second)) if (!splitter.text_to_words(meta_it->second))
LOGDEB(("Db::addOrUpdate: split failed for %s\n", LOGDEB(("Db::addOrUpdate: split failed for %s\n",

View File

@ -262,6 +262,7 @@ private:
// the most reasonable place. // the most reasonable place.
string version_string(); string version_string();
extern const string pathelt_prefix;
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
} }
#endif // NO_NAMESPACES #endif // NO_NAMESPACES

View File

@ -24,42 +24,6 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.11 2008-12-19 09:55:36 dockes Ex
namespace Rcl { namespace Rcl {
#endif #endif
// Match decider restricting results to the documents located under a
// given top directory. The document url is read from the Doc::keyurl
// entry of the Xapian document data record and its path part is compared
// with the directory.
class FilterMatcher : public Xapian::MatchDecider {
public:
    // topdir: top directory of the subtree to which results are restricted
    FilterMatcher(const string &topdir)
        : m_topdir(topdir)
    {}
    virtual ~FilterMatcher() {}

    // Return true if the document url lies inside the m_topdir subtree,
    // false otherwise. The return type depends on the Xapian version.
    virtual
#if XAPIAN_MAJOR_VERSION < 1
    int
#else
    bool
#endif
    operator()(const Xapian::Document &xdoc) const
    {
        // Parse xapian document's data to extract the url
        string data = xdoc.get_data();
        ConfSimple parms(data);

        // The only filtering for now is on file path (subtree)
        string url;
        parms.get(Doc::keyurl, url);
        LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
                 m_topdir.c_str(), url.c_str()));

        // Offset of the path inside the url.
        // NOTE(review): presumably the length of a "file://" scheme
        // prefix -- confirm against the code building the urls.
        static const string::size_type pathoffs = 7;

        // Anchored comparison: unlike find(), this does not go on
        // scanning the rest of the url when the prefix does not match.
        if (url.size() < pathoffs ||
            url.compare(pathoffs, m_topdir.size(), m_topdir) != 0) {
            return false;
        }

        // An empty directory, or one ending with a separator, needs no
        // further check.
        if (m_topdir.empty() || m_topdir[m_topdir.size() - 1] == '/') {
            return true;
        }

        // The match must stop on a path element boundary, else a top
        // directory of /a/doc would also select /a/docs/...
        const string::size_type end = pathoffs + m_topdir.size();
        return end == url.size() || url[end] == '/';
    }

private:
    string m_topdir; // Top directory of the subtree being selected
};
// Sort helper class // Sort helper class
class QSorter : public Xapian::Sorter { class QSorter : public Xapian::Sorter {
public: public:
@ -134,23 +98,14 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
} }
m_reason.erase(); m_reason.erase();
m_filterTopDir = sdata->getTopdir();
m_nq->clear(); m_nq->clear();
if (!m_filterTopDir.empty()) {
#if XAPIAN_FILTERING
m_nq->decider =
#else
m_nq->postfilter =
#endif
new FilterMatcher(m_filterTopDir);
}
Xapian::Query xq; Xapian::Query xq;
if (!sdata->toNativeQuery(*m_db, &xq)) { if (!sdata->toNativeQuery(*m_db, &xq)) {
m_reason += sdata->getReason(); m_reason += sdata->getReason();
return false; return false;
} }
m_nq->xquery = xq; m_nq->xquery = xq;
string d; string d;
@ -162,6 +117,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
} else { } else {
m_nq->xenquire->set_collapse_key(Xapian::BAD_VALUENO); m_nq->xenquire->set_collapse_key(Xapian::BAD_VALUENO);
} }
m_nq->xenquire->set_docid_order(Xapian::Enquire::DONT_CARE);
if (!m_sortField.empty()) { if (!m_sortField.empty()) {
if (m_sorter) { if (m_sorter) {
delete (QSorter*)m_sorter; delete (QSorter*)m_sorter;
@ -194,9 +150,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
if (d.find("Xapian::Query") == 0) if (d.find("Xapian::Query") == 0)
d.erase(0, strlen("Xapian::Query")); d.erase(0, strlen("Xapian::Query"));
if (!m_filterTopDir.empty()) {
d += string(" [dir: ") + m_filterTopDir + "]";
}
sdata->setDescription(d); sdata->setDescription(d);
LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str())); LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
return true; return true;
@ -252,8 +206,9 @@ bool Query::getMatchTerms(unsigned long xdocid, list<string>& terms)
return true; return true;
} }
// Mset size // Mset size
static const int qquantum = 30; static const int qquantum = 50;
// Get estimated result count for query. Xapian actually does most of // Get estimated result count for query. Xapian actually does most of
// the search job in there, this can be long // the search job in there, this can be long
@ -269,7 +224,7 @@ int Query::getResCnt()
Chrono chron; Chrono chron;
XAPTRY(m_nq->xmset = XAPTRY(m_nq->xmset =
m_nq->xenquire->get_mset(0, qquantum,0, m_nq->decider); m_nq->xenquire->get_mset(0, qquantum, (const Xapian::RSet *)0);
ret = m_nq->xmset.get_matches_lower_bound(), ret = m_nq->xmset.get_matches_lower_bound(),
m_db->m_ndb->xrdb, m_reason); m_db->m_ndb->xrdb, m_reason);
@ -283,76 +238,29 @@ int Query::getResCnt()
} }
// Get document at rank i in query (i is the index in the whole result // Get document at rank xapi in query results. We check if the
// set, as in the enquire class. We check if the current mset has the // current mset has the doc, else ask for an other one. We use msets
// doc, else ask for an other one. We use msets of 10 documents. Don't // of qquantum documents.
// know if the whole thing makes sense at all but it seems to work.
// //
// If there is a postquery filter (ie: file names), we have to // Note that as stated by a Xapian developer, Enquire searches from
// maintain a correspondance from the sequential external index // scratch each time get_mset() is called. So the better performance
// sequence to the internal Xapian hole-y one (the holes being the documents // on subsequent calls is probably only due to disk caching.
// that dont match the filter). bool Query::getDoc(int xapi, Doc &doc)
bool Query::getDoc(int exti, Doc &doc)
{ {
LOGDEB1(("Query::getDoc: exti %d\n", exti)); LOGDEB1(("Query::getDoc: xapian enquire index %d\n", xapi));
if (ISNULL(m_nq) || !m_nq->xenquire) { if (ISNULL(m_nq) || !m_nq->xenquire) {
LOGERR(("Query::getDoc: no query opened\n")); LOGERR(("Query::getDoc: no query opened\n"));
return false; return false;
} }
int xapi;
if (m_nq->postfilter) {
// There is a postquery filter, does this fall in already known area ?
if (exti >= (int)m_nq->m_dbindices.size()) {
// Have to fetch xapian docs and filter until we get
// enough or fail
m_nq->m_dbindices.reserve(exti+1);
// First xapian doc we fetch is the one after last stored
int first = m_nq->m_dbindices.size() > 0 ?
m_nq->m_dbindices.back() + 1 : 0;
// Loop until we get enough docs
while (exti >= (int)m_nq->m_dbindices.size()) {
LOGDEB(("Query::getDoc: fetching %d starting at %d\n",
qquantum, first));
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(first, qquantum),
m_db->m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR(("enquire->get_mset: exception: %s\n",
m_reason.c_str()));
return false;
}
if (m_nq->xmset.empty()) {
LOGDEB(("Query::getDoc: got empty mset\n"));
return false;
}
first = m_nq->xmset.get_firstitem();
for (unsigned int i = 0; i < m_nq->xmset.size() ; i++) {
LOGDEB(("Query::getDoc: [%d]\n", i));
Xapian::Document xdoc = m_nq->xmset[i].get_document();
if ((*m_nq->postfilter)(xdoc)) {
m_nq->m_dbindices.push_back(first + i);
}
}
first = first + m_nq->xmset.size();
}
}
xapi = m_nq->m_dbindices[exti];
} else {
xapi = exti;
}
// From there on, we work with a xapian enquire item number. Fetch it
int first = m_nq->xmset.get_firstitem(); int first = m_nq->xmset.get_firstitem();
int last = first + m_nq->xmset.size() -1; int last = first + m_nq->xmset.size() -1;
if (!(xapi >= first && xapi <= last)) { if (!(xapi >= first && xapi <= last)) {
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum, XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,
0, m_nq->decider), (const Xapian::RSet *)0),
m_db->m_ndb->xrdb, m_reason); m_db->m_ndb->xrdb, m_reason);
if (!m_reason.empty()) { if (!m_reason.empty()) {
@ -408,6 +316,7 @@ bool Query::getDoc(int exti, Doc &doc)
return false; return false;
} }
doc.meta[Rcl::Doc::keyudi] = udi; doc.meta[Rcl::Doc::keyudi] = udi;
// Parse xapian document's data and populate doc fields // Parse xapian document's data and populate doc fields
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, pc); return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, pc);
} }

View File

@ -90,7 +90,6 @@ class Query {
Native *m_nq; Native *m_nq;
private: private:
string m_filterTopDir; // Current query filter on subtree top directory
string m_reason; // Error explanation string m_reason; // Error explanation
Db *m_db; Db *m_db;
void *m_sorter; void *m_sorter;

View File

@ -17,51 +17,25 @@ class Query::Native {
public: public:
/** The query I belong to */ /** The query I belong to */
Query *m_q; Query *m_q;
/** query descriptor: terms and subqueries joined by operators /** query descriptor: terms and subqueries joined by operators
* (or/and etc...) * (or/and etc...)
*/ */
Xapian::Query xquery; Xapian::Query xquery;
/** In case there is a postq filter: sequence of db indices that match */
vector<int> m_dbindices;
// Filtering results on location. There are 2 possible approaches
// for this:
// - Set a "MatchDecider" to be used by Xapian during the query
// - Filter the results out of Xapian (this also uses a
// Xapian::MatchDecider object, but applied to the results by Recoll.
//
// The result filtering approach was the first implemented.
//
// The efficiency of both methods depend on the searches, so the code
// for both has been kept. A nice point for the Xapian approach is that
// the result count estimate are correct (they are wrong with
// the postfilter approach). It is also faster in some worst case scenarios
// so this now the default (but the post-filtering is faster in many common
// cases).
//
// Which is used is decided in SetQuery(), by setting either of
// the two following members. This in turn is controlled by a
// preprocessor directive.
#define XAPIAN_FILTERING 1
Xapian::MatchDecider *decider; // Xapian does the filtering
Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
Xapian::Enquire *xenquire; // Open query descriptor. Xapian::Enquire *xenquire; // Open query descriptor.
Xapian::MSet xmset; // Partial result set Xapian::MSet xmset; // Partial result set
// Term frequencies for current query. See makeAbstract, setQuery // Term frequencies for current query. See makeAbstract, setQuery
map<string, double> termfreqs; map<string, double> termfreqs;
Native(Query *q) Native(Query *q)
: m_q(q), decider(0), postfilter(0), xenquire(0) : m_q(q), xenquire(0)
{ } { }
~Native() { ~Native() {
clear(); clear();
} }
void clear() { void clear() {
m_dbindices.clear();
delete decider; decider = 0;
delete postfilter; postfilter = 0;
delete xenquire; xenquire = 0; delete xenquire; xenquire = 0;
termfreqs.clear(); termfreqs.clear();
} }

View File

@ -249,6 +249,21 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq); xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
} }
// Add the directory filtering clause
if (!m_topdir.empty()) {
vector<string> vpath;
stringToTokens(m_topdir, vpath, "/");
vector<string> pvpath;
pvpath.push_back(pathelt_prefix);
for (vector<string>::const_iterator it = vpath.begin();
it != vpath.end(); it++){
pvpath.push_back(pathelt_prefix + *it);
}
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq,
Xapian::Query(Xapian::Query::OP_PHRASE,
pvpath.begin(), pvpath.end()));
}
*((Xapian::Query *)d) = xq; *((Xapian::Query *)d) = xq;
return true; return true;
} }

View File

@ -18,14 +18,15 @@
# Native fields matching omega uses, which we index without an X first # Native fields matching omega uses, which we index without an X first
# letter. Don't change these. Caption is used for 'title' to keep a last # letter. Don't change these. Caption is used for 'title' to keep a last
# remnant of omega compatibility inside the data record # remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y
caption = S caption = S
author = A author = A
keywords = K keywords = K
# Extension examples. These are actually used by default by Recoll, you can # Extension examples. These are actually used by default by Recoll, you can
# add your own to search for fields produced by the filters and not handled # add your own to search for fields produced by the filters and not handled
# by default. # by default.
# Some values are reserved by recoll: XP (for path elements).
ext = XE ext = XE
filename = XSFN filename = XSFN
recipient = XTO recipient = XTO