Use a Xapian phrase search on the split path for filtering on directory location (much faster than the current method)

2010-12-16 15:53:40 +01:00 · 2010-12-16 15:53:40 +01:00 · 21c6025ba7
commit 21c6025ba7
parent 6ebc4b4fad
7 changed files with 61 additions and 143 deletions
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -74,6 +74,8 @@ static const unsigned int baseTextPosition = 100000;
 namespace Rcl {
 #endif
 const string pathelt_prefix = "XP";
 string version_string(){
    return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
        string(Xapian::version_string());
@ -921,7 +923,24 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 	    splitter.basepos += splitter.curpos + 100;
 	}
    }
-	    
+
    // Split and index the path from the url for path-based filtering
    {
 	string path = url_gpath(doc.url);
 	vector<string> vpath;
 	stringToTokens(path, vpath, "/");
 	splitter.curpos = 0;
 	newdocument.add_posting(pathelt_prefix, 
 				splitter.basepos + splitter.curpos++);
 	for (vector<string>::const_iterator it = vpath.begin(); 
 	     it != vpath.end(); it++){
 	    newdocument.add_posting(pathelt_prefix + *it, 
 				    splitter.basepos + splitter.curpos++);
 	}
    }
    splitter.basepos += splitter.curpos + 100;
    // Index textual metadata.  These are all indexed as text with
    // positions, as we may want to do phrase searches with them (this
    // makes no sense for keywords by the way).
@ -938,8 +957,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 		continue;
 	    }
 	    LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", 
-		    meta_it->first.c_str(), pfx.c_str(), 
+		     meta_it->first.c_str(), pfx.c_str(), 
-		    meta_it->second.c_str()));
+		     meta_it->second.c_str()));
 	    splitter.setprefix(pfx); // Subject
 	    if (!splitter.text_to_words(meta_it->second))
                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -262,6 +262,7 @@ private:
 // the most reasonable place.
 string version_string();
 extern const string pathelt_prefix;
 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES
--- a/src/rcldb/rclquery.cpp
+++ b/src/rcldb/rclquery.cpp
@ -24,42 +24,6 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.11 2008-12-19 09:55:36 dockes Ex
 namespace Rcl {
 #endif
 // Xapian match decider which accepts a document only when the url stored
 // in its data record has the configured top directory at offset 7.
 class FilterMatcher : public Xapian::MatchDecider {
 public:
    FilterMatcher(const string &topdir) : m_topdir(topdir) {}
    virtual ~FilterMatcher() {}

    // The return type of the decision operator depends on the Xapian
    // major version: int before 1, bool otherwise.
 #if XAPIAN_MAJOR_VERSION < 1
    virtual int operator()(const Xapian::Document &xdoc) const 
 #else
    virtual bool operator()(const Xapian::Document &xdoc) const 
 #endif
    {
 	// The document data record is parsed as a ConfSimple set of
 	// name/value pairs
 	string recdata = xdoc.get_data();
 	ConfSimple fields(recdata);

 	// The only criterion for now is the file path (subtree), taken
 	// from the url field
 	string docurl;
 	fields.get(Doc::keyurl, docurl);
 	LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
 		 m_topdir.c_str(), docurl.c_str()));

 	// The top directory must appear exactly at offset 7 in the url
 	// (presumably right after a "file://" scheme -- to be confirmed)
 	return docurl.find(m_topdir, 7) == 7;
    }

 private:
    // Top of the subtree which documents must belong to
    string m_topdir;
 };
 // Sort helper class
 class QSorter : public Xapian::Sorter {
 public:
@ -134,23 +98,14 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
    }
    m_reason.erase();
    m_filterTopDir = sdata->getTopdir();
    m_nq->clear();
    if (!m_filterTopDir.empty()) {
 #if XAPIAN_FILTERING
 	m_nq->decider = 
 #else
        m_nq->postfilter =
 #endif
 	    new FilterMatcher(m_filterTopDir);
    }
    Xapian::Query xq;
    if (!sdata->toNativeQuery(*m_db, &xq)) {
 	m_reason += sdata->getReason();
 	return false;
    }
    m_nq->xquery = xq;
    string d;
@ -162,6 +117,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
            } else {
                m_nq->xenquire->set_collapse_key(Xapian::BAD_VALUENO);
            }
 	    m_nq->xenquire->set_docid_order(Xapian::Enquire::DONT_CARE);
            if (!m_sortField.empty()) {
                if (m_sorter) {
                    delete (QSorter*)m_sorter;
@ -194,9 +150,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
    if (d.find("Xapian::Query") == 0)
 	d.erase(0, strlen("Xapian::Query"));
-    if (!m_filterTopDir.empty()) {
+
 	d += string(" [dir: ") + m_filterTopDir + "]";
    }
    sdata->setDescription(d);
    LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
    return true;
@ -252,8 +206,9 @@ bool Query::getMatchTerms(unsigned long xdocid, list<string>& terms)
    return true;
 }
 // Mset size
-static const int qquantum = 30;
+static const int qquantum = 50;
 // Get estimated result count for query. Xapian actually does most of
 // the search job in there, this can be long
@ -269,7 +224,7 @@ int Query::getResCnt()
        Chrono chron;
        XAPTRY(m_nq->xmset = 
-               m_nq->xenquire->get_mset(0, qquantum,0, m_nq->decider);
+               m_nq->xenquire->get_mset(0, qquantum, (const Xapian::RSet *)0);
               ret = m_nq->xmset.get_matches_lower_bound(),
               m_db->m_ndb->xrdb, m_reason);
@ -283,76 +238,29 @@ int Query::getResCnt()
 }
-// Get document at rank i in query (i is the index in the whole result
+// Get document at rank xapi in query results.  We check if the
-// set, as in the enquire class. We check if the current mset has the
+// current mset has the doc, else ask for another one. We use msets
-// doc, else ask for an other one. We use msets of 10 documents. Don't
+// of qquantum documents.
 // know if the whole thing makes sense at all but it seems to work.
 //
-// If there is a postquery filter (ie: file names), we have to
+// Note that as stated by a Xapian developer, Enquire searches from
-// maintain a correspondance from the sequential external index
+// scratch each time get_mset() is called. So the better performance
-// sequence to the internal Xapian hole-y one (the holes being the documents 
+// on subsequent calls is probably only due to disk caching.
-// that dont match the filter).
+bool Query::getDoc(int xapi, Doc &doc)
 bool Query::getDoc(int exti, Doc &doc)
 {
-    LOGDEB1(("Query::getDoc: exti %d\n", exti));
+    LOGDEB1(("Query::getDoc: xapian enquire index %d\n", xapi));
    if (ISNULL(m_nq) || !m_nq->xenquire) {
 	LOGERR(("Query::getDoc: no query opened\n"));
 	return false;
    }
    int xapi;
    if (m_nq->postfilter) {
 	// There is a postquery filter, does this fall in already known area ?
 	if (exti >= (int)m_nq->m_dbindices.size()) {
 	    // Have to fetch xapian docs and filter until we get
 	    // enough or fail
 	    m_nq->m_dbindices.reserve(exti+1);
 	    // First xapian doc we fetch is the one after last stored 
 	    int first = m_nq->m_dbindices.size() > 0 ? 
 		m_nq->m_dbindices.back() + 1 : 0;
 	    // Loop until we get enough docs
 	    while (exti >= (int)m_nq->m_dbindices.size()) {
 		LOGDEB(("Query::getDoc: fetching %d starting at %d\n",
 			qquantum, first));
 		XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(first, qquantum),
                       m_db->m_ndb->xrdb, m_reason);
                if (!m_reason.empty()) {
                    LOGERR(("enquire->get_mset: exception: %s\n", 
                            m_reason.c_str()));
                    return false;
 		}
 		if (m_nq->xmset.empty()) {
 		    LOGDEB(("Query::getDoc: got empty mset\n"));
 		    return false;
 		}
 		first = m_nq->xmset.get_firstitem();
 		for (unsigned int i = 0; i < m_nq->xmset.size() ; i++) {
 		    LOGDEB(("Query::getDoc: [%d]\n", i));
 		    Xapian::Document xdoc = m_nq->xmset[i].get_document();
 		    if ((*m_nq->postfilter)(xdoc)) {
 			m_nq->m_dbindices.push_back(first + i);
 		    }
 		}
 		first = first + m_nq->xmset.size();
 	    }
 	}
 	xapi = m_nq->m_dbindices[exti];
    } else {
 	xapi = exti;
    }
    // From there on, we work with a xapian enquire item number. Fetch it
    int first = m_nq->xmset.get_firstitem();
    int last = first + m_nq->xmset.size() -1;
    if (!(xapi >= first && xapi <= last)) {
 	LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
-	XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,
+	XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,  
-                                                      0, m_nq->decider),
+						      (const Xapian::RSet *)0),
               m_db->m_ndb->xrdb, m_reason);
        if (!m_reason.empty()) {
@ -408,6 +316,7 @@ bool Query::getDoc(int exti, Doc &doc)
        return false;
    }
    doc.meta[Rcl::Doc::keyudi] = udi;
    // Parse xapian document's data and populate doc fields
    return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, pc);
 }
--- a/src/rcldb/rclquery.h
+++ b/src/rcldb/rclquery.h
@ -90,7 +90,6 @@ class Query {
    Native *m_nq;
 private:
    string m_filterTopDir; // Current query filter on subtree top directory 
    string m_reason; // Error explanation
    Db    *m_db;
    void  *m_sorter;
--- a/src/rcldb/rclquery_p.h
+++ b/src/rcldb/rclquery_p.h
@ -17,51 +17,25 @@ class Query::Native {
 public:
    /** The query I belong to */
    Query                *m_q;
    /** query descriptor: terms and subqueries joined by operators
     * (or/and etc...)
     */
    Xapian::Query    xquery; 
    /** In case there is a postq filter: sequence of db indices that match */
    vector<int> m_dbindices; 
    // Filtering results on location. There are 2 possible approaches
    // for this:
    //   - Set a "MatchDecider" to be used by Xapian during the query
    //   - Filter the results out of Xapian (this also uses a
    //     Xapian::MatchDecider object, but applied to the results by Recoll).
    // 
    // The result filtering approach was the first implemented. 
    //
    // The efficiency of both methods depends on the searches, so the code
    // for both has been kept.  A nice point for the Xapian approach is that
    // the result count estimates are correct (they are wrong with
    // the postfilter approach). It is also faster in some worst case scenarios
    // so this is now the default (but the post-filtering is faster in many
    // common cases).
    // 
    // Which is used is decided in SetQuery(), by setting either of
    // the two following members. This in turn is controlled by a
    // preprocessor directive.
 #define XAPIAN_FILTERING 1
    Xapian::MatchDecider *decider;   // Xapian does the filtering
    Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
    Xapian::Enquire      *xenquire; // Open query descriptor.
    Xapian::MSet          xmset;    // Partial result set
    // Term frequencies for current query. See makeAbstract, setQuery
    map<string, double>  termfreqs; 
    Native(Query *q)
-	: m_q(q), decider(0), postfilter(0), xenquire(0)
+	: m_q(q), xenquire(0)
    { }
    ~Native() {
 	clear();
    }
    void clear() {
 	m_dbindices.clear();
 	delete decider; decider = 0;
 	delete postfilter; postfilter = 0;
 	delete xenquire; xenquire = 0;
 	termfreqs.clear();
    }
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -249,6 +249,21 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
 	xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }
    // Add the directory filtering clause. The top directory is split into
    // its "/"-separated elements, and the clause is a phrase made of the
    // bare path element prefix followed by each prefixed element, in order.
    if (!m_topdir.empty()) {
 	vector<string> vpath;
 	stringToTokens(m_topdir, vpath, "/");
 	vector<string> pvpath;
 	pvpath.reserve(vpath.size() + 1);
 	// The bare prefix term always comes first in the phrase
 	pvpath.push_back(pathelt_prefix);
 	for (vector<string>::const_iterator it = vpath.begin(); 
 	     it != vpath.end(); ++it) {
 	    pvpath.push_back(pathelt_prefix + *it);
 	}
 	Xapian::Query dirq(Xapian::Query::OP_PHRASE, 
 			   pvpath.begin(), pvpath.end());
 	// Do not use an empty xq as an OP_FILTER operand: when there is
 	// nothing else to filter, the directory clause becomes the whole
 	// query.
 	xq = xq.empty() ? dirq : 
 	    Xapian::Query(Xapian::Query::OP_FILTER, xq, dirq);
    }
    *((Xapian::Query *)d) = xq;
    return true;
 }
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@ -18,14 +18,15 @@
 # Native fields matching omega uses, which we index without an X first
 # letter. Don't change these. Caption is used for 'title' to keep a last
-# remnant of omega compatibility inside the data record
+# remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y
 caption = S
 author = A
 keywords = K
 # Extension examples. These are actually used by default by Recoll, you can
 # add your own to search for fields produced by the filters and not handled
-# by default.
+# by default. 
 # Some values are reserved by recoll: XP (for path elements).
 ext = XE
 filename = XSFN
 recipient = XTO