correctly generate highlighting term groups when stem-expanding NEAR queries

2006-11-18 12:30:14 +00:00 · 2006-11-18 12:30:14 +00:00 · 1e55b88443
commit 1e55b88443
parent 078b414cab
1 changed files with 80 additions and 37 deletions
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.4 2006-11-17 10:06:34 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.5 2006-11-18 12:30:14 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -135,13 +135,6 @@ bool SearchData::getTerms(vector<string>& terms,
 class wsQData : public TextSplitCB {
 public:
    vector<string> terms;
-    // Debug
-    string catterms() {
-	string s;
-	for (unsigned int i = 0; i < terms.size(); i++)
-	    s += "[" + terms[i] + "] ";
-	return s;
-    }
    bool takeword(const std::string &term, int , int, int) {
 	LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
 	terms.push_back(term);
@ -149,16 +142,22 @@ class wsQData : public TextSplitCB {
    }
 };

+/// Translate user string (ie: term1 "a phrase" term3) into a xapian
+/// query tree
 // This used to be a static function, but we couldn't just keep adding
 // parameters to the interface!
 class StringToXapianQ {
 public:
-    StringToXapianQ(Db& db) : m_db(db) { }
+    StringToXapianQ(Db& db, const string &stmlng) 
+	: m_db(db), m_stemlang(stmlng) 
+    { }
+
+
    bool translate(const string &iq,
-		   const string& stemlang,
 		   string &ermsg,
 		   list<Xapian::Query> &pqueries,
 		   int slack = 0, bool useNear = false);
+
    bool getTerms(vector<string>& terms, 
 		  vector<vector<string> >& groups) 
    {
@ -166,18 +165,20 @@ public:
 	groups.insert(groups.end(), m_groups.begin(), m_groups.end());
 	return true;
    }
-private:
-    void maybeStemExp(const string& stemlang, const string& term, 
-		      list<string>& exp);

-    Db& m_db;
+private:
+    void maybeStemExp(bool dont, const string& term, list<string>& exp);
+
+    Db&           m_db;
+    const string& m_stemlang;
+
    // Single terms and phrases resulting from breaking up text;
    vector<string>          m_terms;
    vector<vector<string> > m_groups; 
 };

-/** Make term dumb and possibly expand it into its stem siblings */
-void StringToXapianQ::maybeStemExp(const string& stemlang, 
+/** Make term dumb and possibly expand it into its stem siblings. */
+void StringToXapianQ::maybeStemExp(bool nostemexp, 
 				   const string& term, 
 				   list<string>& exp)
 {
@ -190,8 +191,7 @@ void StringToXapianQ::maybeStemExp(const string& stemlang,
    string term1;
    dumb_string(term, term1);

-    bool nostemexp = stemlang.empty() ? true : false;
-    if (!nostemexp) {
+    if (!m_stemlang.empty() && !nostemexp) {
 	// Check if the first letter is a majuscule in which
 	// case we do not want to do stem expansion. Note that
 	// the test is convoluted and possibly problematic
@ -210,7 +210,41 @@ void StringToXapianQ::maybeStemExp(const string& stemlang,
    if (nostemexp) {
 	exp = list<string>(1, term1);
    } else {
-	exp = m_db.stemExpand(stemlang, term1);
+	exp = m_db.stemExpand(m_stemlang, term1);
+    }
+}
+
+// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
+void multiply_groups(vector<vector<string> >::const_iterator vvit,
+		     vector<vector<string> >::const_iterator vvend, 
+		     vector<string>& comb,
+		     vector<vector<string> >&allcombs)
+{
+    // Remember my string vector and compute next, for recursive calls.
+    vector<vector<string> >::const_iterator myvit = vvit++;
+
+    // Walk the string vector I'm called upon and, for each string,
+    // add it to current result, an call myself recursively on the
+    // next string vector. The last call (last element of the vector of
+    // vectors), adds the elementary result to the output
+
+    // Walk my string vector
+    for (vector<string>::const_iterator strit = (*myvit).begin();
+	 strit != (*myvit).end(); strit++) {
+
+	// Add my current value to the string vector we're building
+	comb.push_back(*strit);
+
+	if (vvit == vvend) {
+	    // Last call: store current result
+	    allcombs.push_back(comb);
+	} else {
+	    // Call recursively on next string vector
+	    multiply_groups(vvit, vvend, comb, allcombs);
+	}
+	// Pop the value I just added (make room for the next element in my
+	// vector)
+	comb.pop_back();
    }
 }

@ -228,13 +262,11 @@ void StringToXapianQ::maybeStemExp(const string& stemlang,
 *   count)
 */
 bool StringToXapianQ::translate(const string &iq,
-				const string& stemlang,
 				string &ermsg,
 				list<Xapian::Query> &pqueries,
 				int slack, bool useNear)
 {
    string qstring = iq;
-    bool opt_stemexp = !stemlang.empty();
    ermsg.erase();
    m_terms.clear();
    m_groups.clear();
@ -243,8 +275,8 @@ bool StringToXapianQ::translate(const string &iq,
    list<string> phrases;
    stringToStrings(qstring, phrases);

-    // Then process each phrase: split into terms and transform into
-    // appropriate Xapian Query
+    // Then process each word/phrase: split into terms and transform
+    // into appropriate Xapian Query
    try {
 	for (list<string>::iterator it = phrases.begin(); 
 	     it != phrases.end(); it++) {
@ -272,7 +304,7 @@ bool StringToXapianQ::translate(const string &iq,
 		{
 		    string term = splitData.terms.front();
 		    list<string> exp;  
-		    maybeStemExp(stemlang, term, exp);
+		    maybeStemExp(false, term, exp);
 		    // Push either term or OR of stem-expanded set
 		    pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
 						     exp.begin(), exp.end()));
@ -286,27 +318,38 @@ bool StringToXapianQ::translate(const string &iq,
 		Xapian::Query::OP_PHRASE;
 		list<Xapian::Query> orqueries;
 		bool hadmultiple = false;
-		string nolang, lang;
-		vector<string> dumbterms;
+		vector<vector<string> >groups;
 		for (vector<string>::iterator it = splitData.terms.begin();
 		     it != splitData.terms.end(); it++) {
+		    // Some version of xapian will accept only one OR clause
+		    // inside NEAR, all others must be leafs
+		    bool nostemexp = 
+			(op == Xapian::Query::OP_PHRASE || hadmultiple) ?
+			true : false;
+
 		    list<string>exp;
-		    lang = (op == Xapian::Query::OP_PHRASE || hadmultiple) ?
-			nolang : stemlang;
-		    maybeStemExp(lang, *it, exp);
-		    dumbterms.insert(dumbterms.end(), exp.begin(), exp.end());
+		    maybeStemExp(nostemexp, *it, exp);
+
+		    groups.push_back(vector<string>(exp.begin(), exp.end()));
+		    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
+						      exp.begin(), exp.end()));
 #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
 		    if (exp.size() > 1) 
 			hadmultiple = true;
 #endif
-		    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
-						      exp.begin(), exp.end()));
 		}
+
 		pqueries.push_back(Xapian::Query(op,
 						 orqueries.begin(),
 						 orqueries.end(),
 					 splitData.terms.size() + slack));
-		m_groups.push_back(dumbterms);
+		// Add NEAR/PHRASE groups to the highlighting data. Must
+		// push all combinations
+		vector<vector<string> > allcombs;
+		vector<string> comb;
+		multiply_groups(groups.begin(), groups.end(), comb, allcombs);
+		m_groups.insert(m_groups.end(), allcombs.begin(), 
+				allcombs.end());
 	    }
 	}
    } catch (const Xapian::Error &e) {
@ -345,8 +388,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
 	return false;
    }
    list<Xapian::Query> pqueries;
-    StringToXapianQ tr(db);
-    if (!tr.translate(m_text, stemlang, m_reason, pqueries))
+    StringToXapianQ tr(db, stemlang);
+    if (!tr.translate(m_text, m_reason, pqueries))
 	return false;
    if (pqueries.empty()) {
 	LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -388,8 +431,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
    // terms etc. The result should be a single element list
    string s = string("\"") + m_text + string("\"");
    bool useNear = m_tp == SCLT_NEAR;
-    StringToXapianQ tr(db);
-    if (!tr.translate(s, stemlang, m_reason, pqueries, m_slack, useNear))
+    StringToXapianQ tr(db, stemlang);
+    if (!tr.translate(s, m_reason, pqueries, m_slack, useNear))
 	return false;
    if (pqueries.empty()) {
 	LOGERR(("SearchDataClauseDist: resolved to null query\n"));