Term expansion: handle field issues inside Rcl::Db::termMatch, so that the field name is taken into account for all expansions. This makes File Name searches and filename: query-language searches behave the same way, and improves overall consistency.

2009-12-07 13:27:57 +00:00 · 2009-12-07 13:27:57 +00:00 · bab030f846
commit bab030f846
parent e932144440
3 changed files with 48 additions and 43 deletions
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1329,11 +1329,7 @@ bool Db::purgeFile(const string &udi, bool *existed)
 // File name wild card expansion. This is a specialisation of termMatch
 bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 {
-    string pattern;
+    string pattern = fnexp;
    if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
 	LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
 	return false;
    }
    names.clear();
    // If pattern is not quoted, and has no wildcards, we add * at
@ -1350,12 +1346,12 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 	return false;
    for (list<TermMatchEntry>::const_iterator it = entries.begin();
 	 it != entries.end(); it++) 
-	names.push_back("XSFN"+it->term);
+	names.push_back(it->term);
    if (names.empty()) {
 	// Build an impossible query: we know its impossible because we
 	// control the prefixes!
-	names.push_back("XIMPOSSIBLE");
+	names.push_back("XNONENoMatchingTerms");
    }
    return true;
 }
@ -1398,6 +1394,16 @@ bool Db::stemExpand(const string &lang, const string &term,
    return true;
 }
 /** Add prefix to all strings in list */
 static void addPrefix(list<TermMatchEntry>& terms, const string& prefix)
 {
    if (prefix.empty())
 	return;
    for (list<TermMatchEntry>::iterator it = terms.begin(); 
         it != terms.end(); it++)
 	it->term.insert(0, prefix);
 }
 // Characters that can begin a wildcard or regexp expression. We use skipto
 // to begin the allterms search with terms that begin with the portion of
 // the input string prior to these chars.
@ -1409,7 +1415,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
 		   const string &root, 
 		   list<TermMatchEntry>& res,
 		   int max, 
-		   const string& field)
+		   const string& field,
                   string *prefixp
    )
 {
    if (!m_ndb || !m_ndb->m_isopen)
 	return false;
@ -1428,6 +1436,12 @@ bool Db::termMatch(MatchType typ, const string &lang,
    string prefix;
    if (!field.empty()) {
 	(void)fieldToPrefix(field, prefix); 
        if (prefix.empty()) {
            LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", 
                    field.c_str()));
        }
        if (prefixp)
            *prefixp = prefix;
    }
    if (typ == ET_STEM) {
@ -1443,6 +1457,8 @@ bool Db::termMatch(MatchType typ, const string &lang,
                return false;
 	    LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
 	}
        if (!prefix.empty())
            addPrefix(res, prefix);
    } else {
 	regex_t reg;
 	int errcode;
@ -1493,7 +1509,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
                            continue;
                    }
                    // Do we want stem expansion here? We don't do it for now
-                    res.push_back(TermMatchEntry(term, it.get_termfreq()));
+                    res.push_back(TermMatchEntry(*it, it.get_termfreq()));
                    ++n;
                }
                m_reason.erase();
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -158,7 +158,9 @@ class Db {
    enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
    bool termMatch(MatchType typ, const string &lang, const string &s, 
 		   list<TermMatchEntry>& result, int max = -1, 
-		   const string& field = "");
+		   const string& field = "",
                   string *prefix = 0
        );
    /** Special filename wildcard to XSFN terms expansion.
 	internal/searchdata use only */
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -240,9 +240,9 @@ class wsQData : public TextSplitCB {
 // translating.
 class StringToXapianQ {
 public:
-    StringToXapianQ(Db& db, const string& prefix, 
+    StringToXapianQ(Db& db, const string& field, 
 		    const string &stmlng, bool boostUser)
-	: m_db(db), m_prefix(prefix), m_stemlang(stmlng), 
+	: m_db(db), m_field(field), m_stemlang(stmlng), 
 	  m_doBoostUserTerms(boostUser)
    { }
@ -267,7 +267,7 @@ public:
 private:
    void expandTerm(bool dont, const string& term, list<string>& exp, 
-		      string& sterm);
+                    string& sterm, string *prefix);
    // After splitting entry on whitespace: process non-phrase element
    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
    // Process phrase/near element
@ -276,7 +276,7 @@ private:
 			     bool useNear, int slack);
    Db&           m_db;
-    const string& m_prefix;
+    const string& m_field;
    const string& m_stemlang;
    bool          m_doBoostUserTerms;
    // Single terms and phrases resulting from breaking up text;
@ -309,9 +309,9 @@ static void listVector(const string& what, const vector<string>&l)
 * @param sterm output original input term if there were no wildcards
 */
 void StringToXapianQ::expandTerm(bool nostemexp, 
-				      const string& term, 
+                                 const string& term, 
-				      list<string>& exp,
+                                 list<string>& exp,
-				      string &sterm)
+                                 string &sterm, string *prefix)
 {
    LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n", 
 	     term.c_str(), m_stemlang.c_str(), nostemexp));
@ -336,11 +336,13 @@ void StringToXapianQ::expandTerm(bool nostemexp,
    } else {
 	list<TermMatchEntry> l;
 	if (haswild) {
-	    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
+	    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l, -1, m_field,
                           prefix);
 	} else {
 	    sterm = term;
            m_uterms.push_back(sterm);
-	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
+	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l, -1, m_field,
                           prefix);
 	}
 	for (list<TermMatchEntry>::const_iterator it = l.begin(); 
 	     it != l.end(); it++) {
@ -384,23 +386,14 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
    }
 }
 /** Prepend prefix to every string in the list.
  *  The list is left untouched when prefix is empty. */
 static void addPrefix(list<string>& terms, const string& prefix)
 {
     if (!prefix.empty()) {
         list<string>::iterator cur = terms.begin();
         while (cur != terms.end()) {
             // Insert at offset 0 so the prefix ends up in front of the term
             cur->insert(0, prefix);
             cur++;
         }
     }
 }
 void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 					list<Xapian::Query> &pqueries)
 {
    list<string> exp;  
    string sterm; // dumb version of user term
-    expandTerm(nostemexp, span, exp, sterm);
+    string prefix;
    expandTerm(nostemexp, span, exp, sterm, &prefix);
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
    addPrefix(exp, m_prefix);
    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
@ -412,7 +405,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
    if (m_doBoostUserTerms && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, 
                           xq, 
-                           Xapian::Query(m_prefix+sterm, 
+                           Xapian::Query(prefix+sterm, 
                                         original_term_wqf_booster));
    }
    pqueries.push_back(xq);
@ -443,9 +436,9 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
 	string sterm;
 	list<string>exp;
-	expandTerm(nostemexp, *it, exp, sterm);
+        string prefix;
 	expandTerm(nostemexp, *it, exp, sterm, &prefix);
 	groups.push_back(vector<string>(exp.begin(), exp.end()));
 	addPrefix(exp, m_prefix);
 	orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
 					  exp.begin(), exp.end()));
 #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
@ -597,9 +590,6 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
 	LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
 	return false;
    }
    string prefix;
    if (!m_field.empty())
 	db.fieldToPrefix(m_field, prefix);
    list<Xapian::Query> pqueries;
    // We normally boost the original term in the stem expansion list. Don't
@ -608,7 +598,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
 	(m_parentSearch && !m_parentSearch->haveWildCards()) || 
 	(m_parentSearch == 0 && !m_haveWildCards);
-    StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
+    StringToXapianQ tr(db, m_field, l_stemlang, doBoostUserTerm);
    if (!tr.processUserString(m_text, m_reason, pqueries, 
 			      db.getStopList()))
 	return false;
@ -623,7 +613,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
    return true;
 }
-// Translate a FILENAME search clause. 
+// Translate a FILENAME search clause. Actually this is now mostly
 // a "filename" field search.
 bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, 
 					     const string&)
 {
@ -660,10 +651,6 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
    list<Xapian::Query> pqueries;
    Xapian::Query nq;
    string prefix;
    if (!m_field.empty())
 	db.fieldToPrefix(m_field, prefix);
    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
    bool doBoostUserTerm = 
@ -680,7 +667,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
    }
    string s = string("\"") + m_text + string("\"");
    bool useNear = (m_tp == SCLT_NEAR);
-    StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
+    StringToXapianQ tr(db, m_field, l_stemlang, doBoostUserTerm);
    if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
 			      m_slack, useNear))
 	return false;