Support multi-word synonyms and add a modifier to turn off synonym expansion

2015-08-23 12:15:52 +02:00 · 2015-08-23 12:15:52 +02:00 · e7a669b668
commit e7a669b668
parent 766a34a8db
6 changed files with 112 additions and 61 deletions
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -3344,6 +3344,11 @@ dir:recoll dir:src -dir:utils -dir:common
            stemming is off by default for phrases).</para>
            </listitem>

+            <listitem><para><literal>s</literal> can be used to turn off
+            synonym expansion, if a synonyms file is in place (only for
+            &RCL; 1.22 and later).</para>
+            </listitem>
+
            <listitem><para><literal>o</literal> can be used to specify a
            "slack" for phrase and proximity searches: the number of
            additional terms that may be found between the specified
--- a/src/query/wasaparse.ypp
+++ b/src/query/wasaparse.ypp
@ -282,6 +282,11 @@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
                //cerr << "set slack " << cl->getslack() << " done" << endl;
            }
            break;
+        case 's': 
+            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
+            break;
+	case 'S':
+            break;
        case '.':case '0':case '1':case '2':case '3':case '4':
        case '5':case '6':case '7':case '8':case '9':
        {
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -341,8 +341,9 @@ class Db {
     * Stem expansion is performed if lang is not empty 
     * 
     * @param typ_sens defines the kind of expansion: none, wildcard, 
-     *    regexp or stemming. "none" will still expand case and
-     *    diacritics depending on the casesens and diacsens flags.
+     *    regexp or stemming. "none" may still expand case,
+     *    diacritics and synonyms, depending on the casesens, diacsens and 
+     *    synexp flags.
     * @param lang sets the stemming language(s). Can be a space-separated list
     * @param term is the term to expand
     * @param result is the main output
@ -354,14 +355,14 @@ class Db {
     *        in the TermMatchResult header
     */
    enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3, 
-		    ET_DIACSENS=8, ET_CASESENS=16};
+		    ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32};
    int matchTypeTp(int tp) 
    {
 	return tp & 7;
    }
    bool termMatch(int typ_sens, const string &lang, const string &term, 
-		   TermMatchResult& result, int max = -1, 
-		   const string& field = cstr_null);
+		   TermMatchResult& result, int max = -1,
+		   const string& field = "", vector<string> *multiwords = 0);
    bool dbStats(DbStats& stats);
    /** Return min and max years for doc mod times in db */
    bool maxYearSpan(int *minyear, int *maxyear);
--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@ -164,7 +164,8 @@ static const char *tmtptostr(int typ)
 // using the main index terms (filtering, retrieving stats, expansion
 // in some cases).
 bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
-		   TermMatchResult& res, int max,  const string& field)
+		   TermMatchResult& res, int max,  const string& field,
+		   vector<string>* multiwords)
 {
    int matchtyp = matchTypeTp(typ_sens);
    if (!m_ndb || !m_ndb->m_isopen)
@ -256,7 +257,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
 	    synac.synExpand(term, lexp);
 	}

-	if (matchTypeTp(typ_sens) == ET_STEM) {
+	if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
 	    // Need stem expansion. Lowercase the result of accent and case
 	    // expansion for input to stemdb.
 	    for (unsigned int i = 0; i < lexp.size(); i++) {
@ -266,45 +267,60 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
 	    }
 	    sort(lexp.begin(), lexp.end());
 	    lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
-	    StemDb sdb(xrdb);
-	    vector<string> exp1;
-	    for (vector<string>::const_iterator it = lexp.begin(); 
-		 it != lexp.end(); it++) {
-		sdb.stemExpand(lang, *it, exp1);
-	    }
-	    LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));

-	    lexp.clear();
+	    if (matchtyp == ET_STEM) {
+		StemDb sdb(xrdb);
+		vector<string> exp1;
+		for (vector<string>::const_iterator it = lexp.begin(); 
+		     it != lexp.end(); it++) {
+		    sdb.stemExpand(lang, *it, exp1);
+		}
+		exp1.swap(lexp);
+		sort(lexp.begin(), lexp.end());
+		lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
+		LOGDEB(("ExpTerm: stemexp: %s\n", 
+			stringsToString(lexp).c_str()));
+	    }
+
 	    // Expand the result for synonyms. Multi-word synonyms
 	    // (e.g. stakhanovist -> "hard at work") cannot be returned
 	    // as single terms: they are set aside in 'multiwords' (if
 	    // the caller supplied it) so that the caller can add them
 	    // as phrases to the query. Only single-word synonyms are
 	    // added to the expansion list here.
-	    if (m_syngroups.ok()) {
+	    if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
 		LOGDEB(("ExpTerm: got syngroups\n"));
-		for (vector<string>::const_iterator it = exp1.begin(); 
-		     it != exp1.end(); it++) {
+		vector<string> exp1(lexp);
+		for (vector<string>::const_iterator it = lexp.begin(); 
+		     it != lexp.end(); it++) {
 		    vector<string> sg = m_syngroups.getgroup(*it);
 		    if (!sg.empty()) {
 			LOGDEB(("ExpTerm: syns: %s -> %s\n", 
 				it->c_str(), stringsToString(sg).c_str()));
-			lexp.insert(lexp.end(), sg.begin(), sg.end());
+			for (vector<string>::const_iterator it1 = sg.begin();
+			     it1 != sg.end(); it1++) { // each synonym of *it
+			    if (it1->find_first_of(" ") != string::npos) {
+				if (multiwords) // caller builds a phrase
+				    multiwords->push_back(*it1);
+			    } else {
+				exp1.push_back(*it1); // the synonym, not *it
+			    }
+			}
 		    }
 		}
+		lexp.swap(exp1);
 		sort(lexp.begin(), lexp.end());
 		lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
-		// Keep result in exp1 for next step
-		exp1.swap(lexp);
 	    }

 	    // Expand the resulting list for case (all stemdb content
 	    // is lowercase)
-	    lexp.clear();
-	    for (vector<string>::const_iterator it = exp1.begin(); 
-		 it != exp1.end(); it++) {
-		synac.synExpand(*it, lexp);
+	    vector<string> exp1;
+	    for (vector<string>::const_iterator it = lexp.begin(); 
+		 it != lexp.end(); it++) {
+		synac.synExpand(*it, exp1);
 	    }
+	    exp1.swap(lexp);
 	    sort(lexp.begin(), lexp.end());
 	    lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
 	}
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@ -224,9 +224,10 @@ private:

 class SearchDataClause {
 public:
-    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
-		   SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16,
-		   SDCM_NOTERMS=32 // Don't include terms for highlighting
+    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=0x1, SDCM_ANCHORSTART=0x2,
+		   SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10,
+		   SDCM_NOTERMS=0x20, // Don't include terms for highlighting
+		   SDCM_NOSYNS = 0x40, // Don't perform synonym expansion
    };
    enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};

@ -382,7 +383,8 @@ protected:
    bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods, 
 		    const std::string& term, 
 		    std::vector<std::string>& exp, 
-                    std::string& sterm, const std::string& prefix);
+                    std::string& sterm, const std::string& prefix,
+		    std::vector<std::string>* multiwords = 0);
    // After splitting entry on whitespace: process non-phrase element
    void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span, 
 			   int mods, void *pq);
--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@ -379,17 +379,6 @@ private:
 };


-#if 1
-static void listVector(const string& what, const vector<string>&l)
-{
-    string a;
-    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
-        a = a + *it + " ";
-    }
-    LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
-}
-#endif
-
 /** Expand term into term list, using appropriate mode: stem, wildcards, 
 *  diacritics... 
 *
@ -400,12 +389,16 @@ static void listVector(const string& what, const vector<string>&l)
 * @param prefix field prefix in index. We could recompute it, but the caller
 *  has it already. Used in the simple case where there is nothing to expand, 
 *  and we just return the prefixed term (else Db::termMatch deals with it).
+ * @param multiwords it may happen that synonym processing results in multi-word
+ *   expansions which should be processed as phrases.
 */
 bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, 
 					string& ermsg, int mods, 
 					const string& term, 
 					vector<string>& oexp, string &sterm,
-					const string& prefix)
+					const string& prefix,
+					vector<string>* multiwords
+    )
 {
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
 	     mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
@ -436,13 +429,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
 	nostemexp = true;
    }

-    // noexpansion can be modified further down by possible case/diac expansion
-    bool noexpansion = nostemexp && !haswild; 
-
-    int termmatchsens = 0;
-
    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;
+    bool synonyms = (mods & SDCM_NOSYNS) == 0;
+
+    // noexpansion can be modified further down by possible case/diac expansion
+    bool noexpansion = nostemexp && !haswild && !synonyms; 

    if (o_index_stripchars) {
 	diac_sensitive = case_sensitive = false;
@ -480,10 +472,6 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
 	    noexpansion = false;
    }

-    if (case_sensitive)
-	termmatchsens |= Db::ET_CASESENS;
-    if (diac_sensitive)
-	termmatchsens |= Db::ET_DIACSENS;

    if (noexpansion) {
 	oexp.push_back(prefix + term);
@ -493,11 +481,19 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
 	return true;
    } 

+    int termmatchsens = 0;
+    if (case_sensitive)
+	termmatchsens |= Db::ET_CASESENS;
+    if (diac_sensitive)
+	termmatchsens |= Db::ET_DIACSENS;
+    if (synonyms)
+	termmatchsens |= Db::ET_SYNEXP;
+	
    Db::MatchType mtyp = haswild ? Db::ET_WILD : 
 	nostemexp ? Db::ET_NONE : Db::ET_STEM;
    TermMatchResult res;
-    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
-		      m_field)) {
+    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), 
+		      term, res, maxexpand,  m_field, multiwords)) {
 	// Let it go through
    }

@ -560,9 +556,17 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
    }
 }

-void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
-					       const string& span, 
-					       int mods, void * pq)
+static void prefix_vector(vector<string>& v, const string& prefix)
+{   // Prepend 'prefix' to every element of 'v', modifying 'v' in place.
+    for (vector<string>::iterator it = v.begin(); it != v.end(); it++) {
+	*it = prefix + *it;
+    }
+}
+
+void SearchDataClauseSimple::
+processSimpleSpan(Rcl::Db &db, string& ermsg,
+		  const string& span, 
+		  int mods, void * pq)
 {
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
@ -574,11 +578,12 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
 	if (ftp->noterms)
-	    addModifier(SDCM_NOTERMS);
+	    addModifier(SDCM_NOTERMS); // Don't add terms to highlight data
 	prefix = wrap_prefix(ftp->pfx);
    }

-    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
+    vector<string> multiwords;
+    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords))
 	return;
    
    // Set up the highlight data. No prefix should go in there
@ -608,6 +613,23 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
 			   Xapian::Query(prefix+sterm, 
 					 original_term_wqf_booster));
    }
+
+    // Push phrases for the multi-word expansions
+    for (vector<string>::const_iterator mwp = multiwords.begin();
+	 mwp != multiwords.end(); mwp++) {
+	vector<string> phr;
+	// We just do a basic split to keep things a bit simpler here
+	// (no textsplit). This means though that no punctuation is
+	// allowed in multi-word synonyms.
+	stringToTokens(*mwp, phr);
+	if (!prefix.empty())
+	    prefix_vector(phr, prefix);
+	xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
+			   Xapian::Query(Xapian::Query::OP_PHRASE, 
+					 phr.begin(), phr.end()));
+	m_curcl++;
+    }
+
    pqueries.push_back(xq);
 }

@ -660,8 +682,8 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
 	vector<string> exp;
 	if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
 	    return;
-	LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
-	listVector("", exp);
+	LOGDEB0(("ProcessPhraseOrNear: exp size %d, exp: %s\n",
+		 int(exp.size()), stringsToString(exp).c_str())); // %d needs int
 	// groups is used for highlighting, we don't want prefixes in there.
 	vector<string> noprefs;
 	for (vector<string>::const_iterator it = exp.begin(); 
@ -957,8 +979,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
 			*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
 	    return false;
 	}
-	LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
-	listVector("", exp);
+	LOGDEB0(("SDataPath::toNative: exp size %d. Exp: %s\n",
+		 int(exp.size()), stringsToString(exp).c_str())); // %d needs int
 	if (exp.size() == 1)
 	    orqueries.push_back(Xapian::Query(exp[0]));
 	else