Add a frequency threshold to avoid adding common terms to the automatic phrase search extension. Enable autophrase by default for simple search, with a default frequency threshold of 2%.

2011-10-04 09:03:43 +02:00 · 2011-10-04 09:03:43 +02:00 · bb2685c2f5
commit bb2685c2f5
parent 4ced9bee49
7 changed files with 142 additions and 62 deletions
--- a/src/qtgui/guiutils.cpp
+++ b/src/qtgui/guiutils.cpp
@ -113,7 +113,9 @@ void rwSettings(bool writing)
 	       Bool, false);
    SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false);
    SETTING_RW(prefs.ssearchAutoPhrase, 
-	       "/Recoll/prefs/ssearchAutoPhrase", Bool, false);
+	       "/Recoll/prefs/ssearchAutoPhrase", Bool, true);
+    SETTING_RW(prefs.ssearchAutoPhraseThreshPC, 
+	       "/Recoll/prefs/ssearchAutoPhraseThreshPC", Double, 2.0);
    SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8);
    SETTING_RW(prefs.collapseDuplicates, 
 	       "/Recoll/prefs/reslist/collapseDuplicates", Bool, false);
--- a/src/qtgui/guiutils.h
+++ b/src/qtgui/guiutils.h
@ -85,6 +85,7 @@ class PrefsPack {
    QStringList ssearchHistory;
    // Make phrase out of search terms and add to search in simple search
    bool ssearchAutoPhrase;
+    double ssearchAutoPhraseThreshPC;
    // Ignored file types in adv search (startup default)
    QStringList asearchIgnFilTyps;
    bool        fileTypesByCats;
--- a/src/qtgui/ssearch_w.cpp
+++ b/src/qtgui/ssearch_w.cpp
@ -132,7 +132,8 @@ void SSearch::startSimpleSearch()
    if (tp == SST_LANG) {
 	string reason;
        if (prefs.autoSuffsEnable)
-            sdata = wasaStringToRcl(theconfig, u8, reason, (const char *)prefs.autoSuffs.toUtf8());
+            sdata = wasaStringToRcl(theconfig, u8, reason, 
+				    (const char *)prefs.autoSuffs.toUtf8());
        else
            sdata = wasaStringToRcl(theconfig, u8, reason);
 	if (sdata == 0) {
@ -140,51 +141,40 @@ void SSearch::startSimpleSearch()
 				 QString::fromAscii(reason.c_str()));
 	    return;
 	}
-	if (prefs.ssearchAutoPhrase) {
-	    sdata->maybeAddAutoPhrase();
-	}
    } else {
 	sdata = new Rcl::SearchData(Rcl::SCLT_OR);
 	if (sdata == 0) {
 	    QMessageBox::warning(0, "Recoll", tr("Out of memory"));
 	    return;
 	}
-
-	// If there is no white space inside the query, then the user
-	// certainly means it as a phrase.
-	bool isreallyaphrase = false;
-	if (!TextSplit::hasVisibleWhite(u8))
-	    isreallyaphrase = true;
-
-	// Maybe add automatic phrase ? For ALL and ANY, and not if
-	// there is already a phrase or wildcard terms.
-	if (!isreallyaphrase && 
-	    prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) &&
-	    u8.find_first_of("\"*[]?") == string::npos && 
-	    TextSplit::countWords(u8) > 1) {
-	    sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, 
-							   u8, 0));
-	}
 	Rcl::SearchDataClause *clp = 0;
-	switch (tp) {
-	case SST_ANY:
-	default:
-	    clp = isreallyaphrase ? 
-		new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
-		new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
-	    break;
-	case SST_ALL:
-	    clp = isreallyaphrase ? 
-		new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
-		new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
-	    break;
-	case SST_FNM:
+	if (tp == SST_FNM) {
 	    clp = new Rcl::SearchDataClauseFilename(u8);
-	    break;
+	} else if (!TextSplit::hasVisibleWhite(u8)) {
+	    // If there is no white space inside the query, then the user
+	    // certainly means it as a phrase.
+	    clp = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0);
+	} else {
+	    // ANY or ALL, several words.
+	    if (tp == SST_ANY) {
+		clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
+	    } else {
+		clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
+	    }
 	}
 	sdata->addClause(clp);
    }

+    if (prefs.ssearchAutoPhrase && rcldb) {
+	string stemLang = (const char *)prefs.queryStemLang.toAscii();
+	if (stemLang == "ALL") {
+	    theconfig->getConfParam("indexstemminglanguages", stemLang);
+	}
+	sdata->setStemlang(stemLang);
+	sdata->maybeAddAutoPhrase(*rcldb, 
+				  prefs.ssearchAutoPhraseThreshPC / 100.0);
+    }
+
    // Search terms history

    // We want to have the new text at the top and any older identical
--- a/src/qtgui/uiprefs.ui
+++ b/src/qtgui/uiprefs.ui
@ -6,7 +6,7 @@
   <rect>
    <x>0</x>
    <y>0</y>
-    <width>527</width>
+    <width>530</width>
    <height>559</height>
   </rect>
  </property>
@ -21,6 +21,9 @@
    <layout class="QVBoxLayout">
     <item>
      <widget class="QTabWidget" name="tabWidget">
+       <property name="currentIndex">
+        <number>1</number>
+       </property>
       <widget class="QWidget" name="tab">
        <attribute name="title">
         <string>User interface</string>
@ -300,12 +303,12 @@
           </item>
           <item>
            <widget class="QCheckBox" name="previewPlainPreCB">
+             <property name="toolTip">
+              <string>Lines in PRE text are not folded. Using BR loses indentation.</string>
+             </property>
             <property name="text">
              <string>Use &lt;PRE&gt; tags instead of &lt;BR&gt;to display plain text as html.</string>
             </property>
-               <property name="toolTip">
-                <string>Lines in PRE text are not folded. Using BR loses indentation.</string>
-               </property>
             <property name="checked">
              <bool>false</bool>
             </property>
@ -350,6 +353,42 @@ This should give higher precedence to the results where the search terms appear
             </property>
            </widget>
           </item>
+           <item>
+            <layout class="QHBoxLayout">
+             <item>
+              <widget class="QLabel" name="textLabel33">
+               <property name="sizePolicy">
+                <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+                 <horstretch>1</horstretch>
+                 <verstretch>0</verstretch>
+                </sizepolicy>
+               </property>
+               <property name="toolTip">
+                <string>Frequency percentage threshold over which we do not use terms inside autophrase. 
+Frequent terms are a major performance issue with phrases. 
+Skipped terms augment the phrase slack, and reduce the autophrase efficiency.
+The default value is 2 (percent). </string>
+               </property>
+               <property name="text">
+                <string>Autophrase term frequency threshold percentage</string>
+               </property>
+               <property name="wordWrap">
+                <bool>false</bool>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QDoubleSpinBox" name="autoPThreshSB">
+               <property name="singleStep">
+                <double>0.200000000000000</double>
+               </property>
+               <property name="value">
+                <double>2.000000000000000</double>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
           <item>
            <widget class="Line" name="line2">
             <property name="frameShape">
--- a/src/qtgui/uiprefs_w.cpp
+++ b/src/qtgui/uiprefs_w.cpp
@ -140,6 +140,7 @@ void UIPrefsDialog::setFromPrefs()
    stemLangCMB->setCurrentIndex(cur);

    autoPhraseCB->setChecked(prefs.ssearchAutoPhrase);
+    autoPThreshSB->setValue(prefs.ssearchAutoPhraseThreshPC);

    buildAbsCB->setChecked(prefs.queryBuildAbstract);
    replAbsCB->setEnabled(prefs.queryBuildAbstract);
@ -199,6 +200,7 @@ void UIPrefsDialog::accept()
 	prefs.queryStemLang = stemLangCMB->currentText();
    }
    prefs.ssearchAutoPhrase = autoPhraseCB->isChecked();
+    prefs.ssearchAutoPhraseThreshPC = autoPThreshSB->value();
    prefs.queryBuildAbstract = buildAbsCB->isChecked();
    prefs.queryReplaceAbstract = buildAbsCB->isChecked() && 
 	replAbsCB->isChecked();
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -168,6 +168,8 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)

 bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
 {
+    LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n", 
+	    m_stemlang.c_str()));
    Xapian::Query xq;
    m_reason.erase();

@ -309,8 +311,11 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
    return true;
 }

-
-bool SearchData::maybeAddAutoPhrase()
+// This is called by the GUI simple search if the option is set: add
+// (OR) phrase to a query (if it is simple enough) so that results
+// where the search terms are close and in order will come up on top.
+// We remove very common terms from the query to avoid performance issues.
+bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
 {
    LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
    if (!m_query.size()) {
@ -319,13 +324,13 @@ bool SearchData::maybeAddAutoPhrase()
    }

    string field;
-    string words;
+    list<string> words;
    // Walk the clause list. If we find any non simple clause or different
    // field names, bail out.
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
 	SClType tp = (*it)->m_tp;
 	if (tp != SCLT_AND && tp != SCLT_OR) {
-	    LOGDEB2(("SearchData::maybeAddAutoPhrase: complex clause\n"));
+	    LOGDEB2(("SearchData::maybeAddAutoPhrase: rejected clause\n"));
 	    return false;
 	}
 	SearchDataClauseSimple *clp = 
@ -338,25 +343,57 @@ bool SearchData::maybeAddAutoPhrase()
 	    field = clp->getfield();
 	} else {
 	    if (clp->getfield().compare(field)) {
-		LOGDEB2(("SearchData::maybeAddAutoPhrase: different fields\n"));
+		LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
 		return false;
 	    }
 	}
-	if (!words.empty())
-	    words += " ";
-	words +=  clp->gettext();
+
+	// If there are wildcards or quotes in there, bail out
+	if (clp->gettext().find_first_of("\"*[?") != string::npos) { 
+	    LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
+	    return false;
+	}
+        // Do a simple word-split here, don't bother with the full-blown
+	// textsplit. The autophrase thing is just "best effort", it's
+	// normal that it won't work in strange cases.
+	vector<string> wl;
+	stringToStrings(clp->gettext(), wl);
+	words.insert(words.end(), wl.begin(), wl.end());
    }

-    // If there are wildcards or quotes in there, or this is a single word,
-    // bail out
-    if (words.find_first_of("\"*[?") != string::npos &&
-	TextSplit::countWords(words) <= 1) { 
-	LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards or single word\n"));
+
+    // Trim the word list by eliminating very frequent terms
+    // (increasing the slack as we do it):
+    int slack = 0;
+    int doccnt = db.docCnt();
+    if (!doccnt)
+	doccnt = 1;
+    string swords;
+    for (list<string>::iterator it = words.begin(); 
+	 it != words.end(); it++) {
+	double freq = double(db.termDocCnt(*it)) / doccnt;
+	if (freq < freqThreshold) {
+	    if (!swords.empty())
+		swords.append(1, ' ');
+	    swords += *it;
+	} else {
+	    LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n", 
+		    it->c_str(), 100 * freq));
+	    slack++;
+	}
+    }
+    
+    // We can't make a phrase with a single word :)
+    if (TextSplit::countWords(swords) <= 1) {
+	LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
 	return false;
    }
-
+	
    SearchDataClauseDist *nclp = 
-	new SearchDataClauseDist(SCLT_PHRASE, words, 0, field);
+	new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field);
+
+    // If the toplevel conjunction is an OR, just OR the phrase, else 
+    // deepen the tree.
    if (m_tp == SCLT_OR) {
 	addClause(nclp);
    } else {
@ -365,6 +402,7 @@ bool SearchData::maybeAddAutoPhrase()
 	// phrase.
 	SearchData *sd = new SearchData(m_tp);
 	sd->m_query = m_query;
+	sd->m_stemlang = m_stemlang;
 	m_tp = SCLT_OR;
 	m_query.clear();
 	SearchDataClauseSub *oq = 
@ -556,8 +594,8 @@ void StringToXapianQ::expandTerm(bool nostemexp,
                                 list<string>& exp,
                                 string &sterm, const string& prefix)
 {
-    LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", 
-             m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
+    LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
+	     m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
    sterm.erase();
    exp.clear();
    if (term.empty()) {
@ -567,8 +605,10 @@ void StringToXapianQ::expandTerm(bool nostemexp,
    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // No stemming if there are wildcards or prevented globally.
-    if (haswild || m_stemlang.empty())
+    if (haswild || m_stemlang.empty()) {
+	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
 	nostemexp = true;
+    }

    if (nostemexp && !haswild) {
 	sterm = term;
@ -631,6 +671,8 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
 void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 					list<Xapian::Query> &pqueries)
 {
+    LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
+	     span.c_str(), int(nostemexp)));
    list<string> exp;  
    string sterm; // dumb version of user term

@ -866,6 +908,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
 {
    const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
 	stemlang;
+    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
+	     stemlang.c_str()));

    m_terms.clear();
    m_groups.clear();
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@ -98,11 +98,13 @@ public:
    bool addClause(SearchDataClause *cl);

    /** If this is a simple query (one field only, no distance clauses),
-	add phrase made of query terms to query, so that docs containing the
-	user terms in order will have higher relevance. This must be called 
-	before toNativeQuery().
-    */
-    bool maybeAddAutoPhrase();
+     * add phrase made of query terms to query, so that docs containing the
+     * user terms in order will have higher relevance. This must be called 
+     * before toNativeQuery().
+     * @param threshold: don't use terms more frequent than the value 
+     *     (proportion of docs where they occur) 	
+     */
+    bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);

    /** Set/get top subdirectory for filtering results */
    void setTopdir(const string& t, bool excl = false, float w = 1.0)