diff --git a/src/qtgui/guiutils.cpp b/src/qtgui/guiutils.cpp
index 96b3375c..3b73b544 100644
--- a/src/qtgui/guiutils.cpp
+++ b/src/qtgui/guiutils.cpp
@@ -113,7 +113,9 @@ void rwSettings(bool writing)
Bool, false);
SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false);
SETTING_RW(prefs.ssearchAutoPhrase,
- "/Recoll/prefs/ssearchAutoPhrase", Bool, false);
+ "/Recoll/prefs/ssearchAutoPhrase", Bool, true);
+ SETTING_RW(prefs.ssearchAutoPhraseThreshPC,
+ "/Recoll/prefs/ssearchAutoPhraseThreshPC", Double, 2.0);
SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8);
SETTING_RW(prefs.collapseDuplicates,
"/Recoll/prefs/reslist/collapseDuplicates", Bool, false);
diff --git a/src/qtgui/guiutils.h b/src/qtgui/guiutils.h
index aa573150..4e3d5d56 100644
--- a/src/qtgui/guiutils.h
+++ b/src/qtgui/guiutils.h
@@ -85,6 +85,7 @@ class PrefsPack {
QStringList ssearchHistory;
// Make phrase out of search terms and add to search in simple search
bool ssearchAutoPhrase;
+ double ssearchAutoPhraseThreshPC;
// Ignored file types in adv search (startup default)
QStringList asearchIgnFilTyps;
bool fileTypesByCats;
diff --git a/src/qtgui/ssearch_w.cpp b/src/qtgui/ssearch_w.cpp
index 33ba5c06..fc75b5f4 100644
--- a/src/qtgui/ssearch_w.cpp
+++ b/src/qtgui/ssearch_w.cpp
@@ -132,7 +132,8 @@ void SSearch::startSimpleSearch()
if (tp == SST_LANG) {
string reason;
if (prefs.autoSuffsEnable)
- sdata = wasaStringToRcl(theconfig, u8, reason, (const char *)prefs.autoSuffs.toUtf8());
+ sdata = wasaStringToRcl(theconfig, u8, reason,
+ (const char *)prefs.autoSuffs.toUtf8());
else
sdata = wasaStringToRcl(theconfig, u8, reason);
if (sdata == 0) {
@@ -140,51 +141,40 @@ void SSearch::startSimpleSearch()
QString::fromAscii(reason.c_str()));
return;
}
- if (prefs.ssearchAutoPhrase) {
- sdata->maybeAddAutoPhrase();
- }
} else {
sdata = new Rcl::SearchData(Rcl::SCLT_OR);
if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
return;
}
-
- // If there is no white space inside the query, then the user
- // certainly means it as a phrase.
- bool isreallyaphrase = false;
- if (!TextSplit::hasVisibleWhite(u8))
- isreallyaphrase = true;
-
- // Maybe add automatic phrase ? For ALL and ANY, and not if
- // there is already a phrase or wildcard terms.
- if (!isreallyaphrase &&
- prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) &&
- u8.find_first_of("\"*[]?") == string::npos &&
- TextSplit::countWords(u8) > 1) {
- sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
- u8, 0));
- }
Rcl::SearchDataClause *clp = 0;
- switch (tp) {
- case SST_ANY:
- default:
- clp = isreallyaphrase ?
- new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
- new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
- break;
- case SST_ALL:
- clp = isreallyaphrase ?
- new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
- new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
- break;
- case SST_FNM:
+ if (tp == SST_FNM) {
clp = new Rcl::SearchDataClauseFilename(u8);
- break;
+ } else if (!TextSplit::hasVisibleWhite(u8)) {
+ // If there is no white space inside the query, then the user
+ // certainly means it as a phrase.
+ clp = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0);
+ } else {
+ // ANY or ALL, several words.
+ if (tp == SST_ANY) {
+ clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
+ } else {
+ clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
+ }
}
sdata->addClause(clp);
}
+ if (prefs.ssearchAutoPhrase && rcldb) {
+ string stemLang = (const char *)prefs.queryStemLang.toAscii();
+ if (stemLang == "ALL") {
+ theconfig->getConfParam("indexstemminglanguages", stemLang);
+ }
+ sdata->setStemlang(stemLang);
+ sdata->maybeAddAutoPhrase(*rcldb,
+ prefs.ssearchAutoPhraseThreshPC / 100.0);
+ }
+
// Search terms history
// We want to have the new text at the top and any older identical
diff --git a/src/qtgui/uiprefs.ui b/src/qtgui/uiprefs.ui
index 6809f1ab..071f1e64 100644
--- a/src/qtgui/uiprefs.ui
+++ b/src/qtgui/uiprefs.ui
@@ -6,7 +6,7 @@
0
0
- 527
+ 530
559
@@ -21,6 +21,9 @@
-
+
+ 1
+
User interface
@@ -300,12 +303,12 @@
-
+
+ Lines in PRE text are not folded. Using BR loses indentation.
+
Use <PRE> tags instead of <BR>to display plain text as html.
-
- Lines in PRE text are not folded. Using BR loses indentation.
-
false
@@ -350,6 +353,42 @@ This should give higher precedence to the results where the search terms appear
+ -
+
+
-
+
+
+
+ 1
+ 0
+
+
+
+ Frequency percentage threshold over which we do not use terms inside autophrase.
+Frequent terms are a major performance issue with phrases.
+Skipped terms augment the phrase slack, and reduce the autophrase efficiency.
+The default value is 2 (percent).
+
+
+ Autophrase term frequency threshold percentage
+
+
+ false
+
+
+
+ -
+
+
+ 0.200000000000000
+
+
+ 2.000000000000000
+
+
+
+
+
-
diff --git a/src/qtgui/uiprefs_w.cpp b/src/qtgui/uiprefs_w.cpp
index fcd75a43..16fae8b8 100644
--- a/src/qtgui/uiprefs_w.cpp
+++ b/src/qtgui/uiprefs_w.cpp
@@ -140,6 +140,7 @@ void UIPrefsDialog::setFromPrefs()
stemLangCMB->setCurrentIndex(cur);
autoPhraseCB->setChecked(prefs.ssearchAutoPhrase);
+ autoPThreshSB->setValue(prefs.ssearchAutoPhraseThreshPC);
buildAbsCB->setChecked(prefs.queryBuildAbstract);
replAbsCB->setEnabled(prefs.queryBuildAbstract);
@@ -199,6 +200,7 @@ void UIPrefsDialog::accept()
prefs.queryStemLang = stemLangCMB->currentText();
}
prefs.ssearchAutoPhrase = autoPhraseCB->isChecked();
+ prefs.ssearchAutoPhraseThreshPC = autoPThreshSB->value();
prefs.queryBuildAbstract = buildAbsCB->isChecked();
prefs.queryReplaceAbstract = buildAbsCB->isChecked() &&
replAbsCB->isChecked();
diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
index be834213..77d60748 100644
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -168,6 +168,8 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector& tps)
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
+ LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
+ m_stemlang.c_str()));
Xapian::Query xq;
m_reason.erase();
@@ -309,8 +311,11 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
return true;
}
-
-bool SearchData::maybeAddAutoPhrase()
+// This is called by the GUI simple search if the option is set: add
+// an (OR'ed) phrase to the query (if it is simple enough) so that results
+// where the search terms are close and in order will come up on top.
+// Very common terms are left out of the phrase to avoid performance issues.
+bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
{
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
if (!m_query.size()) {
@@ -319,13 +324,13 @@ bool SearchData::maybeAddAutoPhrase()
}
string field;
- string words;
+ list words;
// Walk the clause list. If we find any non simple clause or different
// field names, bail out.
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
SClType tp = (*it)->m_tp;
if (tp != SCLT_AND && tp != SCLT_OR) {
- LOGDEB2(("SearchData::maybeAddAutoPhrase: complex clause\n"));
+ LOGDEB2(("SearchData::maybeAddAutoPhrase: rejected clause\n"));
return false;
}
SearchDataClauseSimple *clp =
@@ -338,25 +343,57 @@ bool SearchData::maybeAddAutoPhrase()
field = clp->getfield();
} else {
if (clp->getfield().compare(field)) {
- LOGDEB2(("SearchData::maybeAddAutoPhrase: different fields\n"));
+ LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
return false;
}
}
- if (!words.empty())
- words += " ";
- words += clp->gettext();
+
+ // If there are wildcards or quotes in there, bail out
+ if (clp->gettext().find_first_of("\"*[?") != string::npos) {
+ LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
+ return false;
+ }
+ // Do a simple word-split here, don't bother with the full-blown
+ // textsplit. The autophrase thing is just "best effort", it's
+ // normal that it won't work in strange cases.
+ vector wl;
+ stringToStrings(clp->gettext(), wl);
+ words.insert(words.end(), wl.begin(), wl.end());
}
- // If there are wildcards or quotes in there, or this is a single word,
- // bail out
- if (words.find_first_of("\"*[?") != string::npos &&
- TextSplit::countWords(words) <= 1) {
- LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards or single word\n"));
+
+ // Trim the word list by eliminating very frequent terms
+ // (increasing the slack as we do it):
+ int slack = 0;
+ int doccnt = db.docCnt();
+ if (!doccnt)
+ doccnt = 1;
+ string swords;
+ for (list::iterator it = words.begin();
+ it != words.end(); it++) {
+ double freq = double(db.termDocCnt(*it)) / doccnt;
+ if (freq < freqThreshold) {
+ if (!swords.empty())
+ swords.append(1, ' ');
+ swords += *it;
+ } else {
+ LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n",
+ it->c_str(), 100 * freq));
+ slack++;
+ }
+ }
+
+ // We can't make a phrase with a single word :)
+ if (TextSplit::countWords(swords) <= 1) {
+ LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
return false;
}
-
+
SearchDataClauseDist *nclp =
- new SearchDataClauseDist(SCLT_PHRASE, words, 0, field);
+ new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field);
+
+ // If the toplevel conjunction is an OR, just OR the phrase, else
+ // deepen the tree.
if (m_tp == SCLT_OR) {
addClause(nclp);
} else {
@@ -365,6 +402,7 @@ bool SearchData::maybeAddAutoPhrase()
// phrase.
SearchData *sd = new SearchData(m_tp);
sd->m_query = m_query;
+ sd->m_stemlang = m_stemlang;
m_tp = SCLT_OR;
m_query.clear();
SearchDataClauseSub *oq =
@@ -556,8 +594,8 @@ void StringToXapianQ::expandTerm(bool nostemexp,
list& exp,
string &sterm, const string& prefix)
{
- LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
- m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
+ LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
+ m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
exp.clear();
if (term.empty()) {
@@ -567,8 +605,10 @@ void StringToXapianQ::expandTerm(bool nostemexp,
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// No stemming if there are wildcards or prevented globally.
- if (haswild || m_stemlang.empty())
+ if (haswild || m_stemlang.empty()) {
+ LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true;
+ }
if (nostemexp && !haswild) {
sterm = term;
@@ -631,6 +671,8 @@ void multiply_groups(vector >::const_iterator vvit,
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
list &pqueries)
{
+ LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
+ span.c_str(), int(nostemexp)));
list exp;
string sterm; // dumb version of user term
@@ -866,6 +908,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
{
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang;
+ LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
+ stemlang.c_str()));
m_terms.clear();
m_groups.clear();
diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h
index 96ac04df..cf390d1c 100644
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@@ -98,11 +98,13 @@ public:
bool addClause(SearchDataClause *cl);
/** If this is a simple query (one field only, no distance clauses),
- add phrase made of query terms to query, so that docs containing the
- user terms in order will have higher relevance. This must be called
- before toNativeQuery().
- */
- bool maybeAddAutoPhrase();
+ * add phrase made of query terms to query, so that docs containing the
+ * user terms in order will have higher relevance. This must be called
+ * before toNativeQuery().
+ * @param db database, queried for the document and term counts.
+ * @param threshold skip terms found in at least this fraction of the docs.
+ */
+ bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);
/** Set/get top subdirectory for filtering results */
void setTopdir(const string& t, bool excl = false, float w = 1.0)