Add a frequency threshold to avoid adding common terms to the automatic phrase search extension. Use autophrase by default with simple search, with a default frequency threshold of 2%.

This commit is contained in:
Jean-Francois Dockes 2011-10-04 09:03:43 +02:00
parent 4ced9bee49
commit bb2685c2f5
7 changed files with 142 additions and 62 deletions

View File

@ -113,7 +113,9 @@ void rwSettings(bool writing)
Bool, false); Bool, false);
SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false); SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false);
SETTING_RW(prefs.ssearchAutoPhrase, SETTING_RW(prefs.ssearchAutoPhrase,
"/Recoll/prefs/ssearchAutoPhrase", Bool, false); "/Recoll/prefs/ssearchAutoPhrase", Bool, true);
SETTING_RW(prefs.ssearchAutoPhraseThreshPC,
"/Recoll/prefs/ssearchAutoPhraseThreshPC", Double, 2.0);
SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8); SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8);
SETTING_RW(prefs.collapseDuplicates, SETTING_RW(prefs.collapseDuplicates,
"/Recoll/prefs/reslist/collapseDuplicates", Bool, false); "/Recoll/prefs/reslist/collapseDuplicates", Bool, false);

View File

@ -85,6 +85,7 @@ class PrefsPack {
QStringList ssearchHistory; QStringList ssearchHistory;
// Make phrase out of search terms and add to search in simple search // Make phrase out of search terms and add to search in simple search
bool ssearchAutoPhrase; bool ssearchAutoPhrase;
double ssearchAutoPhraseThreshPC;
// Ignored file types in adv search (startup default) // Ignored file types in adv search (startup default)
QStringList asearchIgnFilTyps; QStringList asearchIgnFilTyps;
bool fileTypesByCats; bool fileTypesByCats;

View File

@ -132,7 +132,8 @@ void SSearch::startSimpleSearch()
if (tp == SST_LANG) { if (tp == SST_LANG) {
string reason; string reason;
if (prefs.autoSuffsEnable) if (prefs.autoSuffsEnable)
sdata = wasaStringToRcl(theconfig, u8, reason, (const char *)prefs.autoSuffs.toUtf8()); sdata = wasaStringToRcl(theconfig, u8, reason,
(const char *)prefs.autoSuffs.toUtf8());
else else
sdata = wasaStringToRcl(theconfig, u8, reason); sdata = wasaStringToRcl(theconfig, u8, reason);
if (sdata == 0) { if (sdata == 0) {
@ -140,51 +141,40 @@ void SSearch::startSimpleSearch()
QString::fromAscii(reason.c_str())); QString::fromAscii(reason.c_str()));
return; return;
} }
if (prefs.ssearchAutoPhrase) {
sdata->maybeAddAutoPhrase();
}
} else { } else {
sdata = new Rcl::SearchData(Rcl::SCLT_OR); sdata = new Rcl::SearchData(Rcl::SCLT_OR);
if (sdata == 0) { if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Out of memory")); QMessageBox::warning(0, "Recoll", tr("Out of memory"));
return; return;
} }
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
bool isreallyaphrase = false;
if (!TextSplit::hasVisibleWhite(u8))
isreallyaphrase = true;
// Maybe add automatic phrase ? For ALL and ANY, and not if
// there is already a phrase or wildcard terms.
if (!isreallyaphrase &&
prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) &&
u8.find_first_of("\"*[]?") == string::npos &&
TextSplit::countWords(u8) > 1) {
sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
u8, 0));
}
Rcl::SearchDataClause *clp = 0; Rcl::SearchDataClause *clp = 0;
switch (tp) { if (tp == SST_FNM) {
case SST_ANY:
default:
clp = isreallyaphrase ?
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
break;
case SST_ALL:
clp = isreallyaphrase ?
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
break;
case SST_FNM:
clp = new Rcl::SearchDataClauseFilename(u8); clp = new Rcl::SearchDataClauseFilename(u8);
break; } else if (!TextSplit::hasVisibleWhite(u8)) {
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
clp = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0);
} else {
// ANY or ALL, several words.
if (tp == SST_ANY) {
clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
} else {
clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
}
} }
sdata->addClause(clp); sdata->addClause(clp);
} }
if (prefs.ssearchAutoPhrase && rcldb) {
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
sdata->setStemlang(stemLang);
sdata->maybeAddAutoPhrase(*rcldb,
prefs.ssearchAutoPhraseThreshPC / 100.0);
}
// Search terms history // Search terms history
// We want to have the new text at the top and any older identical // We want to have the new text at the top and any older identical

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>527</width> <width>530</width>
<height>559</height> <height>559</height>
</rect> </rect>
</property> </property>
@ -21,6 +21,9 @@
<layout class="QVBoxLayout"> <layout class="QVBoxLayout">
<item> <item>
<widget class="QTabWidget" name="tabWidget"> <widget class="QTabWidget" name="tabWidget">
<property name="currentIndex">
<number>1</number>
</property>
<widget class="QWidget" name="tab"> <widget class="QWidget" name="tab">
<attribute name="title"> <attribute name="title">
<string>User interface</string> <string>User interface</string>
@ -300,12 +303,12 @@
</item> </item>
<item> <item>
<widget class="QCheckBox" name="previewPlainPreCB"> <widget class="QCheckBox" name="previewPlainPreCB">
<property name="toolTip">
<string>Lines in PRE text are not folded. Using BR loses indentation.</string>
</property>
<property name="text"> <property name="text">
<string>Use &lt;PRE&gt; tags instead of &lt;BR&gt;to display plain text as html.</string> <string>Use &lt;PRE&gt; tags instead of &lt;BR&gt;to display plain text as html.</string>
</property> </property>
<property name="toolTip">
<string>Lines in PRE text are not folded. Using BR loses indentation.</string>
</property>
<property name="checked"> <property name="checked">
<bool>false</bool> <bool>false</bool>
</property> </property>
@ -350,6 +353,42 @@ This should give higher precedence to the results where the search terms appear
</property> </property>
</widget> </widget>
</item> </item>
<item>
<layout class="QHBoxLayout">
<item>
<widget class="QLabel" name="textLabel33">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Preferred">
<horstretch>1</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="toolTip">
<string>Frequency percentage threshold over which we do not use terms inside autophrase.
Frequent terms are a major performance issue with phrases.
Skipped terms augment the phrase slack, and reduce the autophrase efficiency.
The default value is 2 (percent). </string>
</property>
<property name="text">
<string>Autophrase term frequency threshold percentage</string>
</property>
<property name="wordWrap">
<bool>false</bool>
</property>
</widget>
</item>
<item>
<widget class="QDoubleSpinBox" name="autoPThreshSB">
<property name="singleStep">
<double>0.200000000000000</double>
</property>
<property name="value">
<double>2.000000000000000</double>
</property>
</widget>
</item>
</layout>
</item>
<item> <item>
<widget class="Line" name="line2"> <widget class="Line" name="line2">
<property name="frameShape"> <property name="frameShape">

View File

@ -140,6 +140,7 @@ void UIPrefsDialog::setFromPrefs()
stemLangCMB->setCurrentIndex(cur); stemLangCMB->setCurrentIndex(cur);
autoPhraseCB->setChecked(prefs.ssearchAutoPhrase); autoPhraseCB->setChecked(prefs.ssearchAutoPhrase);
autoPThreshSB->setValue(prefs.ssearchAutoPhraseThreshPC);
buildAbsCB->setChecked(prefs.queryBuildAbstract); buildAbsCB->setChecked(prefs.queryBuildAbstract);
replAbsCB->setEnabled(prefs.queryBuildAbstract); replAbsCB->setEnabled(prefs.queryBuildAbstract);
@ -199,6 +200,7 @@ void UIPrefsDialog::accept()
prefs.queryStemLang = stemLangCMB->currentText(); prefs.queryStemLang = stemLangCMB->currentText();
} }
prefs.ssearchAutoPhrase = autoPhraseCB->isChecked(); prefs.ssearchAutoPhrase = autoPhraseCB->isChecked();
prefs.ssearchAutoPhraseThreshPC = autoPThreshSB->value();
prefs.queryBuildAbstract = buildAbsCB->isChecked(); prefs.queryBuildAbstract = buildAbsCB->isChecked();
prefs.queryReplaceAbstract = buildAbsCB->isChecked() && prefs.queryReplaceAbstract = buildAbsCB->isChecked() &&
replAbsCB->isChecked(); replAbsCB->isChecked();

View File

@ -168,6 +168,8 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
bool SearchData::toNativeQuery(Rcl::Db &db, void *d) bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{ {
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
m_stemlang.c_str()));
Xapian::Query xq; Xapian::Query xq;
m_reason.erase(); m_reason.erase();
@ -309,8 +311,11 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
return true; return true;
} }
// This is called by the GUI simple search if the option is set: add
bool SearchData::maybeAddAutoPhrase() // (OR) phrase to a query (if it is simple enough) so that results
// where the search terms are close and in order will come up on top.
// We remove very common terms from the query to avoid performance issues.
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
{ {
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n")); LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
if (!m_query.size()) { if (!m_query.size()) {
@ -319,13 +324,13 @@ bool SearchData::maybeAddAutoPhrase()
} }
string field; string field;
string words; list<string> words;
// Walk the clause list. If we find any non simple clause or different // Walk the clause list. If we find any non simple clause or different
// field names, bail out. // field names, bail out.
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) { for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
SClType tp = (*it)->m_tp; SClType tp = (*it)->m_tp;
if (tp != SCLT_AND && tp != SCLT_OR) { if (tp != SCLT_AND && tp != SCLT_OR) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: complex clause\n")); LOGDEB2(("SearchData::maybeAddAutoPhrase: rejected clause\n"));
return false; return false;
} }
SearchDataClauseSimple *clp = SearchDataClauseSimple *clp =
@ -338,25 +343,57 @@ bool SearchData::maybeAddAutoPhrase()
field = clp->getfield(); field = clp->getfield();
} else { } else {
if (clp->getfield().compare(field)) { if (clp->getfield().compare(field)) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: different fields\n")); LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
return false; return false;
} }
} }
if (!words.empty())
words += " "; // If there are wildcards or quotes in there, bail out
words += clp->gettext(); if (clp->gettext().find_first_of("\"*[?") != string::npos) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
return false;
}
// Do a simple word-split here, don't bother with the full-blown
// textsplit. The autophrase thing is just "best effort", it's
// normal that it won't work in strange cases.
vector<string> wl;
stringToStrings(clp->gettext(), wl);
words.insert(words.end(), wl.begin(), wl.end());
} }
// If there are wildcards or quotes in there, or this is a single word,
// bail out // Trim the word list by eliminating very frequent terms
if (words.find_first_of("\"*[?") != string::npos && // (increasing the slack as we do it):
TextSplit::countWords(words) <= 1) { int slack = 0;
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards or single word\n")); int doccnt = db.docCnt();
if (!doccnt)
doccnt = 1;
string swords;
for (list<string>::iterator it = words.begin();
it != words.end(); it++) {
double freq = double(db.termDocCnt(*it)) / doccnt;
if (freq < freqThreshold) {
if (!swords.empty())
swords.append(1, ' ');
swords += *it;
} else {
LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n",
it->c_str(), 100 * freq));
slack++;
}
}
// We can't make a phrase with a single word :)
if (TextSplit::countWords(swords) <= 1) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
return false; return false;
} }
SearchDataClauseDist *nclp = SearchDataClauseDist *nclp =
new SearchDataClauseDist(SCLT_PHRASE, words, 0, field); new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field);
// If the toplevel conjunction is an OR, just OR the phrase, else
// deepen the tree.
if (m_tp == SCLT_OR) { if (m_tp == SCLT_OR) {
addClause(nclp); addClause(nclp);
} else { } else {
@ -365,6 +402,7 @@ bool SearchData::maybeAddAutoPhrase()
// phrase. // phrase.
SearchData *sd = new SearchData(m_tp); SearchData *sd = new SearchData(m_tp);
sd->m_query = m_query; sd->m_query = m_query;
sd->m_stemlang = m_stemlang;
m_tp = SCLT_OR; m_tp = SCLT_OR;
m_query.clear(); m_query.clear();
SearchDataClauseSub *oq = SearchDataClauseSub *oq =
@ -556,8 +594,8 @@ void StringToXapianQ::expandTerm(bool nostemexp,
list<string>& exp, list<string>& exp,
string &sterm, const string& prefix) string &sterm, const string& prefix)
{ {
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp)); m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase(); sterm.erase();
exp.clear(); exp.clear();
if (term.empty()) { if (term.empty()) {
@ -567,8 +605,10 @@ void StringToXapianQ::expandTerm(bool nostemexp,
bool haswild = term.find_first_of(cstr_minwilds) != string::npos; bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// No stemming if there are wildcards or prevented globally. // No stemming if there are wildcards or prevented globally.
if (haswild || m_stemlang.empty()) if (haswild || m_stemlang.empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true; nostemexp = true;
}
if (nostemexp && !haswild) { if (nostemexp && !haswild) {
sterm = term; sterm = term;
@ -631,6 +671,8 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
list<Xapian::Query> &pqueries) list<Xapian::Query> &pqueries)
{ {
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
span.c_str(), int(nostemexp)));
list<string> exp; list<string> exp;
string sterm; // dumb version of user term string sterm; // dumb version of user term
@ -866,6 +908,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
{ {
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null: const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang; stemlang;
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
stemlang.c_str()));
m_terms.clear(); m_terms.clear();
m_groups.clear(); m_groups.clear();

View File

@ -98,11 +98,13 @@ public:
bool addClause(SearchDataClause *cl); bool addClause(SearchDataClause *cl);
/** If this is a simple query (one field only, no distance clauses), /** If this is a simple query (one field only, no distance clauses),
add phrase made of query terms to query, so that docs containing the * add phrase made of query terms to query, so that docs containing the
user terms in order will have higher relevance. This must be called * user terms in order will have higher relevance. This must be called
before toNativeQuery(). * before toNativeQuery().
*/ * @param threshold: don't use terms more frequent than the value
bool maybeAddAutoPhrase(); * (proportion of docs where they occur)
*/
bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);
/** Set/get top subdirectory for filtering results */ /** Set/get top subdirectory for filtering results */
void setTopdir(const string& t, bool excl = false, float w = 1.0) void setTopdir(const string& t, bool excl = false, float w = 1.0)