Add a frequency threshold to avoid adding common terms to the automatic phrase search extension. Use autophrase by default with simple search, with a default frequency threshold of 2%.

This commit is contained in:
Jean-Francois Dockes 2011-10-04 09:03:43 +02:00
parent 4ced9bee49
commit bb2685c2f5
7 changed files with 142 additions and 62 deletions

View File

@ -113,7 +113,9 @@ void rwSettings(bool writing)
Bool, false);
SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false);
SETTING_RW(prefs.ssearchAutoPhrase,
"/Recoll/prefs/ssearchAutoPhrase", Bool, false);
"/Recoll/prefs/ssearchAutoPhrase", Bool, true);
SETTING_RW(prefs.ssearchAutoPhraseThreshPC,
"/Recoll/prefs/ssearchAutoPhraseThreshPC", Double, 2.0);
SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8);
SETTING_RW(prefs.collapseDuplicates,
"/Recoll/prefs/reslist/collapseDuplicates", Bool, false);

View File

@ -85,6 +85,7 @@ class PrefsPack {
QStringList ssearchHistory;
// Make phrase out of search terms and add to search in simple search
bool ssearchAutoPhrase;
double ssearchAutoPhraseThreshPC;
// Ignored file types in adv search (startup default)
QStringList asearchIgnFilTyps;
bool fileTypesByCats;

View File

@ -132,7 +132,8 @@ void SSearch::startSimpleSearch()
if (tp == SST_LANG) {
string reason;
if (prefs.autoSuffsEnable)
sdata = wasaStringToRcl(theconfig, u8, reason, (const char *)prefs.autoSuffs.toUtf8());
sdata = wasaStringToRcl(theconfig, u8, reason,
(const char *)prefs.autoSuffs.toUtf8());
else
sdata = wasaStringToRcl(theconfig, u8, reason);
if (sdata == 0) {
@ -140,51 +141,40 @@ void SSearch::startSimpleSearch()
QString::fromAscii(reason.c_str()));
return;
}
if (prefs.ssearchAutoPhrase) {
sdata->maybeAddAutoPhrase();
}
} else {
sdata = new Rcl::SearchData(Rcl::SCLT_OR);
if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
return;
}
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
bool isreallyaphrase = false;
if (!TextSplit::hasVisibleWhite(u8))
isreallyaphrase = true;
// Maybe add automatic phrase ? For ALL and ANY, and not if
// there is already a phrase or wildcard terms.
if (!isreallyaphrase &&
prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) &&
u8.find_first_of("\"*[]?") == string::npos &&
TextSplit::countWords(u8) > 1) {
sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
u8, 0));
}
Rcl::SearchDataClause *clp = 0;
switch (tp) {
case SST_ANY:
default:
clp = isreallyaphrase ?
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
break;
case SST_ALL:
clp = isreallyaphrase ?
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
break;
case SST_FNM:
if (tp == SST_FNM) {
clp = new Rcl::SearchDataClauseFilename(u8);
break;
} else if (!TextSplit::hasVisibleWhite(u8)) {
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
clp = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0);
} else {
// ANY or ALL, several words.
if (tp == SST_ANY) {
clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
} else {
clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
}
}
sdata->addClause(clp);
}
if (prefs.ssearchAutoPhrase && rcldb) {
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
sdata->setStemlang(stemLang);
sdata->maybeAddAutoPhrase(*rcldb,
prefs.ssearchAutoPhraseThreshPC / 100.0);
}
// Search terms history
// We want to have the new text at the top and any older identical

View File

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>527</width>
<width>530</width>
<height>559</height>
</rect>
</property>
@ -21,6 +21,9 @@
<layout class="QVBoxLayout">
<item>
<widget class="QTabWidget" name="tabWidget">
<property name="currentIndex">
<number>1</number>
</property>
<widget class="QWidget" name="tab">
<attribute name="title">
<string>User interface</string>
@ -300,12 +303,12 @@
</item>
<item>
<widget class="QCheckBox" name="previewPlainPreCB">
<property name="toolTip">
<string>Lines in PRE text are not folded. Using BR loses indentation.</string>
</property>
<property name="text">
<string>Use &lt;PRE&gt; tags instead of &lt;BR&gt;to display plain text as html.</string>
</property>
<property name="toolTip">
<string>Lines in PRE text are not folded. Using BR loses indentation.</string>
</property>
<property name="checked">
<bool>false</bool>
</property>
@ -350,6 +353,42 @@ This should give higher precedence to the results where the search terms appear
</property>
</widget>
</item>
<item>
<layout class="QHBoxLayout">
<item>
<widget class="QLabel" name="textLabel33">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Preferred">
<horstretch>1</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="toolTip">
<string>Frequency percentage threshold over which we do not use terms inside autophrase.
Frequent terms are a major performance issue with phrases.
Skipped terms augment the phrase slack, and reduce the autophrase efficiency.
The default value is 2 (percent). </string>
</property>
<property name="text">
<string>Autophrase term frequency threshold percentage</string>
</property>
<property name="wordWrap">
<bool>false</bool>
</property>
</widget>
</item>
<item>
<widget class="QDoubleSpinBox" name="autoPThreshSB">
<property name="singleStep">
<double>0.200000000000000</double>
</property>
<property name="value">
<double>2.000000000000000</double>
</property>
</widget>
</item>
</layout>
</item>
<item>
<widget class="Line" name="line2">
<property name="frameShape">

View File

@ -140,6 +140,7 @@ void UIPrefsDialog::setFromPrefs()
stemLangCMB->setCurrentIndex(cur);
autoPhraseCB->setChecked(prefs.ssearchAutoPhrase);
autoPThreshSB->setValue(prefs.ssearchAutoPhraseThreshPC);
buildAbsCB->setChecked(prefs.queryBuildAbstract);
replAbsCB->setEnabled(prefs.queryBuildAbstract);
@ -199,6 +200,7 @@ void UIPrefsDialog::accept()
prefs.queryStemLang = stemLangCMB->currentText();
}
prefs.ssearchAutoPhrase = autoPhraseCB->isChecked();
prefs.ssearchAutoPhraseThreshPC = autoPThreshSB->value();
prefs.queryBuildAbstract = buildAbsCB->isChecked();
prefs.queryReplaceAbstract = buildAbsCB->isChecked() &&
replAbsCB->isChecked();

View File

@ -168,6 +168,8 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
m_stemlang.c_str()));
Xapian::Query xq;
m_reason.erase();
@ -309,8 +311,11 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
return true;
}
bool SearchData::maybeAddAutoPhrase()
// This is called by the GUI simple search if the option is set: add an
// OR'ed phrase clause to the query (if it is simple enough) so that results
// where the search terms are close together and in order come up on top.
// Very common terms are left out of the phrase to avoid performance issues.
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
{
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
if (!m_query.size()) {
@ -319,13 +324,13 @@ bool SearchData::maybeAddAutoPhrase()
}
string field;
string words;
list<string> words;
// Walk the clause list. If we find any non simple clause or different
// field names, bail out.
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
SClType tp = (*it)->m_tp;
if (tp != SCLT_AND && tp != SCLT_OR) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: complex clause\n"));
LOGDEB2(("SearchData::maybeAddAutoPhrase: rejected clause\n"));
return false;
}
SearchDataClauseSimple *clp =
@ -338,25 +343,57 @@ bool SearchData::maybeAddAutoPhrase()
field = clp->getfield();
} else {
if (clp->getfield().compare(field)) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: different fields\n"));
LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
return false;
}
}
if (!words.empty())
words += " ";
words += clp->gettext();
// If there are wildcards or quotes in there, bail out
if (clp->gettext().find_first_of("\"*[?") != string::npos) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
return false;
}
// Do a simple word-split here, don't bother with the full-blown
// textsplit. The autophrase thing is just "best effort", it's
// normal that it won't work in strange cases.
vector<string> wl;
stringToStrings(clp->gettext(), wl);
words.insert(words.end(), wl.begin(), wl.end());
}
// If there are wildcards or quotes in there, or this is a single word,
// bail out
if (words.find_first_of("\"*[?") != string::npos &&
TextSplit::countWords(words) <= 1) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards or single word\n"));
// Trim the word list by eliminating very frequent terms
// (increasing the slack as we do it):
int slack = 0;
int doccnt = db.docCnt();
if (!doccnt)
doccnt = 1;
string swords;
for (list<string>::iterator it = words.begin();
it != words.end(); it++) {
double freq = double(db.termDocCnt(*it)) / doccnt;
if (freq < freqThreshold) {
if (!swords.empty())
swords.append(1, ' ');
swords += *it;
} else {
LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n",
it->c_str(), 100 * freq));
slack++;
}
}
// We can't make a phrase with a single word :)
if (TextSplit::countWords(swords) <= 1) {
LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
return false;
}
SearchDataClauseDist *nclp =
new SearchDataClauseDist(SCLT_PHRASE, words, 0, field);
new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field);
// If the toplevel conjunction is an OR, just OR the phrase, else
// deepen the tree.
if (m_tp == SCLT_OR) {
addClause(nclp);
} else {
@ -365,6 +402,7 @@ bool SearchData::maybeAddAutoPhrase()
// phrase.
SearchData *sd = new SearchData(m_tp);
sd->m_query = m_query;
sd->m_stemlang = m_stemlang;
m_tp = SCLT_OR;
m_query.clear();
SearchDataClauseSub *oq =
@ -556,8 +594,8 @@ void StringToXapianQ::expandTerm(bool nostemexp,
list<string>& exp,
string &sterm, const string& prefix)
{
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
exp.clear();
if (term.empty()) {
@ -567,8 +605,10 @@ void StringToXapianQ::expandTerm(bool nostemexp,
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// No stemming if there are wildcards or prevented globally.
if (haswild || m_stemlang.empty())
if (haswild || m_stemlang.empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true;
}
if (nostemexp && !haswild) {
sterm = term;
@ -631,6 +671,8 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
list<Xapian::Query> &pqueries)
{
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
span.c_str(), int(nostemexp)));
list<string> exp;
string sterm; // dumb version of user term
@ -866,6 +908,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
{
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang;
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
stemlang.c_str()));
m_terms.clear();
m_groups.clear();

View File

@ -98,11 +98,13 @@ public:
bool addClause(SearchDataClause *cl);
/** If this is a simple query (one field only, no distance clauses),
add phrase made of query terms to query, so that docs containing the
user terms in order will have higher relevance. This must be called
before toNativeQuery().
*/
bool maybeAddAutoPhrase();
* add phrase made of query terms to query, so that docs containing the
* user terms in order will have higher relevance. This must be called
* before toNativeQuery().
* @param threshold: don't use terms more frequent than the value
* (proportion of docs where they occur)
*/
bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);
/** Set/get top subdirectory for filtering results */
void setTopdir(const string& t, bool excl = false, float w = 1.0)