Add a frequency threshold to avoid adding common terms to the automatic phrase search extension. Use autophrase by default with simple search, with a default frequency threshold of 2%.
This commit is contained in:
parent
4ced9bee49
commit
bb2685c2f5
@ -113,7 +113,9 @@ void rwSettings(bool writing)
|
||||
Bool, false);
|
||||
SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false);
|
||||
SETTING_RW(prefs.ssearchAutoPhrase,
|
||||
"/Recoll/prefs/ssearchAutoPhrase", Bool, false);
|
||||
"/Recoll/prefs/ssearchAutoPhrase", Bool, true);
|
||||
SETTING_RW(prefs.ssearchAutoPhraseThreshPC,
|
||||
"/Recoll/prefs/ssearchAutoPhraseThreshPC", Double, 2.0);
|
||||
SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8);
|
||||
SETTING_RW(prefs.collapseDuplicates,
|
||||
"/Recoll/prefs/reslist/collapseDuplicates", Bool, false);
|
||||
|
||||
@ -85,6 +85,7 @@ class PrefsPack {
|
||||
QStringList ssearchHistory;
|
||||
// Make phrase out of search terms and add to search in simple search
|
||||
bool ssearchAutoPhrase;
|
||||
double ssearchAutoPhraseThreshPC;
|
||||
// Ignored file types in adv search (startup default)
|
||||
QStringList asearchIgnFilTyps;
|
||||
bool fileTypesByCats;
|
||||
|
||||
@ -132,7 +132,8 @@ void SSearch::startSimpleSearch()
|
||||
if (tp == SST_LANG) {
|
||||
string reason;
|
||||
if (prefs.autoSuffsEnable)
|
||||
sdata = wasaStringToRcl(theconfig, u8, reason, (const char *)prefs.autoSuffs.toUtf8());
|
||||
sdata = wasaStringToRcl(theconfig, u8, reason,
|
||||
(const char *)prefs.autoSuffs.toUtf8());
|
||||
else
|
||||
sdata = wasaStringToRcl(theconfig, u8, reason);
|
||||
if (sdata == 0) {
|
||||
@ -140,51 +141,40 @@ void SSearch::startSimpleSearch()
|
||||
QString::fromAscii(reason.c_str()));
|
||||
return;
|
||||
}
|
||||
if (prefs.ssearchAutoPhrase) {
|
||||
sdata->maybeAddAutoPhrase();
|
||||
}
|
||||
} else {
|
||||
sdata = new Rcl::SearchData(Rcl::SCLT_OR);
|
||||
if (sdata == 0) {
|
||||
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
|
||||
return;
|
||||
}
|
||||
|
||||
// If there is no white space inside the query, then the user
|
||||
// certainly means it as a phrase.
|
||||
bool isreallyaphrase = false;
|
||||
if (!TextSplit::hasVisibleWhite(u8))
|
||||
isreallyaphrase = true;
|
||||
|
||||
// Maybe add automatic phrase ? For ALL and ANY, and not if
|
||||
// there is already a phrase or wildcard terms.
|
||||
if (!isreallyaphrase &&
|
||||
prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) &&
|
||||
u8.find_first_of("\"*[]?") == string::npos &&
|
||||
TextSplit::countWords(u8) > 1) {
|
||||
sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
|
||||
u8, 0));
|
||||
}
|
||||
Rcl::SearchDataClause *clp = 0;
|
||||
switch (tp) {
|
||||
case SST_ANY:
|
||||
default:
|
||||
clp = isreallyaphrase ?
|
||||
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
|
||||
new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
|
||||
break;
|
||||
case SST_ALL:
|
||||
clp = isreallyaphrase ?
|
||||
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) :
|
||||
new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
|
||||
break;
|
||||
case SST_FNM:
|
||||
if (tp == SST_FNM) {
|
||||
clp = new Rcl::SearchDataClauseFilename(u8);
|
||||
break;
|
||||
} else if (!TextSplit::hasVisibleWhite(u8)) {
|
||||
// If there is no white space inside the query, then the user
|
||||
// certainly means it as a phrase.
|
||||
clp = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0);
|
||||
} else {
|
||||
// ANY or ALL, several words.
|
||||
if (tp == SST_ANY) {
|
||||
clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8);
|
||||
} else {
|
||||
clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8);
|
||||
}
|
||||
}
|
||||
sdata->addClause(clp);
|
||||
}
|
||||
|
||||
if (prefs.ssearchAutoPhrase && rcldb) {
|
||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
||||
if (stemLang == "ALL") {
|
||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
||||
}
|
||||
sdata->setStemlang(stemLang);
|
||||
sdata->maybeAddAutoPhrase(*rcldb,
|
||||
prefs.ssearchAutoPhraseThreshPC / 100.0);
|
||||
}
|
||||
|
||||
// Search terms history
|
||||
|
||||
// We want to have the new text at the top and any older identical
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>527</width>
|
||||
<width>530</width>
|
||||
<height>559</height>
|
||||
</rect>
|
||||
</property>
|
||||
@ -21,6 +21,9 @@
|
||||
<layout class="QVBoxLayout">
|
||||
<item>
|
||||
<widget class="QTabWidget" name="tabWidget">
|
||||
<property name="currentIndex">
|
||||
<number>1</number>
|
||||
</property>
|
||||
<widget class="QWidget" name="tab">
|
||||
<attribute name="title">
|
||||
<string>User interface</string>
|
||||
@ -300,12 +303,12 @@
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="previewPlainPreCB">
|
||||
<property name="toolTip">
|
||||
<string>Lines in PRE text are not folded. Using BR loses indentation.</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Use <PRE> tags instead of <BR>to display plain text as html.</string>
|
||||
</property>
|
||||
<property name="toolTip">
|
||||
<string>Lines in PRE text are not folded. Using BR loses indentation.</string>
|
||||
</property>
|
||||
<property name="checked">
|
||||
<bool>false</bool>
|
||||
</property>
|
||||
@ -350,6 +353,42 @@ This should give higher precedence to the results where the search terms appear
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<layout class="QHBoxLayout">
|
||||
<item>
|
||||
<widget class="QLabel" name="textLabel33">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Preferred" vsizetype="Preferred">
|
||||
<horstretch>1</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="toolTip">
|
||||
<string>Frequency percentage threshold over which we do not use terms inside autophrase.
|
||||
Frequent terms are a major performance issue with phrases.
|
||||
Skipped terms augment the phrase slack, and reduce the autophrase efficiency.
|
||||
The default value is 2 (percent). </string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Autophrase term frequency threshold percentage</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>false</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QDoubleSpinBox" name="autoPThreshSB">
|
||||
<property name="singleStep">
|
||||
<double>0.200000000000000</double>
|
||||
</property>
|
||||
<property name="value">
|
||||
<double>2.000000000000000</double>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="Line" name="line2">
|
||||
<property name="frameShape">
|
||||
|
||||
@ -140,6 +140,7 @@ void UIPrefsDialog::setFromPrefs()
|
||||
stemLangCMB->setCurrentIndex(cur);
|
||||
|
||||
autoPhraseCB->setChecked(prefs.ssearchAutoPhrase);
|
||||
autoPThreshSB->setValue(prefs.ssearchAutoPhraseThreshPC);
|
||||
|
||||
buildAbsCB->setChecked(prefs.queryBuildAbstract);
|
||||
replAbsCB->setEnabled(prefs.queryBuildAbstract);
|
||||
@ -199,6 +200,7 @@ void UIPrefsDialog::accept()
|
||||
prefs.queryStemLang = stemLangCMB->currentText();
|
||||
}
|
||||
prefs.ssearchAutoPhrase = autoPhraseCB->isChecked();
|
||||
prefs.ssearchAutoPhraseThreshPC = autoPThreshSB->value();
|
||||
prefs.queryBuildAbstract = buildAbsCB->isChecked();
|
||||
prefs.queryReplaceAbstract = buildAbsCB->isChecked() &&
|
||||
replAbsCB->isChecked();
|
||||
|
||||
@ -168,6 +168,8 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
|
||||
|
||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
{
|
||||
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
|
||||
m_stemlang.c_str()));
|
||||
Xapian::Query xq;
|
||||
m_reason.erase();
|
||||
|
||||
@ -309,8 +311,11 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool SearchData::maybeAddAutoPhrase()
|
||||
// This is called by the GUI simple search if the option is set: add
|
||||
// (OR) phrase to a query (if it is simple enough) so that results
|
||||
// where the search terms are close and in order will come up on top.
|
||||
// We remove very common terms from the query to avoid performance issues.
|
||||
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
||||
{
|
||||
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
|
||||
if (!m_query.size()) {
|
||||
@ -319,13 +324,13 @@ bool SearchData::maybeAddAutoPhrase()
|
||||
}
|
||||
|
||||
string field;
|
||||
string words;
|
||||
list<string> words;
|
||||
// Walk the clause list. If we find any non simple clause or different
|
||||
// field names, bail out.
|
||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
|
||||
SClType tp = (*it)->m_tp;
|
||||
if (tp != SCLT_AND && tp != SCLT_OR) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: complex clause\n"));
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: rejected clause\n"));
|
||||
return false;
|
||||
}
|
||||
SearchDataClauseSimple *clp =
|
||||
@ -338,25 +343,57 @@ bool SearchData::maybeAddAutoPhrase()
|
||||
field = clp->getfield();
|
||||
} else {
|
||||
if (clp->getfield().compare(field)) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: different fields\n"));
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!words.empty())
|
||||
words += " ";
|
||||
words += clp->gettext();
|
||||
|
||||
// If there are wildcards or quotes in there, bail out
|
||||
if (clp->gettext().find_first_of("\"*[?") != string::npos) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
|
||||
return false;
|
||||
}
|
||||
// Do a simple word-split here, don't bother with the full-blown
|
||||
// textsplit. The autophrase thing is just "best effort", it's
|
||||
// normal that it won't work in strange cases.
|
||||
vector<string> wl;
|
||||
stringToStrings(clp->gettext(), wl);
|
||||
words.insert(words.end(), wl.begin(), wl.end());
|
||||
}
|
||||
|
||||
// If there are wildcards or quotes in there, or this is a single word,
|
||||
// bail out
|
||||
if (words.find_first_of("\"*[?") != string::npos &&
|
||||
TextSplit::countWords(words) <= 1) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards or single word\n"));
|
||||
|
||||
// Trim the word list by eliminating very frequent terms
|
||||
// (increasing the slack as we do it):
|
||||
int slack = 0;
|
||||
int doccnt = db.docCnt();
|
||||
if (!doccnt)
|
||||
doccnt = 1;
|
||||
string swords;
|
||||
for (list<string>::iterator it = words.begin();
|
||||
it != words.end(); it++) {
|
||||
double freq = double(db.termDocCnt(*it)) / doccnt;
|
||||
if (freq < freqThreshold) {
|
||||
if (!swords.empty())
|
||||
swords.append(1, ' ');
|
||||
swords += *it;
|
||||
} else {
|
||||
LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n",
|
||||
it->c_str(), 100 * freq));
|
||||
slack++;
|
||||
}
|
||||
}
|
||||
|
||||
// We can't make a phrase with a single word :)
|
||||
if (TextSplit::countWords(swords) <= 1) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
SearchDataClauseDist *nclp =
|
||||
new SearchDataClauseDist(SCLT_PHRASE, words, 0, field);
|
||||
new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field);
|
||||
|
||||
// If the toplevel conjunction is an OR, just OR the phrase, else
|
||||
// deepen the tree.
|
||||
if (m_tp == SCLT_OR) {
|
||||
addClause(nclp);
|
||||
} else {
|
||||
@ -365,6 +402,7 @@ bool SearchData::maybeAddAutoPhrase()
|
||||
// phrase.
|
||||
SearchData *sd = new SearchData(m_tp);
|
||||
sd->m_query = m_query;
|
||||
sd->m_stemlang = m_stemlang;
|
||||
m_tp = SCLT_OR;
|
||||
m_query.clear();
|
||||
SearchDataClauseSub *oq =
|
||||
@ -556,8 +594,8 @@ void StringToXapianQ::expandTerm(bool nostemexp,
|
||||
list<string>& exp,
|
||||
string &sterm, const string& prefix)
|
||||
{
|
||||
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
|
||||
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
|
||||
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
|
||||
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
|
||||
sterm.erase();
|
||||
exp.clear();
|
||||
if (term.empty()) {
|
||||
@ -567,8 +605,10 @@ void StringToXapianQ::expandTerm(bool nostemexp,
|
||||
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
||||
|
||||
// No stemming if there are wildcards or prevented globally.
|
||||
if (haswild || m_stemlang.empty())
|
||||
if (haswild || m_stemlang.empty()) {
|
||||
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
||||
nostemexp = true;
|
||||
}
|
||||
|
||||
if (nostemexp && !haswild) {
|
||||
sterm = term;
|
||||
@ -631,6 +671,8 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
||||
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
||||
list<Xapian::Query> &pqueries)
|
||||
{
|
||||
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
|
||||
span.c_str(), int(nostemexp)));
|
||||
list<string> exp;
|
||||
string sterm; // dumb version of user term
|
||||
|
||||
@ -866,6 +908,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
{
|
||||
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
|
||||
stemlang;
|
||||
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
||||
stemlang.c_str()));
|
||||
|
||||
m_terms.clear();
|
||||
m_groups.clear();
|
||||
|
||||
@ -98,11 +98,13 @@ public:
|
||||
bool addClause(SearchDataClause *cl);
|
||||
|
||||
/** If this is a simple query (one field only, no distance clauses),
|
||||
add phrase made of query terms to query, so that docs containing the
|
||||
user terms in order will have higher relevance. This must be called
|
||||
before toNativeQuery().
|
||||
*/
|
||||
bool maybeAddAutoPhrase();
|
||||
* add phrase made of query terms to query, so that docs containing the
|
||||
* user terms in order will have higher relevance. This must be called
|
||||
* before toNativeQuery().
|
||||
* @param threshold: don't use terms more frequent than the value
|
||||
* (proportion of docs where they occur)
|
||||
*/
|
||||
bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);
|
||||
|
||||
/** Set/get top subdirectory for filtering results */
|
||||
void setTopdir(const string& t, bool excl = false, float w = 1.0)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user