diff --git a/src/qtgui/guiutils.cpp b/src/qtgui/guiutils.cpp index 96b3375c..3b73b544 100644 --- a/src/qtgui/guiutils.cpp +++ b/src/qtgui/guiutils.cpp @@ -113,7 +113,9 @@ void rwSettings(bool writing) Bool, false); SETTING_RW(prefs.catgToolBar, "/Recoll/prefs/catgToolBar", Bool, false); SETTING_RW(prefs.ssearchAutoPhrase, - "/Recoll/prefs/ssearchAutoPhrase", Bool, false); + "/Recoll/prefs/ssearchAutoPhrase", Bool, true); + SETTING_RW(prefs.ssearchAutoPhraseThreshPC, + "/Recoll/prefs/ssearchAutoPhraseThreshPC", Double, 2.0); SETTING_RW(prefs.respagesize, "/Recoll/prefs/reslist/pagelen", Int, 8); SETTING_RW(prefs.collapseDuplicates, "/Recoll/prefs/reslist/collapseDuplicates", Bool, false); diff --git a/src/qtgui/guiutils.h b/src/qtgui/guiutils.h index aa573150..4e3d5d56 100644 --- a/src/qtgui/guiutils.h +++ b/src/qtgui/guiutils.h @@ -85,6 +85,7 @@ class PrefsPack { QStringList ssearchHistory; // Make phrase out of search terms and add to search in simple search bool ssearchAutoPhrase; + double ssearchAutoPhraseThreshPC; // Ignored file types in adv search (startup default) QStringList asearchIgnFilTyps; bool fileTypesByCats; diff --git a/src/qtgui/ssearch_w.cpp b/src/qtgui/ssearch_w.cpp index 33ba5c06..fc75b5f4 100644 --- a/src/qtgui/ssearch_w.cpp +++ b/src/qtgui/ssearch_w.cpp @@ -132,7 +132,8 @@ void SSearch::startSimpleSearch() if (tp == SST_LANG) { string reason; if (prefs.autoSuffsEnable) - sdata = wasaStringToRcl(theconfig, u8, reason, (const char *)prefs.autoSuffs.toUtf8()); + sdata = wasaStringToRcl(theconfig, u8, reason, + (const char *)prefs.autoSuffs.toUtf8()); else sdata = wasaStringToRcl(theconfig, u8, reason); if (sdata == 0) { @@ -140,51 +141,40 @@ void SSearch::startSimpleSearch() QString::fromAscii(reason.c_str())); return; } - if (prefs.ssearchAutoPhrase) { - sdata->maybeAddAutoPhrase(); - } } else { sdata = new Rcl::SearchData(Rcl::SCLT_OR); if (sdata == 0) { QMessageBox::warning(0, "Recoll", tr("Out of memory")); return; } - - // If there is no white space inside the query, then the user - // certainly means it as a phrase. - bool isreallyaphrase = false; - if (!TextSplit::hasVisibleWhite(u8)) - isreallyaphrase = true; - - // Maybe add automatic phrase ? For ALL and ANY, and not if - // there is already a phrase or wildcard terms. - if (!isreallyaphrase && - prefs.ssearchAutoPhrase && (tp == SST_ANY || tp == SST_ALL) && - u8.find_first_of("\"*[]?") == string::npos && - TextSplit::countWords(u8) > 1) { - sdata->addClause(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, - u8, 0)); - } Rcl::SearchDataClause *clp = 0; - switch (tp) { - case SST_ANY: - default: - clp = isreallyaphrase ? - new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) : - new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8); - break; - case SST_ALL: - clp = isreallyaphrase ? - new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0) : - new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8); - break; - case SST_FNM: + if (tp == SST_FNM) { clp = new Rcl::SearchDataClauseFilename(u8); - break; + } else if (!TextSplit::hasVisibleWhite(u8)) { + // If there is no white space inside the query, then the user + // certainly means it as a phrase. + clp = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, u8, 0); + } else { + // ANY or ALL, several words. + if (tp == SST_ANY) { + clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, u8); + } else { + clp = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, u8); + } } sdata->addClause(clp); } + if (prefs.ssearchAutoPhrase && rcldb) { + string stemLang = (const char *)prefs.queryStemLang.toAscii(); + if (stemLang == "ALL") { + theconfig->getConfParam("indexstemminglanguages", stemLang); + } + sdata->setStemlang(stemLang); + sdata->maybeAddAutoPhrase(*rcldb, + prefs.ssearchAutoPhraseThreshPC / 100.0); + } + // Search terms history // We want to have the new text at the top and any older identical diff --git a/src/qtgui/uiprefs.ui b/src/qtgui/uiprefs.ui index 6809f1ab..071f1e64 100644 --- a/src/qtgui/uiprefs.ui +++ b/src/qtgui/uiprefs.ui @@ -6,7 +6,7 @@ 0 0 - 527 + 530 559 @@ -21,6 +21,9 @@ + + 1 + User interface @@ -300,12 +303,12 @@ + + Lines in PRE text are not folded. Using BR loses indentation. + Use <PRE> tags instead of <BR>to display plain text as html. - - Lines in PRE text are not folded. Using BR loses indentation. - false @@ -350,6 +353,42 @@ This should give higher precedence to the results where the search terms appear + + + + + + + 1 + 0 + + + + Frequency percentage threshold over which we do not use terms inside autophrase. +Frequent terms are a major performance issue with phrases. +Skipped terms augment the phrase slack, and reduce the autophrase efficiency. +The default value is 2 (percent). + + + Autophrase term frequency threshold percentage + + + false + + + + + + + 0.200000000000000 + + + 2.000000000000000 + + + + + diff --git a/src/qtgui/uiprefs_w.cpp b/src/qtgui/uiprefs_w.cpp index fcd75a43..16fae8b8 100644 --- a/src/qtgui/uiprefs_w.cpp +++ b/src/qtgui/uiprefs_w.cpp @@ -140,6 +140,7 @@ void UIPrefsDialog::setFromPrefs() stemLangCMB->setCurrentIndex(cur); autoPhraseCB->setChecked(prefs.ssearchAutoPhrase); + autoPThreshSB->setValue(prefs.ssearchAutoPhraseThreshPC); buildAbsCB->setChecked(prefs.queryBuildAbstract); replAbsCB->setEnabled(prefs.queryBuildAbstract); @@ -199,6 +200,7 @@ void UIPrefsDialog::accept() prefs.queryStemLang = stemLangCMB->currentText(); } prefs.ssearchAutoPhrase = autoPhraseCB->isChecked(); + prefs.ssearchAutoPhraseThreshPC = autoPThreshSB->value(); prefs.queryBuildAbstract = buildAbsCB->isChecked(); prefs.queryReplaceAbstract = buildAbsCB->isChecked() && replAbsCB->isChecked(); diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index be834213..77d60748 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -168,6 +168,8 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector& tps) bool SearchData::toNativeQuery(Rcl::Db &db, void *d) { + LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n", + m_stemlang.c_str())); Xapian::Query xq; m_reason.erase(); @@ -309,8 +311,11 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) return true; } - -bool SearchData::maybeAddAutoPhrase() +// This is called by the GUI simple search if the option is set: add +// (OR) phrase to a query (if it is simple enough) so that results +// where the search terms are close and in order will come up on top. +// We remove very common terms from the query to avoid performance issues. +bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) { LOGDEB0(("SearchData::maybeAddAutoPhrase()\n")); if (!m_query.size()) { @@ -319,13 +324,13 @@ bool SearchData::maybeAddAutoPhrase() } string field; - string words; + list words; // Walk the clause list. If we find any non simple clause or different // field names, bail out. for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) { SClType tp = (*it)->m_tp; if (tp != SCLT_AND && tp != SCLT_OR) { - LOGDEB2(("SearchData::maybeAddAutoPhrase: complex clause\n")); + LOGDEB2(("SearchData::maybeAddAutoPhrase: rejected clause\n")); return false; } SearchDataClauseSimple *clp = @@ -338,25 +343,57 @@ bool SearchData::maybeAddAutoPhrase() field = clp->getfield(); } else { if (clp->getfield().compare(field)) { - LOGDEB2(("SearchData::maybeAddAutoPhrase: different fields\n")); + LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n")); return false; } } - if (!words.empty()) - words += " "; - words += clp->gettext(); + + // If there are wildcards or quotes in there, bail out + if (clp->gettext().find_first_of("\"*[?") != string::npos) { + LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n")); + return false; + } + // Do a simple word-split here, don't bother with the full-blown + // textsplit. The autophrase thing is just "best effort", it's + // normal that it won't work in strange cases. + vector wl; + stringToStrings(clp->gettext(), wl); + words.insert(words.end(), wl.begin(), wl.end()); } - // If there are wildcards or quotes in there, or this is a single word, - // bail out - if (words.find_first_of("\"*[?") != string::npos && - TextSplit::countWords(words) <= 1) { - LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards or single word\n")); + + // Trim the word list by eliminating very frequent terms + // (increasing the slack as we do it): + int slack = 0; + int doccnt = db.docCnt(); + if (!doccnt) + doccnt = 1; + string swords; + for (list::iterator it = words.begin(); + it != words.end(); it++) { + double freq = double(db.termDocCnt(*it)) / doccnt; + if (freq < freqThreshold) { + if (!swords.empty()) + swords.append(1, ' '); + swords += *it; + } else { + LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n", + it->c_str(), 100 * freq)); + slack++; + } + } + + // We can't make a phrase with a single word :) + if (TextSplit::countWords(swords) <= 1) { + LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n")); return false; } - + SearchDataClauseDist *nclp = - new SearchDataClauseDist(SCLT_PHRASE, words, 0, field); + new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field); + + // If the toplevel conjunction is an OR, just OR the phrase, else + // deepen the tree. if (m_tp == SCLT_OR) { addClause(nclp); } else { @@ -365,6 +402,7 @@ bool SearchData::maybeAddAutoPhrase() // phrase. SearchData *sd = new SearchData(m_tp); sd->m_query = m_query; + sd->m_stemlang = m_stemlang; m_tp = SCLT_OR; m_query.clear(); SearchDataClauseSub *oq = @@ -556,8 +594,8 @@ void StringToXapianQ::expandTerm(bool nostemexp, list& exp, string &sterm, const string& prefix) { - LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", - m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp)); + LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", + m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp)); sterm.erase(); exp.clear(); if (term.empty()) { @@ -567,8 +605,10 @@ void StringToXapianQ::expandTerm(bool nostemexp, bool haswild = term.find_first_of(cstr_minwilds) != string::npos; // No stemming if there are wildcards or prevented globally. - if (haswild || m_stemlang.empty()) + if (haswild || m_stemlang.empty()) { + LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n")); nostemexp = true; + } if (nostemexp && !haswild) { sterm = term; @@ -631,6 +671,8 @@ void multiply_groups(vector >::const_iterator vvit, void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, list &pqueries) { + LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n", + span.c_str(), int(nostemexp))); list exp; string sterm; // dumb version of user term @@ -866,6 +908,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, { const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null: stemlang; + LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n", + stemlang.c_str())); m_terms.clear(); m_groups.clear(); diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 96ac04df..cf390d1c 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -98,11 +98,13 @@ public: bool addClause(SearchDataClause *cl); /** If this is a simple query (one field only, no distance clauses), - add phrase made of query terms to query, so that docs containing the - user terms in order will have higher relevance. This must be called - before toNativeQuery(). - */ - bool maybeAddAutoPhrase(); + * add phrase made of query terms to query, so that docs containing the + * user terms in order will have higher relevance. This must be called + * before toNativeQuery(). + * @param threshold: don't use terms more frequent than the value + * (proportion of docs where they occur) + */ + bool maybeAddAutoPhrase(Rcl::Db &db, double threshold); /** Set/get top subdirectory for filtering results */ void setTopdir(const string& t, bool excl = false, float w = 1.0)