diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index bcf15a5b..4715a6f7 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -569,9 +569,9 @@ recoll - The indexing configuration GUI + The index configuration GUI - Most parameters for a given indexing configuration can + Most parameters for a given index configuration can be set from a recoll GUI running on this configuration (either as default, or by setting RECOLL_CONFDIR or the @@ -4219,6 +4219,24 @@ skippedPaths = ~/somedir/∗.txt + maxTermExpand + Maximum expansion count for a single term (e.g.: + when using wildcards). The default of 10000 is reasonable and + will avoid queries that appear frozen while the engine is + walking the term list. + + + + maxXapianClauses + Maximum number of elementary clauses we can add + to a single Xapian query. In some cases, the result of term + expansion can be multiplicative, and we want to avoid using + excessive memory. The default of 100 000 should be both + high enough in most cases and compatible with current + typical hardware configurations. + + + nonumbers If this set to true, no terms will be generated for numbers. For example "123", "1.5e6", 192.168.1.4, would not diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp index 50e9891c..9b9048f8 100644 --- a/src/qtgui/confgui/confguiindex.cpp +++ b/src/qtgui/confgui/confguiindex.cpp @@ -195,6 +195,34 @@ ConfSearchPanelW::ConfSearchPanelW(QWidget *parent, ConfNull *config) )); vboxLayout->addWidget(cp2); + ConfLink lnk3(new ConfLinkRclRep(config, "maxTermExpand")); + ConfParamIntW* cp3 = + new ConfParamIntW(this, lnk3, + tr("Maximum term expansion count"), + tr("

Maximum expansion count for a single term " + "(e.g.: when using wildcards). The default " + "of 10 000 is reasonable and will avoid " + "queries that appear frozen while the engine is " + "walking the term list." + )); + vboxLayout->addWidget(cp3); + + + ConfLink lnk4(new ConfLinkRclRep(config, "maxXapianClauses")); + ConfParamIntW* cp4 = + new ConfParamIntW(this, lnk4, + tr("Maximum Xapian clauses count"), + tr("

Maximum number of elementary clauses we " + "add to a single Xapian query. In some cases, " + "the result of term expansion can be " + "multiplicative, and we want to avoid using " + "excessive memory. The default of 100 000 " + "should be both high enough in most cases " + "and compatible with current typical hardware " + "configurations." + )); + vboxLayout->addWidget(cp4); + vboxLayout->insertStretch(-1); } diff --git a/src/query/docseq.h b/src/query/docseq.h index 66a53c86..de10369b 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -138,7 +138,10 @@ class DocSequence { { return std::list(); } - + virtual std::string getReason() + { + return m_reason; + } /** Optional functionality. */ virtual bool canFilter() {return false;} virtual bool canSort() {return false;} @@ -154,6 +157,7 @@ class DocSequence { protected: static std::string o_sort_trans; static std::string o_filt_trans; + std::string m_reason; private: std::string m_title; }; @@ -206,6 +210,13 @@ public: return false; return m_seq->getEnclosing(doc, pdoc); } + /** Error text from the wrapped sequence, or an empty string if none is attached. */ + virtual std::string getReason() + { + if (m_seq.isNull()) + return std::string(); + return m_seq->getReason(); + } virtual std::string title() {return m_seq->title();} virtual RefCntr getSourceSeq() {return m_seq;} diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp index c7ece824..73ed0057 100644 --- a/src/query/docseqdb.cpp +++ b/src/query/docseqdb.cpp @@ -51,14 +51,16 @@ string DocSequenceDb::getDescription() bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh) { - setQuery(); + if (!setQuery()) + return false; if (sh) sh->erase(); return m_q->getDoc(num, doc); } int DocSequenceDb::getResCnt() { - setQuery(); + if (!setQuery()) + return false; if (m_rescnt < 0) { m_rescnt= m_q->getResCnt(); } @@ -71,7 +73,8 @@ static const string cstr_mre("[...]"); bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vpabs) { LOGDEB(("DocSequenceDb::getAbstract/pair\n")); - setQuery(); + if (!setQuery()) + return false; // Have to put the 
limit somewhere. int maxoccs = 500; @@ -93,7 +96,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vpabs) bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vabs) { - setQuery(); + if (!setQuery()) + return false; if (m_q->whatDb() && m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) { m_q->makeDocAbstract(doc, vabs); @@ -105,7 +109,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vabs) int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term) { - setQuery(); + if (!setQuery()) + return false; if (m_q->whatDb()) { return m_q->getFirstMatchPage(doc, term); } @@ -114,7 +119,8 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term) bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc) { - setQuery(); + if (!setQuery()) + return false; string udi; if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath, udi)) @@ -124,7 +130,8 @@ bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc) list DocSequenceDb::expand(Rcl::Doc &doc) { - setQuery(); + if (!setQuery()) + return list(); vector v = m_q->expand(doc); return list(v.begin(), v.end()); } @@ -209,13 +216,10 @@ bool DocSequenceDb::setQuery() return true; m_rescnt = -1; m_needSetQuery = !m_q->setQuery(m_fsdata); - -#if 0 - HighlightData hld; - m_fsdata->getTerms(hld); - string str; - hld.toString(str); - fprintf(stderr, "DocSequenceDb::setQuery: terms: %s\n", str.c_str()); -#endif + if (m_needSetQuery) { + m_reason = m_q->getReason(); + LOGERR(("DocSequenceDb::setQuery: rclquery::setQuery failed: %s\n", + m_reason.c_str())); + } return !m_needSetQuery; } diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h index 05a42235..a987f9ff 100644 --- a/src/query/docseqdb.h +++ b/src/query/docseqdb.h @@ -67,6 +67,7 @@ class DocSequenceDb : public DocSequence { bool m_isFiltered; bool m_isSorted; bool m_needSetQuery; // search data changed, need to reapply before fetch + bool setQuery(); }; diff --git a/src/query/recollq.cpp 
b/src/query/recollq.cpp index 6c85e4a0..22a9da63 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -319,7 +319,10 @@ int recollq(RclConfig **cfp, int argc, char **argv) query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true); } Chrono chron; - query.setQuery(rq); + if (!query.setQuery(rq)) { + cerr << "Query setup failed: " << query.getReason() << endl; + return(1); + } int cnt = query.getResCnt(); if (!(op_flags & OPT_b)) { cout << "Recoll query: " << rq->getDescription() << endl; diff --git a/src/query/reslistpager.cpp b/src/query/reslistpager.cpp index 2164dbda..3cda2cd3 100644 --- a/src/query/reslistpager.cpp +++ b/src/query/reslistpager.cpp @@ -337,37 +337,43 @@ void ResListPager::displayPage(RclConfig *config) if (pageEmpty()) { chunk << trans("

No results found
"); - HighlightData hldata; - m_docSource->getTerms(hldata); - vector uterms(hldata.uterms.begin(), hldata.uterms.end()); - if (!uterms.empty()) { - map > spellings; - suggest(uterms, spellings); - if (!spellings.empty()) { - if (o_index_stripchars) { - chunk << - trans("

Alternate spellings (accents suppressed): ") - << "

"; - } else { - chunk << - trans("

Alternate spellings: ") - << "

"; + string reason = m_docSource->getReason(); + if (!reason.empty()) { + chunk << "
" << escapeHtml(reason) << + "

"; + } else { + HighlightData hldata; + m_docSource->getTerms(hldata); + vector uterms(hldata.uterms.begin(), hldata.uterms.end()); + if (!uterms.empty()) { + map > spellings; + suggest(uterms, spellings); + if (!spellings.empty()) { + if (o_index_stripchars) { + chunk << + trans("

Alternate spellings (accents suppressed): ") + << "

"; + } else { + chunk << + trans("

Alternate spellings: ") + << "

"; - } - - for (map >::const_iterator it0 = - spellings.begin(); it0 != spellings.end(); it0++) { - chunk << "" << it0->first << " : "; - for (vector::const_iterator it = - it0->second.begin(); - it != it0->second.end(); it++) { - chunk << *it << " "; } - chunk << "
"; + + for (map >::const_iterator it0 = + spellings.begin(); it0 != spellings.end(); it0++) { + chunk << "" << it0->first << " : "; + for (vector::const_iterator it = + it0->second.begin(); + it != it0->second.end(); it++) { + chunk << *it << " "; + } + chunk << "
"; + } + chunk << "

"; } - chunk << "

"; - } - } + } + } } else { unsigned int resCnt = m_docSource->getResCnt(); if (m_winfirst + m_respage.size() < resCnt) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index bac800a2..f75e909d 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1431,7 +1431,7 @@ bool Db::purgeFile(const string &udi, bool *existed) } // File name wild card expansion. This is a specialisation ot termMatch -bool Db::filenameWildExp(const string& fnexp, vector& names) +bool Db::filenameWildExp(const string& fnexp, vector& names, int max) { string pattern = fnexp; names.clear(); @@ -1449,7 +1449,7 @@ bool Db::filenameWildExp(const string& fnexp, vector& names) LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str())); TermMatchResult result; - if (!termMatch(ET_WILD, string(), pattern, result, -1, + if (!termMatch(ET_WILD, string(), pattern, result, max, unsplitFilenameFieldName)) return false; for (vector::const_iterator it = result.entries.begin(); @@ -1459,7 +1459,7 @@ bool Db::filenameWildExp(const string& fnexp, vector& names) if (names.empty()) { // Build an impossible query: we know its impossible because we // control the prefixes! - names.push_back("XNONENoMatchingTerms"); + names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms"); } return true; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 6f1d630f..e930e4b8 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -315,7 +315,7 @@ class Db { bool maxYearSpan(int *minyear, int *maxyear); /** Wildcard expansion specific to file names. 
Internal/sdata use only */ - bool filenameWildExp(const string& exp, vector& names); + bool filenameWildExp(const string& exp, vector& names, int max); /** Set parameters for synthetic abstract generation */ void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 0ac5948e..475cb1da 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -192,9 +192,14 @@ bool Query::setQuery(RefCntr sdata) m_nq->clear(); m_sd = sdata; + + int maxexp = 10000; + m_db->getConf()->getConfParam("maxTermExpand", &maxexp); + int maxcl = 100000; + m_db->getConf()->getConfParam("maxXapianClauses", &maxcl); Xapian::Query xq; - if (!sdata->toNativeQuery(*m_db, &xq)) { + if (!sdata->toNativeQuery(*m_db, &xq, maxexp, maxcl)) { m_reason += sdata->getReason(); return false; } diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 577960bb..62b2a2c0 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -201,14 +201,16 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector& tps) bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, vector& query, - string& reason, void *d) + string& reason, void *d, + int maxexp, int maxcl) { Xapian::Query xq; for (qlist_it_t it = query.begin(); it != query.end(); it++) { Xapian::Query nq; - if (!(*it)->toNativeQuery(db, &nq)) { - LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n")); - reason = (*it)->getReason(); + if (!(*it)->toNativeQuery(db, &nq, maxexp, maxcl)) { + LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n", + (*it)->getReason().c_str())); + reason += (*it)->getReason() + " "; return false; } if (nq.empty()) { @@ -236,6 +238,13 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, } else { xq = Xapian::Query(op, xq, nq); } + if (int(xq.get_length()) >= maxcl) { + LOGERR(("Maximum Xapian query size exceeded." 
+ " Maybe increase maxXapianClauses.")); + m_reason += "Maximum Xapian query size exceeded." + " Maybe increase maxXapianClauses."; + return false; + } } if (xq.empty()) xq = Xapian::Query::MatchAll; @@ -244,7 +253,7 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, return true; } -bool SearchData::toNativeQuery(Rcl::Db &db, void *d) +bool SearchData::toNativeQuery(Rcl::Db &db, void *d, int maxexp, int maxcl) { LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str())); m_reason.erase(); @@ -252,8 +261,9 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) // Walk the clause list translating each in turn and building the // Xapian query tree Xapian::Query xq; - if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) { - LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n")); + if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq, maxexp, maxcl)) { + LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", + m_reason.c_str())); return false; } @@ -620,10 +630,10 @@ private: class StringToXapianQ { public: StringToXapianQ(Db& db, HighlightData& hld, const string& field, - const string &stmlng, bool boostUser) + const string &stmlng, bool boostUser, int maxexp, int maxcl) : m_db(db), m_field(field), m_stemlang(stmlng), m_doBoostUserTerms(boostUser), m_hld(hld), m_autodiacsens(false), - m_autocasesens(true) + m_autocasesens(true), m_maxexp(maxexp), m_maxcl(maxcl), m_curcl(0) { m_db.getConf()->getConfParam("autodiacsens", &m_autodiacsens); m_db.getConf()->getConfParam("autocasesens", &m_autocasesens); @@ -635,15 +645,15 @@ public: vector &pqueries, int slack = 0, bool useNear = false); private: - void expandTerm(int mods, + bool expandTerm(string& ermsg, int mods, const string& term, vector& exp, string& sterm, const string& prefix); // After splitting entry on whitespace: process non-phrase element - void processSimpleSpan(const string& span, + void processSimpleSpan(string& ermsg, const string& span, int mods, 
vector &pqueries); // Process phrase/near element - void processPhraseOrNear(TextSplitQ *splitData, + void processPhraseOrNear(string& ermsg, TextSplitQ *splitData, int mods, vector &pqueries, bool useNear, int slack); @@ -655,6 +665,9 @@ private: HighlightData& m_hld; bool m_autodiacsens; bool m_autocasesens; + int m_maxexp; + int m_maxcl; + int m_curcl; }; #if 1 @@ -679,7 +692,7 @@ static void listVector(const string& what, const vector&l) * has it already. Used in the simple case where there is nothing to expand, * and we just return the prefixed term (else Db::termMatch deals with it). */ -void StringToXapianQ::expandTerm(int mods, +bool StringToXapianQ::expandTerm(string& ermsg, int mods, const string& term, vector& oexp, string &sterm, const string& prefix) @@ -689,7 +702,7 @@ void StringToXapianQ::expandTerm(int mods, sterm.clear(); oexp.clear(); if (term.empty()) - return; + return true; bool haswild = term.find_first_of(cstr_minwilds) != string::npos; @@ -753,7 +766,7 @@ void StringToXapianQ::expandTerm(int mods, oexp.push_back(prefix + term); m_hld.terms[term] = m_hld.uterms.size() - 1; LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); - return; + return true; } // Make objects before the goto jungle to avoid compiler complaints @@ -770,7 +783,7 @@ void StringToXapianQ::expandTerm(int mods, // expansion, which means that we are casediac-sensitive. There // would be nothing to prevent us to expand from the casediac // synonyms first. 
To be done later - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field); + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang,term,res,m_maxexp,m_field); goto termmatchtoresult; } @@ -778,14 +791,14 @@ void StringToXapianQ::expandTerm(int mods, #ifdef RCL_INDEX_STRIPCHARS - m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field); + m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, m_maxexp, m_field); #else if (o_index_stripchars) { // If the index is raw, we can only come here if nostemexp is unset // and we just need stem expansion. - m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field); + m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang,term,res,m_maxexp,m_field); goto termmatchtoresult; } @@ -854,12 +867,17 @@ exptotermatch: LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str())); for (vector::const_iterator it = lexp.begin(); it != lexp.end(); it++) { - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field); + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,m_maxexp,m_field); } #endif // Term match entries to vector of terms termmatchtoresult: + if (int(res.entries.size()) >= m_maxexp) { + ermsg = "Maximum term expansion size exceeded." 
+ " Maybe increase maxTermExpand."; + return false; + } for (vector::const_iterator it = res.entries.begin(); it != res.entries.end(); it++) { oexp.push_back(it->term); @@ -876,6 +894,7 @@ termmatchtoresult: m_hld.terms[strip_prefix(*it)] = term; } LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); + return true; } // Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d @@ -912,7 +931,7 @@ void multiply_groups(vector >::const_iterator vvit, } } -void StringToXapianQ::processSimpleSpan(const string& span, +void StringToXapianQ::processSimpleSpan(string& ermsg, const string& span, int mods, vector &pqueries) { @@ -927,7 +946,8 @@ void StringToXapianQ::processSimpleSpan(const string& span, prefix = wrap_prefix(ftp->pfx); } - expandTerm(mods, span, exp, sterm, prefix); + if (!expandTerm(ermsg, mods, span, exp, sterm, prefix)) + return; // Set up the highlight data. No prefix should go in there for (vector::const_iterator it = exp.begin(); @@ -939,6 +959,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, // Push either term or OR of stem-expanded set Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end()); + m_curcl += exp.size(); // If sterm (simplified original user term) is not null, give it a // relevance boost. 
We do this even if no expansion occurred (else @@ -957,7 +978,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, // NEAR xapian query, the elements of which can themselves be OR // queries if the terms get expanded by stemming or wildcards (we // don't do stemming for PHRASE though) -void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, +void StringToXapianQ::processPhraseOrNear(string& ermsg, TextSplitQ *splitData, int mods, vector &pqueries, bool useNear, int slack) @@ -999,7 +1020,8 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, lmods |= SearchDataClause::SDCM_NOSTEMMING; string sterm; vector exp; - expandTerm(lmods, *it, exp, sterm, prefix); + if (!expandTerm(ermsg, lmods, *it, exp, sterm, prefix)) + return; LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); listVector("", exp); // groups is used for highlighting, we don't want prefixes in there. @@ -1011,6 +1033,9 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, groups.push_back(noprefs); orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); + m_curcl += exp.size(); + if (m_curcl >= m_maxcl) + return; #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF if (exp.size() > 1) hadmultiple = true; @@ -1099,7 +1124,7 @@ bool StringToXapianQ::processUserString(const string &iq, "slack %d near %d\n", iq.c_str(), m_field.c_str(), mods, slack, useNear)); ermsg.erase(); - + m_curcl = 0; const StopList stops = m_db.getStopList(); // Simple whitespace-split input into user-level words and @@ -1165,12 +1190,18 @@ bool StringToXapianQ::processUserString(const string &iq, if (splitter.nostemexps.front()) lmods |= SearchDataClause::SDCM_NOSTEMMING; m_hld.ugroups.push_back(vector(1, *it)); - processSimpleSpan(splitter.terms.front(), lmods, pqueries); + processSimpleSpan(ermsg,splitter.terms.front(),lmods, pqueries); } break; default: m_hld.ugroups.push_back(vector(1, *it)); - processPhraseOrNear(&splitter, mods, pqueries, useNear, slack); + 
processPhraseOrNear(ermsg, &splitter, mods, pqueries, + useNear, slack); + } + if (m_curcl >= m_maxcl) { + ermsg = "Maximum Xapian query size exceeded." + " Maybe increase maxXapianClauses."; + break; } } } catch (const Xapian::Error &e) { @@ -1190,7 +1221,8 @@ bool StringToXapianQ::processUserString(const string &iq, } // Translate a simple OR, AND, or EXCL search clause. -bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p) +bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, + int maxexp, int maxcl) { LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n", getStemLang().c_str())); @@ -1216,7 +1248,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p) (m_parentSearch && !m_parentSearch->haveWildCards()) || (m_parentSearch == 0 && !m_haveWildCards); - StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm); + StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm, + maxexp, maxcl); if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries)) return false; if (pqueries.empty()) { @@ -1240,13 +1273,14 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p) // about expanding multiple fragments in the past. We just take the // value blanks and all and expand this against the indexed unsplit // file names -bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p) +bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, + int maxexp, int) { Xapian::Query *qp = (Xapian::Query *)p; *qp = Xapian::Query(); vector names; - db.filenameWildExp(m_text, names); + db.filenameWildExp(m_text, names, maxexp); *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); if (m_weight != 1.0) { @@ -1256,7 +1290,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p) } // Translate NEAR or PHRASE clause. 
-bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p) +bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, + int maxexp, int maxcl) { LOGDEB(("SearchDataClauseDist::toNativeQuery\n")); @@ -1281,7 +1316,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p) } string s = cstr_dquote + m_text + cstr_dquote; bool useNear = (m_tp == SCLT_NEAR); - StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm); + StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm, + maxexp, maxcl); if (!tr.processUserString(s, getModifiers(), m_reason, pqueries, m_slack, useNear)) return false; diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index eed14769..a62d8691 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -89,8 +89,7 @@ public: bool haveWildCards() {return m_haveWildCards;} /** Translate to Xapian query. rcldb knows about the void* */ - bool toNativeQuery(Rcl::Db &db, void *); - + bool toNativeQuery(Rcl::Db &db, void *, int maxexp, int maxcl); /** We become the owner of cl and will delete it */ bool addClause(SearchDataClause *cl); @@ -175,7 +174,7 @@ private: bool expandFileTypes(RclConfig *cfg, std::vector& exptps); bool clausesToQuery(Rcl::Db &db, SClType tp, std::vector& query, - string& reason, void *d); + string& reason, void *d, int, int); /* Copyconst and assignment private and forbidden */ SearchData(const SearchData &) {} @@ -192,7 +191,7 @@ public: m_modifiers(SDCM_NONE), m_weight(1.0) {} virtual ~SearchDataClause() {} - virtual bool toNativeQuery(Rcl::Db &db, void *) = 0; + virtual bool toNativeQuery(Rcl::Db &db, void *, int maxexp, int maxcl) = 0; bool isFileName() const {return m_tp == SCLT_FILENAME ? 
true: false;} virtual std::string getReason() const {return m_reason;} virtual void getTerms(HighlightData & hldata) const = 0; @@ -266,7 +265,7 @@ public: } /** Translate to Xapian query */ - virtual bool toNativeQuery(Rcl::Db &, void *); + virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl); virtual void getTerms(HighlightData& hldata) const { @@ -307,7 +306,7 @@ public: { } - virtual bool toNativeQuery(Rcl::Db &, void *); + virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl); }; /** @@ -326,7 +325,7 @@ public: { } - virtual bool toNativeQuery(Rcl::Db &, void *); + virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl); private: int m_slack; }; @@ -338,9 +337,12 @@ public: : SearchDataClause(tp), m_sub(sub) { } - virtual bool toNativeQuery(Rcl::Db &db, void *p) + virtual bool toNativeQuery(Rcl::Db &db, void *p, int maxexp, int maxcl) { - return m_sub->toNativeQuery(db, p); + bool ret = m_sub->toNativeQuery(db, p, maxexp, maxcl); + if (!ret) + m_reason = m_sub->getReason(); + return ret; } virtual void getTerms(HighlightData& hldata) const diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index f6af2b3b..f9e5961e 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -103,6 +103,17 @@ indexstemminglanguages = english # Actually, this seems a reasonable default for all until someone protests. unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl +# Maximum expansion count for a single term (e.g.: when using wildcards). +# We used to not limit this at all (except for filenames where the limit +# was too low at 1000), but it is unreasonable with a big index. +# Default 10 000 +maxTermExpand = 10000 + +# Maximum number of clauses we add to a single Xapian query. In some cases, +# the result of term expansion can be multiplicative, and we want to avoid +# eating all the memory. 
Default 100 000 +maxXapianClauses = 100000 + # Where to store the database (directory). This may be an absolute path, # else it is taken as relative to the configuration directory (-c argument # or $RECOLL_CONFDIR). @@ -132,18 +143,6 @@ filtersdir = @prefix@/share/recoll/filters # want to change the icons displayed in the result list iconsdir = @prefix@/share/recoll/images -# A list of characters, encoded in UTF-8, which should be handled specially -# when converting text to unaccented lowercase. For example, in Swedish, -# the letter a with diaeresis has full alphabet citizenship and should not -# be turned into an a. Each element in the space-separated list has the -# special character as first element and the translation following -# (multiple chars allowed. The handling of both the lowercase and -# upper-case versions of a character should be specified, as appartenance -# to the list will turn-off both standard accent and case -# processing. ** Changing the list implies a full reindex ** -# Example for Swedish: -# unac_except_trans = åå Åå ää Ää öö Öö - # Should we use the system's 'file -i' command as a final step in file type # identification ? This may be useful, but will usually cause the # indexation of many bogus 'text' files