implemented proper limitation and error reporting in case of truncation for term and query expansions

This commit is contained in:
Jean-Francois Dockes 2012-10-05 12:36:19 +02:00
parent bfd111ecaa
commit c9f6612c10
13 changed files with 217 additions and 105 deletions

View File

@ -569,9 +569,9 @@ recoll
<sect2 id="rcl.indexing.config.gui"> <sect2 id="rcl.indexing.config.gui">
<title>The indexing configuration GUI</title> <title>The index configuration GUI</title>
<para>Most parameters for a given indexing configuration can <para>Most parameters for a given index configuration can
be set from a <command>recoll</command> GUI running on this be set from a <command>recoll</command> GUI running on this
configuration (either as default, or by setting configuration (either as default, or by setting
<envar>RECOLL_CONFDIR</envar> or the <option>-c</option> <envar>RECOLL_CONFDIR</envar> or the <option>-c</option>
@ -4219,6 +4219,24 @@ skippedPaths = ~/somedir/&lowast;.txt
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry><term><varname>maxTermExpand</varname></term>
<listitem><para>Maximum expansion count for a single term (e.g.:
when using wildcards). The default of 10000 is reasonable and
will avoid queries that appear frozen while the engine is
walking the term list.</para>
</listitem>
</varlistentry>
<varlistentry><term><varname>maxXapianClauses</varname></term>
<listitem><para>Maximum number of elementary clauses we can add
to a single Xapian query. In some cases, the result of term
expansion can be multiplicative, and we want to avoid using
excessive memory. The default of 100 000 should be both
high enough in most cases and compatible with current
typical hardware configurations.</para>
</listitem>
</varlistentry>
<varlistentry><term><varname>nonumbers</varname></term> <varlistentry><term><varname>nonumbers</varname></term>
<listitem><para>If this is set to true, no terms will be generated <listitem><para>If this is set to true, no terms will be generated
for numbers. For example "123", "1.5e6", 192.168.1.4, would not for numbers. For example "123", "1.5e6", 192.168.1.4, would not

View File

@ -195,6 +195,34 @@ ConfSearchPanelW::ConfSearchPanelW(QWidget *parent, ConfNull *config)
)); ));
vboxLayout->addWidget(cp2); vboxLayout->addWidget(cp2);
ConfLink lnk3(new ConfLinkRclRep(config, "maxTermExpand"));
ConfParamIntW* cp3 =
new ConfParamIntW(this, lnk3,
tr("Maximum term expansion count"),
tr("<p>Maximum expansion count for a single term "
"(e.g.: when using wildcards). The default "
"of 10 000 is reasonable and will avoid "
"queries that appear frozen while the engine is "
"walking the term list."
));
vboxLayout->addWidget(cp3);
ConfLink lnk4(new ConfLinkRclRep(config, "maxXapianClauses"));
ConfParamIntW* cp4 =
new ConfParamIntW(this, lnk4,
tr("Maximum Xapian clauses count"),
tr("<p>Maximum number of elementary clauses we "
"add to a single Xapian query. In some cases, "
"the result of term expansion can be "
"multiplicative, and we want to avoid using "
"excessive memory. The default of 100 000 "
"should be both high enough in most cases "
"and compatible with current typical hardware "
"configurations."
));
vboxLayout->addWidget(cp4);
vboxLayout->insertStretch(-1); vboxLayout->insertStretch(-1);
} }

View File

@ -138,7 +138,10 @@ class DocSequence {
{ {
return std::list<std::string>(); return std::list<std::string>();
} }
/** Return the stored reason text (m_reason). Derived sequences fill it
 *  in, e.g. when setting up the query fails. */
virtual std::string getReason() {return m_reason;}
/** Optional functionality. */ /** Optional functionality. */
virtual bool canFilter() {return false;} virtual bool canFilter() {return false;}
virtual bool canSort() {return false;} virtual bool canSort() {return false;}
@ -154,6 +157,7 @@ class DocSequence {
protected: protected:
static std::string o_sort_trans; static std::string o_sort_trans;
static std::string o_filt_trans; static std::string o_filt_trans;
std::string m_reason;
private: private:
std::string m_title; std::string m_title;
}; };
@ -206,6 +210,12 @@ public:
return false; return false;
return m_seq->getEnclosing(doc, pdoc); return m_seq->getEnclosing(doc, pdoc);
} }
/**
 * Return the reason string reported by the wrapped source sequence.
 * @return the source sequence's reason, or an empty string when no
 *         source sequence is attached.
 */
virtual std::string getReason()
{
    // The return type is std::string: 'return false' here would either
    // fail to compile or build a string from a null pointer.
    if (m_seq.isNull())
        return std::string();
    return m_seq->getReason();
}
virtual std::string title() {return m_seq->title();} virtual std::string title() {return m_seq->title();}
virtual RefCntr<DocSequence> getSourceSeq() {return m_seq;} virtual RefCntr<DocSequence> getSourceSeq() {return m_seq;}

View File

@ -51,14 +51,16 @@ string DocSequenceDb::getDescription()
bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh) bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh)
{ {
setQuery(); if (!setQuery())
return false;
if (sh) sh->erase(); if (sh) sh->erase();
return m_q->getDoc(num, doc); return m_q->getDoc(num, doc);
} }
int DocSequenceDb::getResCnt() int DocSequenceDb::getResCnt()
{ {
setQuery(); if (!setQuery())
return false;
if (m_rescnt < 0) { if (m_rescnt < 0) {
m_rescnt= m_q->getResCnt(); m_rescnt= m_q->getResCnt();
} }
@ -71,7 +73,8 @@ static const string cstr_mre("[...]");
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<Rcl::Snippet>& vpabs) bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<Rcl::Snippet>& vpabs)
{ {
LOGDEB(("DocSequenceDb::getAbstract/pair\n")); LOGDEB(("DocSequenceDb::getAbstract/pair\n"));
setQuery(); if (!setQuery())
return false;
// Have to put the limit somewhere. // Have to put the limit somewhere.
int maxoccs = 500; int maxoccs = 500;
@ -93,7 +96,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<Rcl::Snippet>& vpabs)
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs) bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
{ {
setQuery(); if (!setQuery())
return false;
if (m_q->whatDb() && if (m_q->whatDb() &&
m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) { m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) {
m_q->makeDocAbstract(doc, vabs); m_q->makeDocAbstract(doc, vabs);
@ -105,7 +109,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term) int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
{ {
setQuery(); if (!setQuery())
return false;
if (m_q->whatDb()) { if (m_q->whatDb()) {
return m_q->getFirstMatchPage(doc, term); return m_q->getFirstMatchPage(doc, term);
} }
@ -114,7 +119,8 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc) bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{ {
setQuery(); if (!setQuery())
return false;
string udi; string udi;
if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath, if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
udi)) udi))
@ -124,7 +130,8 @@ bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
list<string> DocSequenceDb::expand(Rcl::Doc &doc) list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{ {
setQuery(); if (!setQuery())
return list<string>();
vector<string> v = m_q->expand(doc); vector<string> v = m_q->expand(doc);
return list<string>(v.begin(), v.end()); return list<string>(v.begin(), v.end());
} }
@ -209,13 +216,10 @@ bool DocSequenceDb::setQuery()
return true; return true;
m_rescnt = -1; m_rescnt = -1;
m_needSetQuery = !m_q->setQuery(m_fsdata); m_needSetQuery = !m_q->setQuery(m_fsdata);
if (m_needSetQuery) {
#if 0 m_reason = m_q->getReason();
HighlightData hld; LOGERR(("DocSequenceDb::setQuery: rclquery::setQuery failed: %s\n",
m_fsdata->getTerms(hld); m_reason.c_str()));
string str; }
hld.toString(str);
fprintf(stderr, "DocSequenceDb::setQuery: terms: %s\n", str.c_str());
#endif
return !m_needSetQuery; return !m_needSetQuery;
} }

View File

@ -67,6 +67,7 @@ class DocSequenceDb : public DocSequence {
bool m_isFiltered; bool m_isFiltered;
bool m_isSorted; bool m_isSorted;
bool m_needSetQuery; // search data changed, need to reapply before fetch bool m_needSetQuery; // search data changed, need to reapply before fetch
bool setQuery(); bool setQuery();
}; };

View File

@ -319,7 +319,10 @@ int recollq(RclConfig **cfp, int argc, char **argv)
query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true); query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
} }
Chrono chron; Chrono chron;
query.setQuery(rq); if (!query.setQuery(rq)) {
cerr << "Query setup failed: " << query.getReason() << endl;
return(1);
}
int cnt = query.getResCnt(); int cnt = query.getResCnt();
if (!(op_flags & OPT_b)) { if (!(op_flags & OPT_b)) {
cout << "Recoll query: " << rq->getDescription() << endl; cout << "Recoll query: " << rq->getDescription() << endl;

View File

@ -337,37 +337,43 @@ void ResListPager::displayPage(RclConfig *config)
if (pageEmpty()) { if (pageEmpty()) {
chunk << trans("<p><b>No results found</b><br>"); chunk << trans("<p><b>No results found</b><br>");
HighlightData hldata; string reason = m_docSource->getReason();
m_docSource->getTerms(hldata); if (!reason.empty()) {
vector<string> uterms(hldata.uterms.begin(), hldata.uterms.end()); chunk << "<blockquote>" << escapeHtml(reason) <<
if (!uterms.empty()) { "</blockquote></p>";
map<string, vector<string> > spellings; } else {
suggest(uterms, spellings); HighlightData hldata;
if (!spellings.empty()) { m_docSource->getTerms(hldata);
if (o_index_stripchars) { vector<string> uterms(hldata.uterms.begin(), hldata.uterms.end());
chunk << if (!uterms.empty()) {
trans("<p><i>Alternate spellings (accents suppressed): </i>") map<string, vector<string> > spellings;
<< "<br /><blockquote>"; suggest(uterms, spellings);
} else { if (!spellings.empty()) {
chunk << if (o_index_stripchars) {
trans("<p><i>Alternate spellings: </i>") chunk <<
<< "<br /><blockquote>"; trans("<p><i>Alternate spellings (accents suppressed): </i>")
<< "<br /><blockquote>";
} else {
chunk <<
trans("<p><i>Alternate spellings: </i>")
<< "<br /><blockquote>";
}
for (map<string, vector<string> >::const_iterator it0 =
spellings.begin(); it0 != spellings.end(); it0++) {
chunk << "<b>" << it0->first << "</b> : ";
for (vector<string>::const_iterator it =
it0->second.begin();
it != it0->second.end(); it++) {
chunk << *it << " ";
} }
chunk << "<br />";
for (map<string, vector<string> >::const_iterator it0 =
spellings.begin(); it0 != spellings.end(); it0++) {
chunk << "<b>" << it0->first << "</b> : ";
for (vector<string>::const_iterator it =
it0->second.begin();
it != it0->second.end(); it++) {
chunk << *it << " ";
}
chunk << "<br />";
}
chunk << "</blockquote></p>";
} }
chunk << "</blockquote></p>"; }
} }
}
} else { } else {
unsigned int resCnt = m_docSource->getResCnt(); unsigned int resCnt = m_docSource->getResCnt();
if (m_winfirst + m_respage.size() < resCnt) { if (m_winfirst + m_respage.size() < resCnt) {

View File

@ -1431,7 +1431,7 @@ bool Db::purgeFile(const string &udi, bool *existed)
} }
// File name wild card expansion. This is a specialisation of termMatch // File name wild card expansion. This is a specialisation of termMatch
bool Db::filenameWildExp(const string& fnexp, vector<string>& names) bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
{ {
string pattern = fnexp; string pattern = fnexp;
names.clear(); names.clear();
@ -1449,7 +1449,7 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str())); LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
TermMatchResult result; TermMatchResult result;
if (!termMatch(ET_WILD, string(), pattern, result, -1, if (!termMatch(ET_WILD, string(), pattern, result, max,
unsplitFilenameFieldName)) unsplitFilenameFieldName))
return false; return false;
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin(); for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
@ -1459,7 +1459,7 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
if (names.empty()) { if (names.empty()) {
// Build an impossible query: we know it's impossible because we // Build an impossible query: we know it's impossible because we
// control the prefixes! // control the prefixes!
names.push_back("XNONENoMatchingTerms"); names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
} }
return true; return true;
} }

View File

@ -315,7 +315,7 @@ class Db {
bool maxYearSpan(int *minyear, int *maxyear); bool maxYearSpan(int *minyear, int *maxyear);
/** Wildcard expansion specific to file names. Internal/sdata use only */ /** Wildcard expansion specific to file names. Internal/sdata use only */
bool filenameWildExp(const string& exp, vector<string>& names); bool filenameWildExp(const string& exp, vector<string>& names, int max);
/** Set parameters for synthetic abstract generation */ /** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);

View File

@ -192,9 +192,14 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
m_nq->clear(); m_nq->clear();
m_sd = sdata; m_sd = sdata;
int maxexp = 10000;
m_db->getConf()->getConfParam("maxTermExpand", &maxexp);
int maxcl = 100000;
m_db->getConf()->getConfParam("maxXapianClauses", &maxcl);
Xapian::Query xq; Xapian::Query xq;
if (!sdata->toNativeQuery(*m_db, &xq)) { if (!sdata->toNativeQuery(*m_db, &xq, maxexp, maxcl)) {
m_reason += sdata->getReason(); m_reason += sdata->getReason();
return false; return false;
} }

View File

@ -201,14 +201,16 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
vector<SearchDataClause*>& query, vector<SearchDataClause*>& query,
string& reason, void *d) string& reason, void *d,
int maxexp, int maxcl)
{ {
Xapian::Query xq; Xapian::Query xq;
for (qlist_it_t it = query.begin(); it != query.end(); it++) { for (qlist_it_t it = query.begin(); it != query.end(); it++) {
Xapian::Query nq; Xapian::Query nq;
if (!(*it)->toNativeQuery(db, &nq)) { if (!(*it)->toNativeQuery(db, &nq, maxexp, maxcl)) {
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n")); LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
reason = (*it)->getReason(); (*it)->getReason().c_str()));
reason += (*it)->getReason() + " ";
return false; return false;
} }
if (nq.empty()) { if (nq.empty()) {
@ -236,6 +238,13 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
} else { } else {
xq = Xapian::Query(op, xq, nq); xq = Xapian::Query(op, xq, nq);
} }
if (int(xq.get_length()) >= maxcl) {
LOGERR(("Maximum Xapian query size exceeded."
" Maybe increase maxXapianClauses."));
m_reason += "Maximum Xapian query size exceeded."
" Maybe increase maxXapianClauses.";
return false;
}
} }
if (xq.empty()) if (xq.empty())
xq = Xapian::Query::MatchAll; xq = Xapian::Query::MatchAll;
@ -244,7 +253,7 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
return true; return true;
} }
bool SearchData::toNativeQuery(Rcl::Db &db, void *d) bool SearchData::toNativeQuery(Rcl::Db &db, void *d, int maxexp, int maxcl)
{ {
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str())); LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
m_reason.erase(); m_reason.erase();
@ -252,8 +261,9 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
// Walk the clause list translating each in turn and building the // Walk the clause list translating each in turn and building the
// Xapian query tree // Xapian query tree
Xapian::Query xq; Xapian::Query xq;
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) { if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq, maxexp, maxcl)) {
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n")); LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
m_reason.c_str()));
return false; return false;
} }
@ -620,10 +630,10 @@ private:
class StringToXapianQ { class StringToXapianQ {
public: public:
StringToXapianQ(Db& db, HighlightData& hld, const string& field, StringToXapianQ(Db& db, HighlightData& hld, const string& field,
const string &stmlng, bool boostUser) const string &stmlng, bool boostUser, int maxexp, int maxcl)
: m_db(db), m_field(field), m_stemlang(stmlng), : m_db(db), m_field(field), m_stemlang(stmlng),
m_doBoostUserTerms(boostUser), m_hld(hld), m_autodiacsens(false), m_doBoostUserTerms(boostUser), m_hld(hld), m_autodiacsens(false),
m_autocasesens(true) m_autocasesens(true), m_maxexp(maxexp), m_maxcl(maxcl), m_curcl(0)
{ {
m_db.getConf()->getConfParam("autodiacsens", &m_autodiacsens); m_db.getConf()->getConfParam("autodiacsens", &m_autodiacsens);
m_db.getConf()->getConfParam("autocasesens", &m_autocasesens); m_db.getConf()->getConfParam("autocasesens", &m_autocasesens);
@ -635,15 +645,15 @@ public:
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
int slack = 0, bool useNear = false); int slack = 0, bool useNear = false);
private: private:
void expandTerm(int mods, bool expandTerm(string& ermsg, int mods,
const string& term, vector<string>& exp, const string& term, vector<string>& exp,
string& sterm, const string& prefix); string& sterm, const string& prefix);
// After splitting entry on whitespace: process non-phrase element // After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, void processSimpleSpan(string& ermsg, const string& span,
int mods, int mods,
vector<Xapian::Query> &pqueries); vector<Xapian::Query> &pqueries);
// Process phrase/near element // Process phrase/near element
void processPhraseOrNear(TextSplitQ *splitData, void processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
int mods, int mods,
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
bool useNear, int slack); bool useNear, int slack);
@ -655,6 +665,9 @@ private:
HighlightData& m_hld; HighlightData& m_hld;
bool m_autodiacsens; bool m_autodiacsens;
bool m_autocasesens; bool m_autocasesens;
int m_maxexp;
int m_maxcl;
int m_curcl;
}; };
#if 1 #if 1
@ -679,7 +692,7 @@ static void listVector(const string& what, const vector<string>&l)
* has it already. Used in the simple case where there is nothing to expand, * has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it). * and we just return the prefixed term (else Db::termMatch deals with it).
*/ */
void StringToXapianQ::expandTerm(int mods, bool StringToXapianQ::expandTerm(string& ermsg, int mods,
const string& term, const string& term,
vector<string>& oexp, string &sterm, vector<string>& oexp, string &sterm,
const string& prefix) const string& prefix)
@ -689,7 +702,7 @@ void StringToXapianQ::expandTerm(int mods,
sterm.clear(); sterm.clear();
oexp.clear(); oexp.clear();
if (term.empty()) if (term.empty())
return; return true;
bool haswild = term.find_first_of(cstr_minwilds) != string::npos; bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
@ -753,7 +766,7 @@ void StringToXapianQ::expandTerm(int mods,
oexp.push_back(prefix + term); oexp.push_back(prefix + term);
m_hld.terms[term] = m_hld.uterms.size() - 1; m_hld.terms[term] = m_hld.uterms.size() - 1;
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
return; return true;
} }
// Make objects before the goto jungle to avoid compiler complaints // Make objects before the goto jungle to avoid compiler complaints
@ -770,7 +783,7 @@ void StringToXapianQ::expandTerm(int mods,
// expansion, which means that we are casediac-sensitive. There // expansion, which means that we are casediac-sensitive. There
// would be nothing to prevent us to expand from the casediac // would be nothing to prevent us to expand from the casediac
// synonyms first. To be done later // synonyms first. To be done later
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field); m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang,term,res,m_maxexp,m_field);
goto termmatchtoresult; goto termmatchtoresult;
} }
@ -778,14 +791,14 @@ void StringToXapianQ::expandTerm(int mods,
#ifdef RCL_INDEX_STRIPCHARS #ifdef RCL_INDEX_STRIPCHARS
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field); m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, m_maxexp, m_field);
#else #else
if (o_index_stripchars) { if (o_index_stripchars) {
// If the index is raw, we can only come here if nostemexp is unset // If the index is raw, we can only come here if nostemexp is unset
// and we just need stem expansion. // and we just need stem expansion.
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field); m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang,term,res,m_maxexp,m_field);
goto termmatchtoresult; goto termmatchtoresult;
} }
@ -854,12 +867,17 @@ exptotermatch:
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str())); LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
for (vector<string>::const_iterator it = lexp.begin(); for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) { it != lexp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field); m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,m_maxexp,m_field);
} }
#endif #endif
// Term match entries to vector of terms // Term match entries to vector of terms
termmatchtoresult: termmatchtoresult:
if (int(res.entries.size()) >= m_maxexp) {
ermsg = "Maximum term expansion size exceeded."
" Maybe increase maxTermExpand.";
return false;
}
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) { it != res.entries.end(); it++) {
oexp.push_back(it->term); oexp.push_back(it->term);
@ -876,6 +894,7 @@ termmatchtoresult:
m_hld.terms[strip_prefix(*it)] = term; m_hld.terms[strip_prefix(*it)] = term;
} }
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
return true;
} }
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d // Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
@ -912,7 +931,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
} }
} }
void StringToXapianQ::processSimpleSpan(const string& span, void StringToXapianQ::processSimpleSpan(string& ermsg, const string& span,
int mods, int mods,
vector<Xapian::Query> &pqueries) vector<Xapian::Query> &pqueries)
{ {
@ -927,7 +946,8 @@ void StringToXapianQ::processSimpleSpan(const string& span,
prefix = wrap_prefix(ftp->pfx); prefix = wrap_prefix(ftp->pfx);
} }
expandTerm(mods, span, exp, sterm, prefix); if (!expandTerm(ermsg, mods, span, exp, sterm, prefix))
return;
// Set up the highlight data. No prefix should go in there // Set up the highlight data. No prefix should go in there
for (vector<string>::const_iterator it = exp.begin(); for (vector<string>::const_iterator it = exp.begin();
@ -939,6 +959,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
// Push either term or OR of stem-expanded set // Push either term or OR of stem-expanded set
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end()); Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
m_curcl += exp.size();
// If sterm (simplified original user term) is not null, give it a // If sterm (simplified original user term) is not null, give it a
// relevance boost. We do this even if no expansion occurred (else // relevance boost. We do this even if no expansion occurred (else
@ -957,7 +978,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
// NEAR xapian query, the elements of which can themselves be OR // NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we // queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though) // don't do stemming for PHRASE though)
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, void StringToXapianQ::processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
int mods, int mods,
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
bool useNear, int slack) bool useNear, int slack)
@ -999,7 +1020,8 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
lmods |= SearchDataClause::SDCM_NOSTEMMING; lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm; string sterm;
vector<string> exp; vector<string> exp;
expandTerm(lmods, *it, exp, sterm, prefix); if (!expandTerm(ermsg, lmods, *it, exp, sterm, prefix))
return;
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
listVector("", exp); listVector("", exp);
// groups is used for highlighting, we don't want prefixes in there. // groups is used for highlighting, we don't want prefixes in there.
@ -1011,6 +1033,9 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
groups.push_back(noprefs); groups.push_back(noprefs);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end())); exp.begin(), exp.end()));
m_curcl += exp.size();
if (m_curcl >= m_maxcl)
return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1) if (exp.size() > 1)
hadmultiple = true; hadmultiple = true;
@ -1099,7 +1124,7 @@ bool StringToXapianQ::processUserString(const string &iq,
"slack %d near %d\n", "slack %d near %d\n",
iq.c_str(), m_field.c_str(), mods, slack, useNear)); iq.c_str(), m_field.c_str(), mods, slack, useNear));
ermsg.erase(); ermsg.erase();
m_curcl = 0;
const StopList stops = m_db.getStopList(); const StopList stops = m_db.getStopList();
// Simple whitespace-split input into user-level words and // Simple whitespace-split input into user-level words and
@ -1165,12 +1190,18 @@ bool StringToXapianQ::processUserString(const string &iq,
if (splitter.nostemexps.front()) if (splitter.nostemexps.front())
lmods |= SearchDataClause::SDCM_NOSTEMMING; lmods |= SearchDataClause::SDCM_NOSTEMMING;
m_hld.ugroups.push_back(vector<string>(1, *it)); m_hld.ugroups.push_back(vector<string>(1, *it));
processSimpleSpan(splitter.terms.front(), lmods, pqueries); processSimpleSpan(ermsg,splitter.terms.front(),lmods, pqueries);
} }
break; break;
default: default:
m_hld.ugroups.push_back(vector<string>(1, *it)); m_hld.ugroups.push_back(vector<string>(1, *it));
processPhraseOrNear(&splitter, mods, pqueries, useNear, slack); processPhraseOrNear(ermsg, &splitter, mods, pqueries,
useNear, slack);
}
if (m_curcl >= m_maxcl) {
ermsg = "Maximum Xapian query size exceeded."
" Maybe increase maxXapianClauses.";
break;
} }
} }
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
@ -1190,7 +1221,8 @@ bool StringToXapianQ::processUserString(const string &iq,
} }
// Translate a simple OR, AND, or EXCL search clause. // Translate a simple OR, AND, or EXCL search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p) bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
int maxexp, int maxcl)
{ {
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n", LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
getStemLang().c_str())); getStemLang().c_str()));
@ -1216,7 +1248,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
(m_parentSearch && !m_parentSearch->haveWildCards()) || (m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards); (m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm); StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm,
maxexp, maxcl);
if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries)) if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
@ -1240,13 +1273,14 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
// about expanding multiple fragments in the past. We just take the // about expanding multiple fragments in the past. We just take the
// value blanks and all and expand this against the indexed unsplit // value blanks and all and expand this against the indexed unsplit
// file names // file names
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p) bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
int maxexp, int)
{ {
Xapian::Query *qp = (Xapian::Query *)p; Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query(); *qp = Xapian::Query();
vector<string> names; vector<string> names;
db.filenameWildExp(m_text, names); db.filenameWildExp(m_text, names, maxexp);
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
if (m_weight != 1.0) { if (m_weight != 1.0) {
@ -1256,7 +1290,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
} }
// Translate NEAR or PHRASE clause. // Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p) bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
int maxexp, int maxcl)
{ {
LOGDEB(("SearchDataClauseDist::toNativeQuery\n")); LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
@ -1281,7 +1316,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
} }
string s = cstr_dquote + m_text + cstr_dquote; string s = cstr_dquote + m_text + cstr_dquote;
bool useNear = (m_tp == SCLT_NEAR); bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm); StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm,
maxexp, maxcl);
if (!tr.processUserString(s, getModifiers(), m_reason, pqueries, if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
m_slack, useNear)) m_slack, useNear))
return false; return false;

View File

@ -89,8 +89,7 @@ public:
bool haveWildCards() {return m_haveWildCards;} bool haveWildCards() {return m_haveWildCards;}
/** Translate to Xapian query. rcldb knows about the void* */ /** Translate to Xapian query. rcldb knows about the void* */
bool toNativeQuery(Rcl::Db &db, void *); bool toNativeQuery(Rcl::Db &db, void *, int maxexp, int maxcl);
/** We become the owner of cl and will delete it */ /** We become the owner of cl and will delete it */
bool addClause(SearchDataClause *cl); bool addClause(SearchDataClause *cl);
@ -175,7 +174,7 @@ private:
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps); bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
bool clausesToQuery(Rcl::Db &db, SClType tp, bool clausesToQuery(Rcl::Db &db, SClType tp,
std::vector<SearchDataClause*>& query, std::vector<SearchDataClause*>& query,
string& reason, void *d); string& reason, void *d, int, int);
/* Copyconst and assignment private and forbidden */ /* Copyconst and assignment private and forbidden */
SearchData(const SearchData &) {} SearchData(const SearchData &) {}
@ -192,7 +191,7 @@ public:
m_modifiers(SDCM_NONE), m_weight(1.0) m_modifiers(SDCM_NONE), m_weight(1.0)
{} {}
virtual ~SearchDataClause() {} virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0; virtual bool toNativeQuery(Rcl::Db &db, void *, int maxexp, int maxcl) = 0;
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;} bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
virtual std::string getReason() const {return m_reason;} virtual std::string getReason() const {return m_reason;}
virtual void getTerms(HighlightData & hldata) const = 0; virtual void getTerms(HighlightData & hldata) const = 0;
@ -266,7 +265,7 @@ public:
} }
/** Translate to Xapian query */ /** Translate to Xapian query */
virtual bool toNativeQuery(Rcl::Db &, void *); virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl);
virtual void getTerms(HighlightData& hldata) const virtual void getTerms(HighlightData& hldata) const
{ {
@ -307,7 +306,7 @@ public:
{ {
} }
virtual bool toNativeQuery(Rcl::Db &, void *); virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl);
}; };
/** /**
@ -326,7 +325,7 @@ public:
{ {
} }
virtual bool toNativeQuery(Rcl::Db &, void *); virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl);
private: private:
int m_slack; int m_slack;
}; };
@ -338,9 +337,12 @@ public:
: SearchDataClause(tp), m_sub(sub) : SearchDataClause(tp), m_sub(sub)
{ {
} }
virtual bool toNativeQuery(Rcl::Db &db, void *p) virtual bool toNativeQuery(Rcl::Db &db, void *p, int maxexp, int maxcl)
{ {
return m_sub->toNativeQuery(db, p); bool ret = m_sub->toNativeQuery(db, p, maxexp, maxcl);
if (!ret)
m_reason = m_sub->getReason();
return ret;
} }
virtual void getTerms(HighlightData& hldata) const virtual void getTerms(HighlightData& hldata) const

View File

@ -103,6 +103,17 @@ indexstemminglanguages = english
# Actually, this seems a reasonable default for all until someone protests. # Actually, this seems a reasonable default for all until someone protests.
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
# Maximum expansion count for a single term (e.g.: when using wildcards).
# We used to not limit this at all (except for filenames where the limit
# was too low at 1000), but it is unreasonable with a big index.
# Default 10 000
maxTermExpand = 10000
# Maximum number of clauses we add to a single Xapian query. In some cases,
# the result of term expansion can be multiplicative, and we want to avoid
# eating all the memory. Default 100 000
maxXapianClauses = 100000
# Where to store the database (directory). This may be an absolute path, # Where to store the database (directory). This may be an absolute path,
# else it is taken as relative to the configuration directory (-c argument # else it is taken as relative to the configuration directory (-c argument
# or $RECOLL_CONFDIR). # or $RECOLL_CONFDIR).
@ -132,18 +143,6 @@ filtersdir = @prefix@/share/recoll/filters
# want to change the icons displayed in the result list # want to change the icons displayed in the result list
iconsdir = @prefix@/share/recoll/images iconsdir = @prefix@/share/recoll/images
# A list of characters, encoded in UTF-8, which should be handled specially
# when converting text to unaccented lowercase. For example, in Swedish,
# the letter a with diaeresis has full alphabet citizenship and should not
# be turned into an a. Each element in the space-separated list has the
# special character as first element and the translation following
# (multiple chars allowed. The handling of both the lowercase and
# upper-case versions of a character should be specified, as appartenance
# to the list will turn-off both standard accent and case
# processing. ** Changing the list implies a full reindex **
# Example for Swedish:
# unac_except_trans = åå Åå ää Ää öö Öö
# Should we use the system's 'file -i' command as a final step in file type # Should we use the system's 'file -i' command as a final step in file type
# identification ? This may be useful, but will usually cause the # identification ? This may be useful, but will usually cause the
# indexation of many bogus 'text' files # indexation of many bogus 'text' files