From c9f6612c10f517b7da0b7e42d58f086f4555383d Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Fri, 5 Oct 2012 12:36:19 +0200
Subject: [PATCH] implemented proper limitation and error reporting in case of
truncation for term and query expansions
---
src/doc/user/usermanual.sgml | 22 ++++++-
src/qtgui/confgui/confguiindex.cpp | 28 ++++++++
src/query/docseq.h | 12 +++-
src/query/docseqdb.cpp | 34 +++++-----
src/query/docseqdb.h | 1 +
src/query/recollq.cpp | 5 +-
src/query/reslistpager.cpp | 62 ++++++++++--------
src/rcldb/rcldb.cpp | 6 +-
src/rcldb/rcldb.h | 2 +-
src/rcldb/rclquery.cpp | 7 +-
src/rcldb/searchdata.cpp | 100 ++++++++++++++++++++---------
src/rcldb/searchdata.h | 20 +++---
src/sampleconf/recoll.conf.in | 23 ++++---
13 files changed, 217 insertions(+), 105 deletions(-)
diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml
index bcf15a5b..4715a6f7 100644
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@@ -569,9 +569,9 @@ recoll
- The indexing configuration GUI
+ The index configuration GUI
- Most parameters for a given indexing configuration can
+ Most parameters for a given index configuration can
be set from a recoll GUI running on this
configuration (either as default, or by setting
RECOLL_CONFDIR or the
@@ -4219,6 +4219,24 @@ skippedPaths = ~/somedir/∗.txt
+ maxTermExpand
+ Maximum expansion count for a single term (e.g.:
+ when using wildcards). The default of 10 000 is reasonable and
+ will avoid queries that appear frozen while the engine is
+ walking the term list.
+
+
+
+ maxXapianClauses
+ Maximum number of elementary clauses we can add
+ to a single Xapian query. In some cases, the result of term
+ expansion can be multiplicative, and we want to avoid using
+ excessive memory. The default of 100 000 should be both
+ high enough in most cases and compatible with current
+ typical hardware configurations.
+
+
+
nonumbersIf this set to true, no terms will be generated
for numbers. For example "123", "1.5e6", 192.168.1.4, would not
diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp
index 50e9891c..9b9048f8 100644
--- a/src/qtgui/confgui/confguiindex.cpp
+++ b/src/qtgui/confgui/confguiindex.cpp
@@ -195,6 +195,34 @@ ConfSearchPanelW::ConfSearchPanelW(QWidget *parent, ConfNull *config)
));
vboxLayout->addWidget(cp2);
+ ConfLink lnk3(new ConfLinkRclRep(config, "maxTermExpand"));
+ ConfParamIntW* cp3 =
+ new ConfParamIntW(this, lnk3,
+ tr("Maximum term expansion count"),
+ tr("<p>Maximum expansion count for a single term "
+ "(e.g.: when using wildcards). The default "
+ "of 10 000 is reasonable and will avoid "
+ "queries that appear frozen while the engine is "
+ "walking the term list."
+ ));
+ vboxLayout->addWidget(cp3);
+
+
+ ConfLink lnk4(new ConfLinkRclRep(config, "maxXapianClauses"));
+ ConfParamIntW* cp4 =
+ new ConfParamIntW(this, lnk4,
+ tr("Maximum Xapian clauses count"),
+ tr("<p>Maximum number of elementary clauses we "
+ "add to a single Xapian query. In some cases, "
+ "the result of term expansion can be "
+ "multiplicative, and we want to avoid using "
+ "excessive memory. The default of 100 000 "
+ "should be both high enough in most cases "
+ "and compatible with current typical hardware "
+ "configurations."
+ ));
+ vboxLayout->addWidget(cp4);
+
vboxLayout->insertStretch(-1);
}
diff --git a/src/query/docseq.h b/src/query/docseq.h
index 66a53c86..de10369b 100644
--- a/src/query/docseq.h
+++ b/src/query/docseq.h
@@ -138,7 +138,10 @@ class DocSequence {
{
return std::list();
}
-
+ virtual std::string getReason() // Accessor for the stored failure text
+ {
+ return m_reason; // by-value copy of the protected m_reason member
+ }
/** Optional functionality. */
virtual bool canFilter() {return false;}
virtual bool canSort() {return false;}
@@ -154,6 +157,7 @@ class DocSequence {
protected:
static std::string o_sort_trans;
static std::string o_filt_trans;
+ std::string m_reason;
private:
std::string m_title;
};
@@ -206,6 +210,12 @@ public:
return false;
return m_seq->getEnclosing(doc, pdoc);
}
+ virtual std::string getReason() // Forwards the wrapped sequence's failure text
+ {
+ if (m_seq.isNull())
+ return std::string(); // no source sequence: empty text (was 'return false', not a valid std::string)
+ return m_seq->getReason();
+ }
virtual std::string title() {return m_seq->title();}
virtual RefCntr getSourceSeq() {return m_seq;}
diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp
index c7ece824..73ed0057 100644
--- a/src/query/docseqdb.cpp
+++ b/src/query/docseqdb.cpp
@@ -51,14 +51,16 @@ string DocSequenceDb::getDescription()
bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh)
{
- setQuery();
+ if (!setQuery())
+ return false;
if (sh) sh->erase();
return m_q->getDoc(num, doc);
}
int DocSequenceDb::getResCnt()
{
- setQuery();
+ if (!setQuery())
+ return false;
if (m_rescnt < 0) {
m_rescnt= m_q->getResCnt();
}
@@ -71,7 +73,8 @@ static const string cstr_mre("[...]");
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vpabs)
{
LOGDEB(("DocSequenceDb::getAbstract/pair\n"));
- setQuery();
+ if (!setQuery())
+ return false;
// Have to put the limit somewhere.
int maxoccs = 500;
@@ -93,7 +96,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vpabs)
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vabs)
{
- setQuery();
+ if (!setQuery())
+ return false;
if (m_q->whatDb() &&
m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) {
m_q->makeDocAbstract(doc, vabs);
@@ -105,7 +109,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vabs)
int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
{
- setQuery();
+ if (!setQuery())
+ return false;
if (m_q->whatDb()) {
return m_q->getFirstMatchPage(doc, term);
}
@@ -114,7 +119,8 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{
- setQuery();
+ if (!setQuery())
+ return false;
string udi;
if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
udi))
@@ -124,7 +130,8 @@ bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
list DocSequenceDb::expand(Rcl::Doc &doc)
{
- setQuery();
+ if (!setQuery())
+ return list();
vector v = m_q->expand(doc);
return list(v.begin(), v.end());
}
@@ -209,13 +216,10 @@ bool DocSequenceDb::setQuery()
return true;
m_rescnt = -1;
m_needSetQuery = !m_q->setQuery(m_fsdata);
-
-#if 0
- HighlightData hld;
- m_fsdata->getTerms(hld);
- string str;
- hld.toString(str);
- fprintf(stderr, "DocSequenceDb::setQuery: terms: %s\n", str.c_str());
-#endif
+ if (m_needSetQuery) {
+ m_reason = m_q->getReason();
+ LOGERR(("DocSequenceDb::setQuery: rclquery::setQuery failed: %s\n",
+ m_reason.c_str()));
+ }
return !m_needSetQuery;
}
diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h
index 05a42235..a987f9ff 100644
--- a/src/query/docseqdb.h
+++ b/src/query/docseqdb.h
@@ -67,6 +67,7 @@ class DocSequenceDb : public DocSequence {
bool m_isFiltered;
bool m_isSorted;
bool m_needSetQuery; // search data changed, need to reapply before fetch
+
bool setQuery();
};
diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp
index 6c85e4a0..22a9da63 100644
--- a/src/query/recollq.cpp
+++ b/src/query/recollq.cpp
@@ -319,7 +319,10 @@ int recollq(RclConfig **cfp, int argc, char **argv)
query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
}
Chrono chron;
- query.setQuery(rq);
+ if (!query.setQuery(rq)) {
+ cerr << "Query setup failed: " << query.getReason() << endl;
+ return(1);
+ }
int cnt = query.getResCnt();
if (!(op_flags & OPT_b)) {
cout << "Recoll query: " << rq->getDescription() << endl;
diff --git a/src/query/reslistpager.cpp b/src/query/reslistpager.cpp
index 2164dbda..3cda2cd3 100644
--- a/src/query/reslistpager.cpp
+++ b/src/query/reslistpager.cpp
@@ -337,37 +337,43 @@ void ResListPager::displayPage(RclConfig *config)
if (pageEmpty()) {
chunk << trans("
No results found ");
- HighlightData hldata;
- m_docSource->getTerms(hldata);
- vector uterms(hldata.uterms.begin(), hldata.uterms.end());
- if (!uterms.empty()) {
- map > spellings;
- suggest(uterms, spellings);
- if (!spellings.empty()) {
- if (o_index_stripchars) {
- chunk <<
- trans("