From a16d047f8db05a3504d202a7addbf695eb37c36f Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 8 Oct 2012 14:30:14 +0200 Subject: [PATCH] Snippet generation: limit positions walk to max hit position. Return status code when truncated walk possibly generated incomplete snippets. Implement config variable for max pos walk --- src/rcldb/rclabstract.cpp | 33 +++++++++++++++++++++++---------- src/rcldb/rcldb.cpp | 2 +- src/rcldb/rclquery.cpp | 4 +++- src/rcldb/rclquery.h | 4 +++- src/rcldb/searchdata.cpp | 4 ++-- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index 690d1372..c60743c8 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -342,6 +342,10 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, // them with their snippets. unordered_set searchTermPositions; + // Remember max position. Used to stop walking positions lists while + // populating the adjacent slots. + unsigned int maxpos = 0; + // Total number of occurences for all terms. We stop when we have too much unsigned int totaloccs = 0; @@ -419,6 +423,8 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, if (ii == (unsigned int)ipos) { sparseDoc[ii] = qterm; searchTermPositions.insert(ii); + if (ii > maxpos) + maxpos = ii; } else if (ii > (unsigned int)ipos && ii < (unsigned int)ipos + qtrmwrdcnt) { sparseDoc[ii] = occupiedmarker; @@ -460,6 +466,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, } LOGABS(("makeAbstract:%d:chosen number of positions %d\n", chron.millis(), totaloccs)); + maxpos += ctxwords + 1; // This can happen if there are term occurences in the keywords // etc. but not elsewhere ? @@ -472,31 +479,37 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, // around the query terms. We arbitrarily truncate the list to // avoid taking forever. 
If we do cutoff, the abstract may be // inconsistant (missing words, potentially altering meaning), - // which is bad. + // which is bad. { Xapian::TermIterator term; - int cutoff = 500 * 1000; - + int cutoff = m_q->m_snipMaxPosWalk; for (term = xrdb.termlist_begin(docid); term != xrdb.termlist_end(docid); term++) { // Ignore prefixed terms if (has_prefix(*term)) continue; - if (cutoff-- < 0) { - ret = ABSRES_TRUNC; - LOGDEB0(("makeAbstract: max term count cutoff\n")); + if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { + ret = ABSRES_TERMMISS; + LOGDEB0(("makeAbstract: max term count cutoff %d\n", + m_q->m_snipMaxPosWalk)); break; } + map::iterator vit; Xapian::PositionIterator pos; for (pos = xrdb.positionlist_begin(docid, *term); pos != xrdb.positionlist_end(docid, *term); pos++) { - if (cutoff-- < 0) { - ret = ABSRES_TRUNC; - LOGDEB0(("makeAbstract: max term count cutoff\n")); + if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { + ret = ABSRES_TERMMISS; + LOGDEB0(("makeAbstract: max term count cutoff %d\n", + m_q->m_snipMaxPosWalk)); + break; + } + // If we are beyond the max possible position, stop + // for this term + if (*pos > maxpos) { break; } - map::iterator vit; if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) { // Don't replace a term: the terms list is in // alphabetic order, and we may have several terms diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index f75e909d..01e45525 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1618,7 +1618,7 @@ bool Db::termMatch(MatchType typ, const string &lang, case 0: is = prefix; break; default: is = prefix + droot.substr(0, es); break; } - LOGDEB(("termMatch: initsec: [%s]\n", is.c_str())); + LOGDEB1(("termMatch: initsec: [%s]\n", is.c_str())); for (int tries = 0; tries < 2; tries++) { try { diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 475cb1da..d2e2dae6 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -141,8 +141,10 @@ private: Query::Query(Db *db) : 
m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true), - m_collapseDuplicates(false), m_resCnt(-1) + m_collapseDuplicates(false), m_resCnt(-1), m_snipMaxPosWalk(1000000) { + if (db) + db->getConf()->getConfParam("snippetMaxPosWalk", &m_snipMaxPosWalk); } Query::~Query() diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index a9e9d893..b9ad8aa1 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -32,7 +32,8 @@ class Doc; enum abstract_result { ABSRES_ERROR = 0, ABSRES_OK = 1, - ABSRES_TRUNC = 2 + ABSRES_TRUNC = 2, + ABSRES_TERMMISS = 3 }; // Snippet entry for makeDocAbstract @@ -126,6 +127,7 @@ private: bool m_collapseDuplicates; int m_resCnt; RefCntr m_sd; + int m_snipMaxPosWalk; /* Copyconst and assignement private and forbidden */ Query(const Query &) {} diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 9d4ac3ec..61b35328 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -598,8 +598,8 @@ public: if (m_ts->lastpos < pos) m_ts->lastpos = pos; bool noexpand = be ? m_ts->curnostemexp : true; - LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", - term.c_str(), pos, noexpand)); + LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", + term.c_str(), pos, noexpand)); if (m_terms[pos].size() < term.size()) { m_terms[pos] = term; m_nste[pos] = noexpand;