building abstract from stored text: limit the number of terms explored, to avoid taking forever on monster (multi-megaterm) documents

This commit is contained in:
Jean-Francois Dockes 2019-05-17 09:37:39 +02:00
parent 37e203d535
commit fdb14e60ac

View File

@ -37,6 +37,13 @@
using namespace std; using namespace std;
// #define DEBUGABSTRACT
#ifdef DEBUGABSTRACT
#define LOGABS LOGDEB
#else
#define LOGABS LOGDEB2
#endif
// We now let plaintorich do the highlight tags insertions which is // We now let plaintorich do the highlight tags insertions which is
// wasteful because we have most of the information (but the perf hit // wasteful because we have most of the information (but the perf hit
// is small because it's only called on the output fragments, not on // is small because it's only called on the output fragments, not on
@ -104,9 +111,11 @@ public:
const HighlightData& hdata, const HighlightData& hdata,
unordered_map<string, double>& wordcoefs, unordered_map<string, double>& wordcoefs,
unsigned int ctxwords, unsigned int ctxwords,
Flags flags = TXTS_NONE) Flags flags,
unsigned int maxterms)
: TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()), : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) { m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords),
maxtermcount(maxterms) {
// Take note of the group (phrase/near) terms because we need // Take note of the group (phrase/near) terms because we need
// to compute the position lists for them. // to compute the position lists for them.
@ -123,7 +132,12 @@ public:
// add/update fragment definition. // add/update fragment definition.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) { virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
LOGDEB2("takeword: " << term << endl); LOGDEB2("takeword: " << term << endl);
// Limit time taken with monster documents. The resulting
// abstract will be incorrect or inexistant, but this is
// better than taking forever (the default cutoff is 10E6)
if (maxtermcount && termcount++ > maxtermcount) {
return false;
}
// Remember recent past // Remember recent past
m_prevterms.push_back(pair<int,int>(bts,bte)); m_prevterms.push_back(pair<int,int>(bts,bte));
if (m_prevterms.size() > m_ctxwords+1) { if (m_prevterms.size() > m_ctxwords+1) {
@ -329,7 +343,10 @@ private:
unsigned int m_ctxwords; unsigned int m_ctxwords;
// Result: begin and end byte positions of query terms/groups in text // Result: begin and end byte positions of query terms/groups in text
vector<MatchFragment> m_fragments; vector<MatchFragment> m_fragments;
unsigned int termcount{0};
unsigned int maxtermcount{0};
}; };
int Query::Native::abstractFromText( int Query::Native::abstractFromText(
@ -341,14 +358,18 @@ int Query::Native::abstractFromText(
int ctxwords, int ctxwords,
unsigned int maxtotaloccs, unsigned int maxtotaloccs,
vector<Snippet>& vabs, vector<Snippet>& vabs,
Chrono& Chrono& chron
) )
{ {
(void)chron;
LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
string rawtext; string rawtext;
if (!ndb->getRawText(docid, rawtext)) { if (!ndb->getRawText(docid, rawtext)) {
LOGDEB0("abstractFromText: can't fetch text\n"); LOGDEB0("abstractFromText: can't fetch text\n");
return ABSRES_ERROR; return ABSRES_ERROR;
} }
LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
chron.millis() << "mS\n");
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \ #if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
(defined(RAWTEXT_IN_DATA)) (defined(RAWTEXT_IN_DATA))
@ -373,10 +394,13 @@ int Query::Native::abstractFromText(
if (m_q->m_sd) { if (m_q->m_sd) {
m_q->m_sd->getTerms(hld); m_q->m_sd->getTerms(hld);
} }
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords, TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
TextSplit::TXTS_ONLYSPANS); TextSplit::TXTS_ONLYSPANS,
m_q->m_snipMaxPosWalk);
splitter.text_to_words(rawtext); splitter.text_to_words(rawtext);
LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
splitter.updgroups(); splitter.updgroups();
// Sort the fragments by decreasing weight // Sort the fragments by decreasing weight