Building abstract from stored text: limit the number of terms explored to avoid taking forever on monster (multi-million-term) documents
This commit is contained in:
parent
37e203d535
commit
fdb14e60ac
@ -37,6 +37,13 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
// #define DEBUGABSTRACT
|
||||||
|
#ifdef DEBUGABSTRACT
|
||||||
|
#define LOGABS LOGDEB
|
||||||
|
#else
|
||||||
|
#define LOGABS LOGDEB2
|
||||||
|
#endif
|
||||||
|
|
||||||
// We now let plaintorich do the highlight tags insertions which is
|
// We now let plaintorich do the highlight tags insertions which is
|
||||||
// wasteful because we have most of the information (but the perf hit
|
// wasteful because we have most of the information (but the perf hit
|
||||||
// is small because it's only called on the output fragments, not on
|
// is small because it's only called on the output fragments, not on
|
||||||
@ -104,9 +111,11 @@ public:
|
|||||||
const HighlightData& hdata,
|
const HighlightData& hdata,
|
||||||
unordered_map<string, double>& wordcoefs,
|
unordered_map<string, double>& wordcoefs,
|
||||||
unsigned int ctxwords,
|
unsigned int ctxwords,
|
||||||
Flags flags = TXTS_NONE)
|
Flags flags,
|
||||||
|
unsigned int maxterms)
|
||||||
: TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
|
: TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
|
||||||
m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
|
m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords),
|
||||||
|
maxtermcount(maxterms) {
|
||||||
|
|
||||||
// Take note of the group (phrase/near) terms because we need
|
// Take note of the group (phrase/near) terms because we need
|
||||||
// to compute the position lists for them.
|
// to compute the position lists for them.
|
||||||
@ -123,7 +132,12 @@ public:
|
|||||||
// add/update fragment definition.
|
// add/update fragment definition.
|
||||||
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
||||||
LOGDEB2("takeword: " << term << endl);
|
LOGDEB2("takeword: " << term << endl);
|
||||||
|
// Limit time taken with monster documents. The resulting
|
||||||
|
// abstract will be incorrect or nonexistent, but this is
|
||||||
|
// better than taking forever (the default cutoff is 10E6)
|
||||||
|
if (maxtermcount && termcount++ > maxtermcount) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
// Remember recent past
|
// Remember recent past
|
||||||
m_prevterms.push_back(pair<int,int>(bts,bte));
|
m_prevterms.push_back(pair<int,int>(bts,bte));
|
||||||
if (m_prevterms.size() > m_ctxwords+1) {
|
if (m_prevterms.size() > m_ctxwords+1) {
|
||||||
@ -330,6 +344,9 @@ private:
|
|||||||
|
|
||||||
// Result: begin and end byte positions of query terms/groups in text
|
// Result: begin and end byte positions of query terms/groups in text
|
||||||
vector<MatchFragment> m_fragments;
|
vector<MatchFragment> m_fragments;
|
||||||
|
|
||||||
|
unsigned int termcount{0};
|
||||||
|
unsigned int maxtermcount{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
int Query::Native::abstractFromText(
|
int Query::Native::abstractFromText(
|
||||||
@ -341,14 +358,18 @@ int Query::Native::abstractFromText(
|
|||||||
int ctxwords,
|
int ctxwords,
|
||||||
unsigned int maxtotaloccs,
|
unsigned int maxtotaloccs,
|
||||||
vector<Snippet>& vabs,
|
vector<Snippet>& vabs,
|
||||||
Chrono&
|
Chrono& chron
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
(void)chron;
|
||||||
|
LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
|
||||||
string rawtext;
|
string rawtext;
|
||||||
if (!ndb->getRawText(docid, rawtext)) {
|
if (!ndb->getRawText(docid, rawtext)) {
|
||||||
LOGDEB0("abstractFromText: can't fetch text\n");
|
LOGDEB0("abstractFromText: can't fetch text\n");
|
||||||
return ABSRES_ERROR;
|
return ABSRES_ERROR;
|
||||||
}
|
}
|
||||||
|
LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
|
||||||
|
chron.millis() << "mS\n");
|
||||||
|
|
||||||
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
|
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
|
||||||
(defined(RAWTEXT_IN_DATA))
|
(defined(RAWTEXT_IN_DATA))
|
||||||
@ -373,10 +394,13 @@ int Query::Native::abstractFromText(
|
|||||||
if (m_q->m_sd) {
|
if (m_q->m_sd) {
|
||||||
m_q->m_sd->getTerms(hld);
|
m_q->m_sd->getTerms(hld);
|
||||||
}
|
}
|
||||||
|
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
|
||||||
|
|
||||||
TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
|
TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
|
||||||
TextSplit::TXTS_ONLYSPANS);
|
TextSplit::TXTS_ONLYSPANS,
|
||||||
|
m_q->m_snipMaxPosWalk);
|
||||||
splitter.text_to_words(rawtext);
|
splitter.text_to_words(rawtext);
|
||||||
|
LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
|
||||||
splitter.updgroups();
|
splitter.updgroups();
|
||||||
|
|
||||||
// Sort the fragments by decreasing weight
|
// Sort the fragments by decreasing weight
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user