abstract: we used to discard snippets too early, before they might get a phrase weight boost

This commit is contained in:
Jean-Francois Dockes 2019-05-24 08:51:03 +02:00
parent 15dc419fec
commit 2a945c9443

View File

@ -136,6 +136,14 @@ public:
// abstract will be incorrect or inexistant, but this is // abstract will be incorrect or inexistant, but this is
// better than taking forever (the default cutoff is 10E6) // better than taking forever (the default cutoff is 10E6)
if (maxtermcount && termcount++ > maxtermcount) { if (maxtermcount && termcount++ > maxtermcount) {
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
maxtermcount << endl);
return false;
}
// Also limit the number of fragments (just in case safety)
if (m_fragments.size() > maxtermcount / 100) {
LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
maxtermcount/100 << endl);
return false; return false;
} }
// Remember recent past // Remember recent past
@ -157,8 +165,8 @@ public:
if (m_terms.find(dumb) != m_terms.end()) { if (m_terms.find(dumb) != m_terms.end()) {
// This word is a search term. Extend or create fragment // This word is a search term. Extend or create fragment
LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first << LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
", " << m_curfrag.second << " remain " << ", " << m_curfrag.second << " remain " <<
m_remainingWords << endl); m_remainingWords << endl);
double coef = m_wordcoefs[dumb]; double coef = m_wordcoefs[dumb];
if (!m_remainingWords) { if (!m_remainingWords) {
// No current fragment. Start one // No current fragment. Start one
@ -172,7 +180,7 @@ public:
m_curtermcoef = coef; m_curtermcoef = coef;
} else { } else {
LOGDEB2("Extending current fragment: " << m_remainingWords << LOGDEB2("Extending current fragment: " << m_remainingWords <<
" -> " << m_ctxwords << endl); " -> " << m_ctxwords << endl);
m_extcount++; m_extcount++;
#ifdef COMPUTE_HLZONES #ifdef COMPUTE_HLZONES
if (m_prevwordhit) { if (m_prevwordhit) {
@ -223,18 +231,25 @@ public:
m_remainingWords--; m_remainingWords--;
m_curfrag.second = bte; m_curfrag.second = bte;
if (m_remainingWords == 0) { if (m_remainingWords == 0) {
if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) { // We used to not push weak fragments if we had a lot
// Don't push bad fragments if we have a lot already // already. This can cause problems if the fragments
m_fragments.push_back(MatchFragment(m_curfrag.first, // we drop are actually group fragments (which have
m_curfrag.second, // not got their boost yet). The right cut value is
m_curfragcoef, // difficult to determine, because the absolute values
// of the coefs depend on many things (index size,
// etc.) The old test was if (m_totalcoef < 5.0 ||
// m_curfragcoef >= 1.0) We now just avoid creating a
// monster by testing the current fragments count at
// the top of the function
m_fragments.push_back(MatchFragment(m_curfrag.first,
m_curfrag.second,
m_curfragcoef,
#ifdef COMPUTE_HLZONES #ifdef COMPUTE_HLZONES
m_curhlzones, m_curhlzones,
#endif #endif
m_curhitpos, m_curhitpos,
m_curterm m_curterm
)); ));
}
m_totalcoef += m_curfragcoef; m_totalcoef += m_curfragcoef;
m_curfragcoef = 0.0; m_curfragcoef = 0.0;
m_curtermcoef = 0.0; m_curtermcoef = 0.0;
@ -252,6 +267,8 @@ public:
// find the group matches. We process everything as NEAR (no // find the group matches. We process everything as NEAR (no
// PHRASE specific processing). // PHRASE specific processing).
void updgroups() { void updgroups() {
LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
" fragments" << endl);
vector<GroupMatchEntry> tboffs; vector<GroupMatchEntry> tboffs;
// Look for matches to PHRASE and NEAR term groups and finalize // Look for matches to PHRASE and NEAR term groups and finalize
@ -283,7 +300,7 @@ public:
); );
// Give a boost to fragments which contain a group match // Give a boost to fragments which contain a group match
// (phrase/near), they are dear to the user's heart. list are // (phrase/near), they are dear to the user's heart. Lists are
// sorted, so we never go back in the fragment list (can // sorted, so we never go back in the fragment list (can
// always start the search where we previously stopped). // always start the search where we previously stopped).
if (m_fragments.empty()) { if (m_fragments.empty()) {
@ -292,8 +309,8 @@ public:
auto fragit = m_fragments.begin(); auto fragit = m_fragments.begin();
for (const auto& grpmatch : tboffs) { for (const auto& grpmatch : tboffs) {
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
"-" << grpmatch.offs.second << " curfrag " << "-" << grpmatch.offs.second << " curfrag " <<
fragit->start << "-" << fragit->stop << endl); fragit->start << "-" << fragit->stop << endl);
while (fragit->stop < grpmatch.offs.first) { while (fragit->stop < grpmatch.offs.first) {
fragit++; fragit++;
if (fragit == m_fragments.end()) { if (fragit == m_fragments.end()) {
@ -413,7 +430,6 @@ int Query::Native::abstractFromText(
} }
); );
vector<int> vpbreaks; vector<int> vpbreaks;
ndb->getPagePositions(docid, vpbreaks); ndb->getPagePositions(docid, vpbreaks);