From 567401233a11cf4e6279503bf1f4429394062ee9 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Wed, 3 Jan 2018 15:28:46 +0100
Subject: [PATCH] Building abstract/snippets from the doc text: process
 phrase/group terms

---
 src/rcldb/rclabsfromtext.cpp | 221 ++++++++++++++++++++++++++++-------
 1 file changed, 180 insertions(+), 41 deletions(-)

diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp
index b65eab51..9d37e22b 100644
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@@ -36,43 +36,74 @@ using namespace std;
 
+// We now let plaintorich do the highlight tag insertions, which is
+// a bit wasteful because we already have most of the information here
+// (but the performance hit is small because it is only called on the
+// output fragments, not on the whole text). The highlight zone
+// computation code has been left around just in case I change my mind.
+#undef COMPUTE_HLZONES
 
 namespace Rcl {
 
-#warning NEAR and PHRASE
+// Chars we turn to spaces in the snippets
+static const string cstr_nc("\n\r\x0c\\");
 
-// Text splitter for finding the match terms in the doc text.
+// Fragment descriptor. A fragment is a text area with one or several
+// matched terms and some context. It is ranked according to the
+// matched term weights, and the near/phrase matches get a boost.
+struct MatchFragment {
+    // Start/end byte offsets of the fragment in the document text
+    int start;
+    int stop;
+    // Weight for this fragment (bigger is better)
+    double coef;
+#ifdef COMPUTE_HLZONES
+    // Highlight areas (each is one or several contiguous match
+    // terms). Because a fragment extends around a match, there can be
+    // several contiguous or separate matches in a given fragment.
+    vector<pair<int,int>> hlzones;
+#endif
+    // Position of the first matched term (for page number computations)
+    unsigned int hitpos;
+    // "Best" term for this match (e.g. for use as an external app search term)
+    string term;
+
+    MatchFragment(int sta, int sto, double c,
+#ifdef COMPUTE_HLZONES
+                  vector<pair<int,int>>& hl,
+#endif
+                  unsigned int pos, string& trm)
+        : start(sta), stop(sto), coef(c), hitpos(pos) {
+#ifdef COMPUTE_HLZONES
+        hlzones.swap(hl);
+#endif
+        term.swap(trm);
+    }
+};
+
+
+// Text splitter for finding the match areas in the document text.
 class TextSplitABS : public TextSplit {
 public:
-    struct MatchEntry {
-        // Start/End byte offsets of fragment in the document text
-        int start;
-        int stop;
-        double coef;
-        // Position of the first matched term.
-        unsigned int hitpos;
-        // "best term" for this match
-        string term;
-        // Hilight areas (each is one or several contiguous match terms).
-        vector<pair<int,int>> hlzones;
-
-        MatchEntry(int sta, int sto, double c, vector<pair<int,int>>& hl,
-                   unsigned int pos, string& trm)
-            : start(sta), stop(sto), coef(c), hitpos(pos) {
-            hlzones.swap(hl);
-            term.swap(trm);
-        }
-    };
-
     TextSplitABS(const vector<string>& matchTerms,
+                 const HighlightData& hdata,
                  unordered_map<string, double>& wordcoefs,
                  unsigned int ctxwords,
                  Flags flags = TXTS_NONE)
-        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
-          m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
-        LOGDEB("TextSPlitABS: ctxwords " << ctxwords << endl);
+        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
+          m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
+
+        // Take note of the group (phrase/near) terms because we need
+        // to compute the position lists for them.
+        for (const auto& group : hdata.groups) {
+            if (group.size() > 1) {
+                for (const auto& term : group) {
+                    m_gterms.insert(term);
+                }
+            }
+        }
     }
 
     // Accept a word and its position. If the word is a matched term,
@@ -80,7 +111,7 @@ public:
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         LOGDEB2("takeword: " << term << endl);
 
-        // Recent past
+        // Remember the recent past
         m_prevterms.push_back(pair<int,int>(bts, bte));
         if (m_prevterms.size() > m_ctxwords+1) {
             m_prevterms.pop_front();
@@ -103,28 +134,35 @@ public:
                     m_remainingWords << endl);
             double coef = m_wordcoefs[dumb];
             if (!m_remainingWords) {
-                // No current fragment
+                // No current fragment. Start one.
                 m_curhitpos = baseTextPosition + pos;
                 m_curfrag.first = m_prevterms.front().first;
                 m_curfrag.second = m_prevterms.back().second;
+#ifdef COMPUTE_HLZONES
                 m_curhlzones.push_back(pair<int,int>(bts, bte));
+#endif
                 m_curterm = term;
                 m_curtermcoef = coef;
             } else {
                 LOGDEB2("Extending current fragment: " << m_remainingWords <<
                         " -> " << m_ctxwords << endl);
                 m_extcount++;
+#ifdef COMPUTE_HLZONES
                 if (m_prevwordhit) {
                     m_curhlzones.back().second = bte;
                 } else {
                     m_curhlzones.push_back(pair<int,int>(bts, bte));
                 }
+#endif
                 if (coef > m_curtermcoef) {
                     m_curterm = term;
                     m_curtermcoef = coef;
                 }
             }
+
+#ifdef COMPUTE_HLZONES
             m_prevwordhit = true;
+#endif
             m_curfragcoef += coef;
             m_remainingWords = m_ctxwords + 1;
             if (m_extcount > 3) {
@@ -134,10 +172,24 @@ public:
                 m_remainingWords = 1;
                 m_extcount = 0;
             }
-        } else {
+
+            // If the term is part of a near/phrase group, update its
+            // positions list.
+            if (m_gterms.find(dumb) != m_gterms.end()) {
+                // Term group (phrase/near) handling
+                m_plists[dumb].push_back(pos);
+                m_gpostobytes[pos] = pair<int,int>(bts, bte);
+                LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
+                        bte << "\n");
+            }
+        }
+#ifdef COMPUTE_HLZONES
+        else {
+            // Not a matched term
             m_prevwordhit = false;
         }
-
+#endif
+
         if (m_remainingWords) {
             // Fragment currently open. Time to close ?
@@ -146,10 +198,12 @@ public:
             if (m_remainingWords == 0) {
                 if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                     // Don't push bad fragments if we have a lot already
-                    m_fragments.push_back(MatchEntry(m_curfrag.first,
+                    m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                      m_curfrag.second,
                                                      m_curfragcoef,
+#ifdef COMPUTE_HLZONES
                                                      m_curhlzones,
+#endif
                                                      m_curhitpos,
                                                      m_curterm
                                             ));
@@ -161,10 +215,67 @@ public:
         }
         return true;
     }
-    const vector<MatchEntry>& getFragments() {
+
+    const vector<MatchFragment>& getFragments() {
         return m_fragments;
     }
+
+    // After the text is split: use the group terms' position lists to
+    // find the group matches. We process everything as NEAR (no
+    // PHRASE-specific processing).
+    void updgroups() {
+        vector<GroupMatchEntry> tboffs;
+
+        // Look for matches to the PHRASE and NEAR term groups and
+        // finalize the matched regions list (sorted by increasing
+        // start, then decreasing length). We process all groups as
+        // NEAR (ignoring term order).
+        for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
+            if (m_hdata.groups[i].size() > 1) {
+                matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
+            }
+        }
+
+        // Sort the fragments by increasing start and decreasing width
+        std::sort(m_fragments.begin(), m_fragments.end(),
+                  [](const MatchFragment& a, const MatchFragment& b) -> bool {
+                      if (a.start != b.start)
+                          return a.start < b.start;
+                      return a.stop - a.start > b.stop - b.start;
+                  }
+            );
+
+        // Sort the group match regions by increasing start and decreasing width
+        std::sort(tboffs.begin(), tboffs.end(),
+                  [](const GroupMatchEntry& a, const GroupMatchEntry& b)
+                  -> bool {
+                      if (a.offs.first != b.offs.first)
+                          return a.offs.first < b.offs.first;
+                      return a.offs.second > b.offs.second;
+                  }
+            );
+
+        // Give a boost to the fragments which contain a group match
+        // (phrase/near): they are dear to the user's heart. Both lists
+        // are sorted, so we never need to go back in the fragment list
+        // (we can always start the search where we previously stopped).
+        if (m_fragments.empty()) {
+            return;
+        }
+        auto fragit = m_fragments.begin();
+        for (const auto& grpmatch : tboffs) {
+            // Skip the fragments which end before the group match starts
+            while (fragit->stop < grpmatch.offs.first) {
+                fragit++;
+                if (fragit == m_fragments.end()) {
+                    return;
+                }
+            }
+            if (fragit->start <= grpmatch.offs.first &&
+                fragit->stop >= grpmatch.offs.second) {
+                // Group match inside the fragment: boost
+                fragit->coef += 10.0;
+            }
+        }
+
+        return;
+    }
+
 private:
     // Past terms: we need to go back for context before a hit
     deque<pair<int,int>> m_prevterms;
@@ -173,8 +284,10 @@ private:
     double m_curfragcoef{0.0};
     unsigned int m_remainingWords{0};
     unsigned int m_extcount{0};
+#ifdef COMPUTE_HLZONES
     vector<pair<int,int>> m_curhlzones;
    bool m_prevwordhit{false};
+#endif
     // Current sum of fragment weights
     double m_totalcoef{0.0};
     // Position of 1st term match (for page number computations)
@@ -182,14 +295,21 @@ private:
     // "best" term
     string m_curterm;
     double m_curtermcoef{0.0};
+
+    // Group terms, extracted from m_hdata
+    unordered_set<string> m_gterms;
+    // Group/near terms word positions
+    map<string, vector<int> > m_plists;
+    map<int, pair<int,int> > m_gpostobytes;
 
     // Input
-    set<string> m_terms;
+    unordered_set<string> m_terms;
+    const HighlightData& m_hdata;
     unordered_map<string, double>& m_wordcoefs;
     unsigned int m_ctxwords;
 
     // Result: begin and end byte positions of query terms/groups in text
-    vector<MatchEntry> m_fragments;
+    vector<MatchFragment> m_fragments;
 };
 
 int Query::Native::abstractFromText(
@@ -256,26 +376,45 @@ int Query::Native::abstractFromText(
             wordcoefs[word] = mment.first;
         }
     }
-    TextSplitABS splitter(matchTerms, wordcoefs, ctxwords,
+
+    // Note: getTerms() was already called by qualityTerms(), so this
+    // is a bit wasteful. I guess that the performance impact is
+    // negligible, but to be checked. We need the highlight data for
+    // the phrase/near groups.
+    HighlightData hld;
+    if (m_q->m_sd) {
+        m_q->m_sd->getTerms(hld);
+    }
+
+    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
                           TextSplit::TXTS_ONLYSPANS);
     splitter.text_to_words(rawtext);
-    const vector<TextSplitABS::MatchEntry>& res1 = splitter.getFragments();
-    vector<TextSplitABS::MatchEntry> result(res1.begin(), res1.end());
+    splitter.updgroups();
+
+    // Sort the fragments by decreasing weight
+    const vector<MatchFragment>& res1 = splitter.getFragments();
+    vector<MatchFragment> result(res1.begin(), res1.end());
     std::sort(result.begin(), result.end(),
-              [](const TextSplitABS::MatchEntry& a,
-                 const TextSplitABS::MatchEntry& b) -> bool {
+              [](const MatchFragment& a,
+                 const MatchFragment& b) -> bool {
                   return a.coef > b.coef;
               }
         );
 
-    static const string cstr_nc("\n\r\x0c\\");
+
     vector<int> vpbreaks;
     ndb->getPagePositions(docid, vpbreaks);
+
+    // Build the output snippets array by merging the fragments, their
+    // main terms and the page positions.
     unsigned int count = 0;
     for (const auto& entry : result) {
         string frag = neutchars(
             rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);
-#if 0
+
+#ifdef COMPUTE_HLZONES
+        // This would need to be modified to take the tag parameters
+        // instead of using the constant strings
         static const string starthit("<span>");
         static const string endhit("</span>");
         size_t inslen = 0;
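
For context, the group matching which updgroups() delegates to matchGroup() boils
down to a window-constrained intersection of the per-term position lists
accumulated in m_plists, after which m_gpostobytes maps the matched word
positions back to byte offsets. The standalone sketch below only illustrates
that position-list idea; it is not Recoll's implementation: nearMatches(), its
window semantics and the sample data are assumptions made for illustration,
while the real matchGroup() lives elsewhere in the Recoll sources and also
handles per-term expansions.

    // Standalone sketch: window-constrained NEAR matching over position
    // lists. Hypothetical code, not part of Recoll; see the note above.
    #include <algorithm>
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using namespace std;

    // Return a [firstpos, lastpos] window for each place where all the
    // terms of 'group' occur within 'window' word positions of each other.
    static vector<pair<int,int>> nearMatches(
        const vector<string>& group,
        const map<string, vector<int>>& plists, // term -> positions (m_plists)
        int window)
    {
        vector<pair<int,int>> res;
        vector<pair<int,int>> all; // (word position, index of term in group)
        for (int i = 0; i < int(group.size()); i++) {
            auto it = plists.find(group[i]);
            if (it == plists.end())
                return res; // One term never occurs: no group match possible
            for (int pos : it->second)
                all.push_back({pos, i});
        }
        sort(all.begin(), all.end());

        // Check each run of group.size() consecutive occurrences: it is a
        // match if it fits in the window and covers every group term.
        for (size_t i = 0; i + group.size() <= all.size(); i++) {
            size_t j = i + group.size() - 1;
            if (all[j].first - all[i].first > window)
                continue;
            vector<bool> seen(group.size(), false);
            for (size_t k = i; k <= j; k++)
                seen[all[k].second] = true;
            if (find(seen.begin(), seen.end(), false) == seen.end())
                res.push_back({all[i].first, all[j].first});
        }
        return res;
    }

    int main()
    {
        // Positions as the splitter would have recorded them in m_plists
        map<string, vector<int>> plists{
            {"snippet", {3, 40}}, {"generation", {5, 90}}};
        for (auto [sta, sto] : nearMatches({"snippet", "generation"}, plists, 10))
            cout << "group match at word positions " << sta << "-" << sto << "\n";
        // A table like m_gpostobytes would then turn these word positions
        // into the byte offsets stored in the group match entry list.
        return 0;
    }

Because both the fragment list and the group match list are built from the same
single pass over the text, sorting both by start offset is what lets the
boosting loop in updgroups() run in a single forward sweep.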