Building abstract/snippets from the doc text: process phrase/group terms

2018-01-03 15:28:46 +01:00 · 2018-01-03 15:28:46 +01:00 · 567401233a
commit 567401233a
parent 175ca9832f
1 changed files with 180 additions and 41 deletions
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@ -36,43 +36,74 @@

 using namespace std;

+// We now let plaintorich insert the highlight tags, which is
+// wasteful because we already have most of the information (but the
+// perf hit is small because it's only called on the output fragments,
+// not on the whole text). The highlight zone computation code has been
+// left around just in case I change my mind.
+#undef COMPUTE_HLZONES

 namespace Rcl {

-#warning NEAR and PHRASE
+// Chars we turn to spaces in the Snippets
+static const string cstr_nc("\n\r\x0c\\");

-// Text splitter for finding the match terms in the doc text.
+// Fragment descriptor. A fragment is a text area with one or several
+// matched terms and some context. It is ranked according to the
+// matched term weights and the near/phrase matches get a boost.
+struct MatchFragment {
+    // Start/End byte offsets of fragment in the document text
+    int start;
+    int stop;
+    // Weight for this fragment (bigger better)
+    double coef;
+#ifdef COMPUTE_HLZONES
+    // Highlight areas (each is one or several contiguous match
+    // terms). Because a fragment extends around a match, there
+    // can be several contiguous or separate matches in a given
+    // fragment. Only compiled in when COMPUTE_HLZONES is defined.
+    vector<pair<int,int>> hlzones;
+#endif
+    // Position of the first matched term (for page number computations)
+    unsigned int hitpos;
+    // "best term" for this match (e.g. for use as ext app search term)
+    string term;
+        
+    // Build a fragment from its byte range (sta, sto), weight (c),
+    // first hit position (pos) and best term (trm).
+    // Note that trm (and hl when COMPUTE_HLZONES is defined) are
+    // swapped into the new object, not copied: this is why they are
+    // non-const references, and on return the caller's variables hold
+    // the freshly constructed (empty) member values.
+    MatchFragment(int sta, int sto, double c,
+#ifdef COMPUTE_HLZONES
+                  vector<pair<int,int>>& hl,
+#endif
+                  unsigned int pos, string& trm) 
+        : start(sta), stop(sto), coef(c), hitpos(pos) {
+#ifdef COMPUTE_HLZONES
+        hlzones.swap(hl);
+#endif
+        term.swap(trm);
+    }
+};
+
+
+// Text splitter for finding the match areas in the document text.
 class TextSplitABS : public TextSplit {
 public:

-    struct MatchEntry {
-        // Start/End byte offsets of fragment in the document text
-        int start;
-        int stop;
-        double coef;
-        // Position of the first matched term.
-        unsigned int hitpos;
-        // "best term" for this match
-        string term;
-        // Hilight areas (each is one or several contiguous match terms).
-        vector<pair<int,int>> hlzones;
-        
-        MatchEntry(int sta, int sto, double c, vector<pair<int,int>>& hl,
-                   unsigned int pos, string& trm) 
-            : start(sta), stop(sto), coef(c), hitpos(pos) {
-            hlzones.swap(hl);
-            term.swap(trm);
-        }
-    };
-
-
    TextSplitABS(const vector<string>& matchTerms,
+                 const HighlightData& hdata,
                 unordered_map<string, double>& wordcoefs,
                 unsigned int ctxwords,
                 Flags flags = TXTS_NONE)
-        :  TextSplit(flags),  m_terms(matchTerms.begin(), matchTerms.end()),
-           m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
-        LOGDEB("TextSPlitABS: ctxwords " << ctxwords << endl);
+        // matchTerms is copied into m_terms. hdata and wordcoefs are
+        // kept as references (m_hdata, m_wordcoefs), so they must
+        // outlive the splitter. flags goes to the TextSplit base class.
+        :  TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
+           m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
+
+        // Take note of the group (phrase/near) terms because we need
+        // to compute the position lists for them. Only the groups with
+        // more than one term are recorded in m_gterms, single-term
+        // groups are skipped.
+        for (const auto& group : hdata.groups) {
+            if (group.size() > 1) {
+                for (const auto& term: group) {
+                    m_gterms.insert(term);
+                }
+            }
+        }
    }

    // Accept a word and its position. If the word is a matched term,
@ -80,7 +111,7 @@ public:
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
        LOGDEB2("takeword: " << term << endl);

-        // Recent past
+        // Remember recent past
        m_prevterms.push_back(pair<int,int>(bts,bte));
        if (m_prevterms.size() > m_ctxwords+1) {
            m_prevterms.pop_front();
@ -103,28 +134,35 @@ public:
                   m_remainingWords << endl);
            double coef = m_wordcoefs[dumb];
            if (!m_remainingWords) {
-                // No current fragment
+                // No current fragment. Start one
                m_curhitpos = baseTextPosition + pos;
                m_curfrag.first = m_prevterms.front().first;
                m_curfrag.second = m_prevterms.back().second;
+#ifdef COMPUTE_HLZONES
                m_curhlzones.push_back(pair<int,int>(bts, bte));
+#endif
                m_curterm = term;
                m_curtermcoef = coef;
            } else {
                LOGDEB2("Extending current fragment: " << m_remainingWords <<
                       " -> " << m_ctxwords << endl);
                m_extcount++;
+#ifdef COMPUTE_HLZONES
                if (m_prevwordhit) {
                    m_curhlzones.back().second = bte;
                } else {
                    m_curhlzones.push_back(pair<int,int>(bts, bte));
                }
+#endif
                if (coef > m_curtermcoef) {
                    m_curterm = term;
                    m_curtermcoef = coef;
                }
            }
+
+#ifdef COMPUTE_HLZONES
            m_prevwordhit = true;
+#endif
            m_curfragcoef += coef;
            m_remainingWords = m_ctxwords + 1;
            if (m_extcount > 3) {
@ -134,10 +172,24 @@ public:
                m_remainingWords = 1;
                m_extcount = 0;
            }
-        } else {
+
+            // If the term is part of a near/phrase group, update its
+            // positions list
+            if (m_gterms.find(dumb) != m_gterms.end()) {
+                // Term group (phrase/near) handling
+                m_plists[dumb].push_back(pos);
+                m_gpostobytes[pos] = pair<int,int>(bts, bte);
+                LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
+                        bte << "\n");
+            }
+        }
+#ifdef COMPUTE_HLZONES
+        else {
+            // Not a matched term
            m_prevwordhit = false;
        }
-       
+#endif
+
        
        if (m_remainingWords) {
            // Fragment currently open. Time to close ?
@ -146,10 +198,12 @@ public:
            if (m_remainingWords == 0) {
                if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                    // Don't push bad fragments if we have a lot already
-                    m_fragments.push_back(MatchEntry(m_curfrag.first,
+                    m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                     m_curfrag.second,
                                                     m_curfragcoef,
+#ifdef COMPUTE_HLZONES
                                                     m_curhlzones,
+#endif
                                                     m_curhitpos,
                                                     m_curterm
                                              ));
@ -161,10 +215,67 @@ public:
        }
        return true;
    }
-    const vector<MatchEntry>& getFragments() {
+    
+    // Access the list of fragments stored in m_fragments. This returns
+    // a reference to the member, not a copy, so the result is only
+    // usable as long as the splitter object exists.
+    const vector<MatchFragment>& getFragments() {
        return m_fragments;
    }

+
+    // After the text is split: use the group terms positions lists to
+    // find the group matches, then add a bonus to the weight of each
+    // fragment which contains one. We process everything as NEAR (no
+    // PHRASE specific processing, term order is ignored).
+    // Side effect: m_fragments ends up sorted by increasing start
+    // offset, then decreasing width.
+    void updgroups() {
+        // Byte regions of the group matches, filled by matchGroup()
+        vector<GroupMatchEntry> tboffs;
+
+        // Look for matches to PHRASE and NEAR term groups. Groups with
+        // a single term are not processed here. We process all groups
+        // as NEAR (ignore order).
+        for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
+            if (m_hdata.groups[i].size() > 1) {
+                matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
+            }
+        }
+
+        // Sort the fragments by increasing start and decreasing width.
+        // Both widths must be computed from each element's own
+        // start/stop: std::sort needs a strict weak ordering, else its
+        // behaviour is undefined.
+        std::sort(m_fragments.begin(), m_fragments.end(),
+                  [](const MatchFragment& a, const MatchFragment& b) -> bool {
+                      if (a.start != b.start)
+                          return a.start < b.start;
+                      return a.stop - a.start > b.stop - b.start;
+                  }
+            );
+        
+        // Sort the group regions by increasing start and decreasing width.  
+        std::sort(tboffs.begin(), tboffs.end(),
+                  [](const GroupMatchEntry& a, const GroupMatchEntry& b)
+                  -> bool {
+                      if (a.offs.first != b.offs.first)
+                          return a.offs.first < b.offs.first;
+                      return a.offs.second > b.offs.second;
+                  }
+            );
+
+        // Give a boost to fragments which contain a group match
+        // (phrase/near), they are dear to the user's heart. Both lists
+        // are sorted by increasing start, so we never go back in the
+        // fragment list (can always start the search where we
+        // previously stopped).
+        auto fragit = m_fragments.begin();
+        for (const auto& grpmatch : tboffs) {
+            // Skip the fragments which end before this group match
+            // begins: they can't contain it, nor any of the following
+            // ones. The end() test comes first so that an empty or
+            // exhausted fragment list is never dereferenced.
+            while (fragit != m_fragments.end() &&
+                   fragit->stop < grpmatch.offs.first) {
+                ++fragit;
+            }
+            if (fragit == m_fragments.end()) {
+                return;
+            }
+            if (fragit->start <= grpmatch.offs.first &&
+                fragit->stop >= grpmatch.offs.second) {
+                // grp in frag
+                fragit->coef += 10.0;
+            }
+        }
+    }
+    
 private:
    // Past terms because we need to go back for context before a hit
    deque<pair<int,int>>  m_prevterms;
@ -173,8 +284,10 @@ private:
    double m_curfragcoef{0.0};
    unsigned int m_remainingWords{0};
    unsigned int m_extcount{0};
+#ifdef COMPUTE_HLZONES
    vector<pair<int,int>> m_curhlzones;
    bool m_prevwordhit{false};
+#endif
    // Current sum of fragment weights
    double m_totalcoef{0.0};
    // Position of 1st term match (for page number computations)
@ -182,14 +295,21 @@ private:
    // "best" term
    string m_curterm;
    double m_curtermcoef{0.0};
+
+    // Group terms, extracted from m_hdata 
+    unordered_set<string> m_gterms;
+    // group/near terms word positions.
+    map<string, vector<int> > m_plists;
+    map<int, pair<int, int> > m_gpostobytes;
    
    // Input
-    set<string> m_terms;
+    unordered_set<string> m_terms;
+    const HighlightData& m_hdata;
    unordered_map<string, double>& m_wordcoefs;
    unsigned int m_ctxwords;

    // Result: begin and end byte positions of query terms/groups in text
-    vector<MatchEntry> m_fragments;  
+    vector<MatchFragment> m_fragments;  
 };

 int Query::Native::abstractFromText(
@ -256,26 +376,45 @@ int Query::Native::abstractFromText(
            wordcoefs[word] = mment.first;
        }
    }
-    TextSplitABS splitter(matchTerms, wordcoefs, ctxwords,
+
+    // Note: getTerms() was already called by qualityTerms, so this is
+    // a bit wasteful. I guess that the performance impact is
+    // negligible though. To be checked ? We need the highlightdata for the
+    // phrase/near groups.
+    HighlightData hld;
+    if (m_q->m_sd) {
+        m_q->m_sd->getTerms(hld);
+    }
+
+    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
                          TextSplit::TXTS_ONLYSPANS);
    splitter.text_to_words(rawtext);
-    const vector<TextSplitABS::MatchEntry>& res1 = splitter.getFragments();
-    vector<TextSplitABS::MatchEntry> result(res1.begin(), res1.end());
+    splitter.updgroups();
+
+    // Sort the fragments by decreasing weight
+    const vector<MatchFragment>& res1 = splitter.getFragments();
+    vector<MatchFragment> result(res1.begin(), res1.end());
    std::sort(result.begin(), result.end(),
-              [](const TextSplitABS::MatchEntry& a,
-                 const TextSplitABS::MatchEntry& b) -> bool { 
+              [](const MatchFragment& a,
+                 const MatchFragment& b) -> bool { 
                  return a.coef > b.coef; 
              }
        );

-    static const string cstr_nc("\n\r\x0c\\");
+
    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);
+
+    // Build the output snippets array by merging the fragments, their
+    // main term and the page positions. 
    unsigned int count = 0;
    for (const auto& entry : result) {
        string frag = neutchars(
            rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);
-#if 0
+
+#ifdef COMPUTE_HLZONES
+        // This would need to be modified to take tag parameters
+        // instead of the const strings
        static const string starthit("<span style='color: blue;'>");
        static const string endhit("</span>");
        size_t inslen = 0;