From 567401233a11cf4e6279503bf1f4429394062ee9 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Wed, 3 Jan 2018 15:28:46 +0100
Subject: [PATCH] Building abstract/snippets from the doc text: process
 phrase/group terms

---
 src/rcldb/rclabsfromtext.cpp | 221 ++++++++++++++++++++++++++++-------
 1 file changed, 180 insertions(+), 41 deletions(-)

diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp
index b65eab51..9d37e22b 100644
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@@ -36,43 +36,74 @@ using namespace std;
 
+// We now let plaintorich do the highlight tag insertions, which is
+// a bit wasteful because we already have most of the information here
+// (but the performance hit is small because it is only called on the
+// output fragments, not on the whole text). The highlight zone
+// computation code has been left around just in case I change my mind.
+#undef COMPUTE_HLZONES
 
 namespace Rcl {
 
-#warning NEAR and PHRASE
+// Chars we turn to spaces in the snippets
+static const string cstr_nc("\n\r\x0c\\");
 
-// Text splitter for finding the match terms in the doc text.
+// Fragment descriptor. A fragment is a text area with one or several
+// matched terms and some context. It is ranked according to the
+// matched term weights, and the near/phrase matches get a boost.
+struct MatchFragment {
+    // Start/end byte offsets of the fragment in the document text
+    int start;
+    int stop;
+    // Weight for this fragment (bigger is better)
+    double coef;
+#ifdef COMPUTE_HLZONES
+    // Highlight areas (each is one or several contiguous match
+    // terms). Because a fragment extends around a match, there can be
+    // several contiguous or separate matches in a given fragment.
+    vector<pair<int,int>> hlzones;
+#endif
+    // Position of the first matched term (for page number computations)
+    unsigned int hitpos;
+    // "Best" term for this match (e.g. for use as an external app search term)
+    string term;
+
+    MatchFragment(int sta, int sto, double c,
+#ifdef COMPUTE_HLZONES
+                  vector<pair<int,int>>& hl,
+#endif
+                  unsigned int pos, string& trm)
+        : start(sta), stop(sto), coef(c), hitpos(pos) {
+#ifdef COMPUTE_HLZONES
+        hlzones.swap(hl);
+#endif
+        term.swap(trm);
+    }
+};
+
+
+// Text splitter for finding the match areas in the document text.
 class TextSplitABS : public TextSplit {
 public:
-    struct MatchEntry {
-        // Start/End byte offsets of fragment in the document text
-        int start;
-        int stop;
-        double coef;
-        // Position of the first matched term.
-        unsigned int hitpos;
-        // "best term" for this match
-        string term;
-        // Hilight areas (each is one or several contiguous match terms).
-        vector<pair<int,int>> hlzones;
-
-        MatchEntry(int sta, int sto, double c, vector<pair<int,int>>& hl,
-                   unsigned int pos, string& trm)
-            : start(sta), stop(sto), coef(c), hitpos(pos) {
-            hlzones.swap(hl);
-            term.swap(trm);
-        }
-    };
-
     TextSplitABS(const vector<string>& matchTerms,
+                 const HighlightData& hdata,
                  unordered_map<string, double>& wordcoefs,
                  unsigned int ctxwords,
                  Flags flags = TXTS_NONE)
-        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
-          m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
-        LOGDEB("TextSPlitABS: ctxwords " << ctxwords << endl);
+        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
+          m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
+
+        // Take note of the group (phrase/near) terms because we need
+        // to compute the position lists for them.
+        for (const auto& group : hdata.groups) {
+            if (group.size() > 1) {
+                for (const auto& term : group) {
+                    m_gterms.insert(term);
+                }
+            }
+        }
     }
 
     // Accept a word and its position. If the word is a matched term,
@@ -80,7 +111,7 @@ public:
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         LOGDEB2("takeword: " << term << endl);
 
-        // Recent past
+        // Remember the recent past
         m_prevterms.push_back(pair<int,int>(bts, bte));
         if (m_prevterms.size() > m_ctxwords+1) {
             m_prevterms.pop_front();
@@ -103,28 +134,35 @@ public:
                     m_remainingWords << endl);
             double coef = m_wordcoefs[dumb];
             if (!m_remainingWords) {
-                // No current fragment
+                // No current fragment. Start one.
                 m_curhitpos = baseTextPosition + pos;
                 m_curfrag.first = m_prevterms.front().first;
                 m_curfrag.second = m_prevterms.back().second;
+#ifdef COMPUTE_HLZONES
                 m_curhlzones.push_back(pair<int,int>(bts, bte));
+#endif
                 m_curterm = term;
                 m_curtermcoef = coef;
             } else {
                 LOGDEB2("Extending current fragment: " << m_remainingWords <<
                         " -> " << m_ctxwords << endl);
                 m_extcount++;
+#ifdef COMPUTE_HLZONES
                 if (m_prevwordhit) {
                     m_curhlzones.back().second = bte;
                 } else {
                     m_curhlzones.push_back(pair<int,int>(bts, bte));
                 }
+#endif
                 if (coef > m_curtermcoef) {
                     m_curterm = term;
                     m_curtermcoef = coef;
                 }
             }
+
+#ifdef COMPUTE_HLZONES
             m_prevwordhit = true;
+#endif
             m_curfragcoef += coef;
             m_remainingWords = m_ctxwords + 1;
             if (m_extcount > 3) {
@@ -134,10 +172,24 @@ public:
                 m_remainingWords = 1;
                 m_extcount = 0;
             }
-        } else {
+
+            // If the term is part of a near/phrase group, update its
+            // positions list.
+            if (m_gterms.find(dumb) != m_gterms.end()) {
+                // Term group (phrase/near) handling
+                m_plists[dumb].push_back(pos);
+                m_gpostobytes[pos] = pair<int,int>(bts, bte);
+                LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
+                        bte << "\n");
+            }
+        }
+#ifdef COMPUTE_HLZONES
+        else {
+            // Not a matched term
             m_prevwordhit = false;
         }
-
+#endif
+
         if (m_remainingWords) {
             // Fragment currently open. Time to close ?
@@ -146,10 +198,12 @@ public:
             if (m_remainingWords == 0) {
                 if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                     // Don't push bad fragments if we have a lot already
-                    m_fragments.push_back(MatchEntry(m_curfrag.first,
+                    m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                      m_curfrag.second,
                                                      m_curfragcoef,
+#ifdef COMPUTE_HLZONES
                                                      m_curhlzones,
+#endif
                                                      m_curhitpos,
                                                      m_curterm
                                             ));
@@ -161,10 +215,67 @@ public:
         }
         return true;
     }
-    const vector<MatchEntry>& getFragments() {
+
+    const vector<MatchFragment>& getFragments() {
         return m_fragments;
     }
+
+    // After the text is split: use the group terms' position lists to
+    // find the group matches. We process everything as NEAR (no
+    // PHRASE-specific processing).
+    void updgroups() {
+        vector<GroupMatchEntry> tboffs;
+
+        // Look for matches to the PHRASE and NEAR term groups and
+        // finalize the matched regions list (sorted by increasing
+        // start, then decreasing length). We process all groups as
+        // NEAR (ignoring term order).
+        for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
+            if (m_hdata.groups[i].size() > 1) {
+                matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
+            }
+        }
+
+        // Sort the fragments by increasing start and decreasing width
+        std::sort(m_fragments.begin(), m_fragments.end(),
+                  [](const MatchFragment& a, const MatchFragment& b) -> bool {
+                      if (a.start != b.start)
+                          return a.start < b.start;
+                      return a.stop - a.start > b.stop - b.start;
+                  }
+            );
+
+        // Sort the group match regions by increasing start and decreasing width
+        std::sort(tboffs.begin(), tboffs.end(),
+                  [](const GroupMatchEntry& a, const GroupMatchEntry& b)
+                  -> bool {
+                      if (a.offs.first != b.offs.first)
+                          return a.offs.first < b.offs.first;
+                      return a.offs.second > b.offs.second;
+                  }
+            );
+
+        // Give a boost to the fragments which contain a group match
+        // (phrase/near): they are dear to the user's heart. Both lists
+        // are sorted, so we never need to go back in the fragment list
+        // (we can always start the search where we previously stopped).
+        if (m_fragments.empty()) {
+            return;
+        }
+        auto fragit = m_fragments.begin();
+        for (const auto& grpmatch : tboffs) {
+            // Skip the fragments which end before the group match starts
+            while (fragit->stop < grpmatch.offs.first) {
+                fragit++;
+                if (fragit == m_fragments.end()) {
+                    return;
+                }
+            }
+            if (fragit->start <= grpmatch.offs.first &&
+                fragit->stop >= grpmatch.offs.second) {
+                // Group match inside the fragment: boost
+                fragit->coef += 10.0;
+            }
+        }
+
+        return;
+    }
+
 private:
     // Past terms: we need to go back for context before a hit
     deque<pair<int,int>> m_prevterms;
@@ -173,8 +284,10 @@ private:
     double m_curfragcoef{0.0};
     unsigned int m_remainingWords{0};
     unsigned int m_extcount{0};
+#ifdef COMPUTE_HLZONES
     vector<pair<int,int>> m_curhlzones;
    bool m_prevwordhit{false};
+#endif
     // Current sum of fragment weights
     double m_totalcoef{0.0};
     // Position of 1st term match (for page number computations)
@@ -182,14 +295,21 @@ private:
     // "best" term
     string m_curterm;
     double m_curtermcoef{0.0};
+
+    // Group terms, extracted from m_hdata
+    unordered_set<string> m_gterms;
+    // Group/near terms word positions
+    map<string, vector<int> > m_plists;
+    map<int, pair<int,int> > m_gpostobytes;
 
     // Input
-    set<string> m_terms;
+    unordered_set<string> m_terms;
+    const HighlightData& m_hdata;
     unordered_map<string, double>& m_wordcoefs;
     unsigned int m_ctxwords;
 
     // Result: begin and end byte positions of query terms/groups in text
-    vector<MatchEntry> m_fragments;
+    vector<MatchFragment> m_fragments;
 };
 
 int Query::Native::abstractFromText(
@@ -256,26 +376,45 @@ int Query::Native::abstractFromText(
             wordcoefs[word] = mment.first;
         }
     }
-    TextSplitABS splitter(matchTerms, wordcoefs, ctxwords,
+
+    // Note: getTerms() was already called by qualityTerms(), so this
+    // is a bit wasteful. I guess that the performance impact is
+    // negligible, but to be checked. We need the highlight data for
+    // the phrase/near groups.
+    HighlightData hld;
+    if (m_q->m_sd) {
+        m_q->m_sd->getTerms(hld);
+    }
+
+    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
                           TextSplit::TXTS_ONLYSPANS);
     splitter.text_to_words(rawtext);
-    const vector<TextSplitABS::MatchEntry>& res1 = splitter.getFragments();
-    vector<TextSplitABS::MatchEntry> result(res1.begin(), res1.end());
+    splitter.updgroups();
+
+    // Sort the fragments by decreasing weight
+    const vector<MatchFragment>& res1 = splitter.getFragments();
+    vector<MatchFragment> result(res1.begin(), res1.end());
     std::sort(result.begin(), result.end(),
-              [](const TextSplitABS::MatchEntry& a,
-                 const TextSplitABS::MatchEntry& b) -> bool {
+              [](const MatchFragment& a,
+                 const MatchFragment& b) -> bool {
                   return a.coef > b.coef;
               }
         );
 
-    static const string cstr_nc("\n\r\x0c\\");
+
     vector<int> vpbreaks;
     ndb->getPagePositions(docid, vpbreaks);
+
+    // Build the output snippets array by merging the fragments, their
+    // main terms and the page positions.
     unsigned int count = 0;
     for (const auto& entry : result) {
         string frag = neutchars(
             rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);
-#if 0
+
+#ifdef COMPUTE_HLZONES
+        // This would need to be modified to take the tag parameters
+        // instead of using the constant strings
         static const string starthit("<span>");
         static const string endhit("</span>");
         size_t inslen = 0;
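
For context, the group matching which updgroups() delegates to matchGroup() boils
down to a window-constrained intersection of the per-term position lists
accumulated in m_plists, after which m_gpostobytes maps the matched word
positions back to byte offsets. The standalone sketch below only illustrates
that position-list idea; it is not Recoll's implementation: nearMatches(), its
window semantics and the sample data are assumptions made for illustration,
while the real matchGroup() lives elsewhere in the Recoll sources and also
handles per-term expansions.

    // Standalone sketch: window-constrained NEAR matching over position
    // lists. Hypothetical code, not part of Recoll; see the note above.
    #include <algorithm>
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using namespace std;

    // Return a [firstpos, lastpos] window for each place where all the
    // terms of 'group' occur within 'window' word positions of each other.
    static vector<pair<int,int>> nearMatches(
        const vector<string>& group,
        const map<string, vector<int>>& plists, // term -> positions (m_plists)
        int window)
    {
        vector<pair<int,int>> res;
        vector<pair<int,int>> all; // (word position, index of term in group)
        for (int i = 0; i < int(group.size()); i++) {
            auto it = plists.find(group[i]);
            if (it == plists.end())
                return res; // One term never occurs: no group match possible
            for (int pos : it->second)
                all.push_back({pos, i});
        }
        sort(all.begin(), all.end());

        // Check each run of group.size() consecutive occurrences: it is a
        // match if it fits in the window and covers every group term.
        for (size_t i = 0; i + group.size() <= all.size(); i++) {
            size_t j = i + group.size() - 1;
            if (all[j].first - all[i].first > window)
                continue;
            vector<bool> seen(group.size(), false);
            for (size_t k = i; k <= j; k++)
                seen[all[k].second] = true;
            if (find(seen.begin(), seen.end(), false) == seen.end())
                res.push_back({all[i].first, all[j].first});
        }
        return res;
    }

    int main()
    {
        // Positions as the splitter would have recorded them in m_plists
        map<string, vector<int>> plists{
            {"snippet", {3, 40}}, {"generation", {5, 90}}};
        for (auto [sta, sto] : nearMatches({"snippet", "generation"}, plists, 10))
            cout << "group match at word positions " << sta << "-" << sto << "\n";
        // A table like m_gpostobytes would then turn these word positions
        // into the byte offsets stored in the group match entry list.
        return 0;
    }

Because both the fragment list and the group match list are built from the same
single pass over the text, sorting both by start offset is what lets the
boosting loop in updgroups() run in a single forward sweep.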