Extracted some code from plaintorich, for reuse by the new snippets building code

2018-01-03 15:28:11 +01:00 · 2018-01-03 15:28:11 +01:00 · 175ca9832f
commit 175ca9832f
parent bb810f9ceb
3 changed files with 211 additions and 185 deletions
--- a/src/query/plaintorich.cpp
+++ b/src/query/plaintorich.cpp
@ -40,24 +40,13 @@ using std::set;
 #include "cancelcheck.h"
 #include "unacpp.h"

-struct MatchEntry {
-    // Start/End byte offsets in the document text
-    pair<int, int> offs;
-    // Index of the search group this comes from: this is to relate a 
-    // match to the original user input.
-    size_t grpidx;
-    MatchEntry(int sta, int sto, size_t idx) 
-        : offs(sta, sto), grpidx(idx) {
-    }
-};
-
 // Text splitter used to take note of the position of query terms
 // inside the result text. This is then used to insert highlight tags.
 class TextSplitPTR : public TextSplit {
 public:

    // Out: begin and end byte positions of query terms/groups in text
-    vector<MatchEntry> tboffs;  
+    vector<GroupMatchEntry> m_tboffs;

    TextSplitPTR(const HighlightData& hdata)
        :  m_wcount(0), m_hdata(hdata) {
@ -96,7 +85,7 @@ public:
        // If this word is a search term, remember its byte-offset span. 
        map<string, size_t>::const_iterator it = m_terms.find(dumb);
        if (it != m_terms.end()) {
-            tboffs.push_back(MatchEntry(bts, bte, it->second));
+            m_tboffs.push_back(GroupMatchEntry(bts, bte, it->second));
        }
        
        // If word is part of a search group, update its positions list
@ -119,8 +108,6 @@ public:
    virtual bool matchGroups();

 private:
-    virtual bool matchGroup(unsigned int idx);
-
    // Word count. Used to call checkCancel from time to time.
    int m_wcount;

@ -138,166 +125,6 @@ private:
 };


-/** Sort by shorter comparison class */
-class VecIntCmpShorter {
-public:
-    /** Return true if and only if a is strictly shorter than b. */
-    bool operator()(const vector<int> *a, const vector<int> *b) {
-        return a->size() < b->size();
-    }
-};
-
-#define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS);    \
-        if ((POS) > (STO)) (STO) = (POS);}
-
-// Check that at least an entry from the first position list is inside
-// the window and recurse on next list. The window is readjusted as
-// the successive terms are found.
-//
-// @param window the search window width
-// @param plists the position list vector
-// @param i the position list to process (we then recurse with the next list)
-// @param min the current minimum pos for a found term
-// @param max the current maximum pos for a found term
-// @param sp, ep output: the found area
-// @param minpos bottom of search: this is the highest point of
-//    any previous match. We don't look below this as overlapping matches 
-//    make no sense for highlighting.
-static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
-                              unsigned int i, int min, int max, 
-                              int *sp, int *ep, int minpos)
-{
-    LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
-            min << " max " << max << " minpos " << minpos << "\n");
-    int tmp = max + 1 - window;
-    if (tmp < minpos)
-        tmp = minpos;
-
-    // Find 1st position bigger than window start
-    vector<int>::iterator it = plists[i]->begin();
-    while (it != plists[i]->end() && *it < tmp)
-        it++;
-
-    // Look for position inside window. If not found, no match. If
-    // found: if this is the last list we're done, else recurse on
-    // next list after adjusting the window
-    while (it != plists[i]->end()) {
-        int pos = *it;
-        if (pos > min + window - 1) 
-            return false;
-        if (i + 1 == plists.size()) {
-            SETMINMAX(pos, *sp, *ep);
-            return true;
-        }
-        SETMINMAX(pos, min, max);
-        if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
-            SETMINMAX(pos, *sp, *ep);
-            return true;
-        }
-        it++;
-    }
-    return false;
-}
-
-// Find NEAR matches for one group of terms, update highlight map
-bool TextSplitPTR::matchGroup(unsigned int grpidx)
-{
-    const vector<string>& terms = m_hdata.groups[grpidx];
-    int window = int(m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx]);
-
-    LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " <<
-            stringsToString(terms) << "\n");
-
-    // The position lists we are going to work with. We extract them from the 
-    // (string->plist) map
-    vector<vector<int>* > plists;
-    // A revert plist->term map. This is so that we can find who is who after
-    // sorting the plists by length.
-    map<vector<int>*, string> plistToTerm;
-
-    // Find the position list for each term in the group. It is
-    // possible that this particular group was not actually matched by
-    // the search, so that some terms are not found.
-    for (vector<string>::const_iterator it = terms.begin(); 
-         it != terms.end(); it++) {
-        map<string, vector<int> >::iterator pl = m_plists.find(*it);
-        if (pl == m_plists.end()) {
-            LOGDEB1("TextSplitPTR::matchGroup: [" << *it <<
-                    "] not found in m_plists\n");
-            return false;
-        }
-        plists.push_back(&(pl->second));
-        plistToTerm[&(pl->second)] = *it;
-    }
-    // I think this can't actually happen, was useful when we used to
-    // prune the groups, but doesn't hurt.
-    if (plists.size() < 2) {
-        LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n");
-        return false;
-    }
-    // Sort the positions lists so that the shorter is first
-    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
-
-    { // Debug
-        map<vector<int>*, string>::iterator it;
-        it =  plistToTerm.find(plists[0]);
-        if (it == plistToTerm.end()) {
-            // SuperWeird
-            LOGERR("matchGroup: term for first list not found !?!\n");
-            return false;
-        }
-        LOGDEB1("matchGroup: walking the shortest plist. Term [" <<
-                it->second << "], len " << plists[0]->size() << "\n");
-    }
-
-    // Minpos is the highest end of a found match. While looking for
-    // further matches, we don't want the search to extend before
-    // this, because it does not make sense for highlight regions to
-    // overlap
-    int minpos = 0;
-    // Walk the shortest plist and look for matches
-    for (vector<int>::iterator it = plists[0]->begin(); 
-         it != plists[0]->end(); it++) {
-        int pos = *it;
-        int sta = INT_MAX, sto = 0;
-        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
-        if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
-            LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
-                    "," << sto << "]\n"); 
-            // Maybe extend the window by 1st term position, this was not
-            // done by do_prox..
-            SETMINMAX(pos, sta, sto);
-            minpos = sto+1;
-            // Translate the position window into a byte offset window
-            map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
-            map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
-            if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
-                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
-                        i1->second.first << " " << i2->second.second << "\n");
-                tboffs.push_back(MatchEntry(i1->second.first, 
-                                            i2->second.second, grpidx));
-            } else {
-                LOGDEB0("matchGroup: no bpos found for " << sta << " or "
-                        << sto << "\n");
-            }
-        } else {
-            LOGDEB1("matchGroup: no group match found at this position\n");
-        }
-    }
-
-    return true;
-}
-
-/** Sort integer pairs by increasing first value and decreasing width */
-class PairIntCmpFirst {
-public:
-    bool operator()(const MatchEntry& a, const MatchEntry& b) {
-        if (a.offs.first != b.offs.first)
-            return a.offs.first < b.offs.first;
-        return a.offs.second > b.offs.second;
-    }
-};
-
 // Look for matches to PHRASE and NEAR term groups and finalize the
 // matched regions list (sort it by increasing start then decreasing
 // length)
@ -307,12 +134,18 @@ bool TextSplitPTR::matchGroups()
    for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
        if (m_hdata.groups[i].size() <= 1)
            continue;
-        matchGroup(i);
+        matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
    }

    // Sort regions by increasing start and decreasing width.  
    // The output process will skip overlapping entries.
-    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
+    std::sort(m_tboffs.begin(), m_tboffs.end(),
+              [](const GroupMatchEntry& a, const GroupMatchEntry& b) -> bool {
+                  if (a.offs.first != b.offs.first)
+                      return a.offs.first < b.offs.first;
+                  return a.offs.second > b.offs.second;
+              }
+        );
    return true;
 }

@ -357,7 +190,7 @@ bool PlainToRich::plaintorich(const string& in,
    // No term matches. Happens, for example on a snippet selected for
    // a term match when we are actually looking for a group match
    // (the snippet generator does this...).
-    if (splitter.tboffs.empty()) {
+    if (splitter.m_tboffs.empty()) {
        LOGDEB1("plaintorich: no term matches\n");
        ret = false;
    }
@ -365,12 +198,12 @@ bool PlainToRich::plaintorich(const string& in,
    // Iterator for the list of input term positions. We use it to
    // output highlight tags and to compute term positions in the
    // output text
-    vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
-    vector<MatchEntry>::iterator tPosEnd = splitter.tboffs.end();
+    vector<GroupMatchEntry>::iterator tPosIt = splitter.m_tboffs.begin();
+    vector<GroupMatchEntry>::iterator tPosEnd = splitter.m_tboffs.end();

 #if 0
-    for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
-         it != splitter.tboffs.end(); it++) {
+    for (vector<pair<int, int> >::const_iterator it = splitter.m_tboffs.begin();
+         it != splitter.m_tboffs.end(); it++) {
        LOGDEB2("plaintorich: region: " << it->first << " "<<it->second<< "\n");
    }
 #endif
@ -420,7 +253,7 @@ bool PlainToRich::plaintorich(const string& in,
                }
                // Skip all highlight areas that would overlap this one
                int crend = tPosIt->offs.second;
-                while (tPosIt != splitter.tboffs.end() && 
+                while (tPosIt != splitter.m_tboffs.end() && 
                       tPosIt->offs.first < crend)
                    tPosIt++;
                inrcltag = 0;
--- a/src/utils/hldata.cpp
+++ b/src/utils/hldata.cpp
@ -16,12 +16,147 @@
 */
 #include "autoconfig.h"

-#include <stdio.h>
-
 #include "hldata.h"

+#include <algorithm>
+#include <limits.h>
+
+#include "log.h"
+
 using std::string;
 using std::map;
+using std::vector;
+using std::pair;
+
+bool do_proximity_test(int window, vector<const vector<int>*>& plists,
+                       unsigned int i, int min, int max, 
+                       int *sp, int *ep, int minpos)
+{
+    LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
+            min << " max " << max << " minpos " << minpos << "\n");
+    int tmp = max + 1 - window;
+    if (tmp < minpos)
+        tmp = minpos;
+
+    // Find 1st position bigger than window start
+    auto it = plists[i]->begin();
+    while (it != plists[i]->end() && *it < tmp)
+        it++;
+
+    // Look for position inside window. If not found, no match. If
+    // found: if this is the last list we're done, else recurse on
+    // next list after adjusting the window
+    while (it != plists[i]->end()) {
+        int pos = *it;
+        if (pos > min + window - 1) 
+            return false;
+        if (i + 1 == plists.size()) {
+            setWinMinMax(pos, *sp, *ep);
+            return true;
+        }
+        setWinMinMax(pos, min, max);
+        if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
+            setWinMinMax(pos, *sp, *ep);
+            return true;
+        }
+        it++;
+    }
+    return false;
+}
+
+// Find NEAR matches for one group of terms
+bool matchGroup(const HighlightData& hldata,
+                unsigned int grpidx,
+                const map<string, vector<int>>& inplists,
+                const map<int, pair<int,int>>& gpostobytes,
+                vector<GroupMatchEntry>& tboffs
+    )
+{
+    const vector<string>& terms = hldata.groups[grpidx];
+    int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]);
+
+    LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " <<
+            stringsToString(terms) << "\n");
+
+    // The position lists we are going to work with. We extract them from the 
+    // (string->plist) map
+    vector<const vector<int>*> plists;
+    // A revert plist->term map. This is so that we can find who is who after
+    // sorting the plists by length.
+    map<const vector<int>*, string> plistToTerm;
+
+    // Find the position list for each term in the group. It is
+    // possible that this particular group was not actually matched by
+    // the search, so that some terms are not found.
+    for (const auto& term : terms) {
+        map<string, vector<int> >::const_iterator pl = inplists.find(term);
+        if (pl == inplists.end()) {
+            LOGDEB1("TextSplitPTR::matchGroup: [" << term <<
+                    "] not found in plists\n");
+            return false;
+        }
+        plists.push_back(&(pl->second));
+        plistToTerm[&(pl->second)] = term;
+    }
+    // I think this can't actually happen, was useful when we used to
+    // prune the groups, but doesn't hurt.
+    if (plists.size() < 2) {
+        LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n");
+        return false;
+    }
+    // Sort the positions lists so that the shorter is first
+    std::sort(plists.begin(), plists.end(),
+              [](const vector<int> *a, const vector<int> *b) -> bool {
+                  return a->size() < b->size();
+              }
+        );
+
+    if (0) { // Debug
+        auto it = plistToTerm.find(plists[0]);
+        if (it == plistToTerm.end()) {
+            // SuperWeird
+            LOGERR("matchGroup: term for first list not found !?!\n");
+            return false;
+        }
+        LOGDEB1("matchGroup: walking the shortest plist. Term [" <<
+                it->second << "], len " << plists[0]->size() << "\n");
+    }
+
+    // Minpos is the highest end of a found match. While looking for
+    // further matches, we don't want the search to extend before
+    // this, because it does not make sense for highlight regions to
+    // overlap
+    int minpos = 0;
+    // Walk the shortest plist and look for matches
+    for (int pos : *(plists[0])) {
+        int sta = INT_MAX, sto = 0;
+        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
+        if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
+            LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
+                    "," << sto << "]\n"); 
+            // Maybe extend the window by 1st term position, this was not
+            // done by do_prox..
+            setWinMinMax(pos, sta, sto);
+            minpos = sto + 1;
+            // Translate the position window into a byte offset window
+            auto i1 =  gpostobytes.find(sta);
+            auto i2 =  gpostobytes.find(sto);
+            if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) {
+                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
+                        i1->second.first << " " << i2->second.second << "\n");
+                tboffs.push_back(GroupMatchEntry(i1->second.first, 
+                                            i2->second.second, grpidx));
+            } else {
+                LOGDEB0("matchGroup: no bpos found for " << sta << " or "
+                        << sto << "\n");
+            }
+        } else {
+            LOGDEB1("matchGroup: no group match found at this position\n");
+        }
+    }
+
+    return true;
+}

 void HighlightData::toString(string& out)
 {
--- a/src/utils/hldata.h
+++ b/src/utils/hldata.h
@ -67,4 +67,62 @@ struct HighlightData {
    void toString(std::string& out);
 };

+inline void setWinMinMax(int pos, int& sta, int& sto)
+{
+    if (pos < sta) {
+        sta = pos;
+    }
+    if (pos > sto) {
+        sto = pos;
+    }
+}
+
+// Check that at least an entry from the first position list is inside
+// the window and recurse on next list. The window is readjusted as
+// the successive terms are found. Mostly copied from Xapian code.
+//
+// @param window the search window width
+// @param plists the position list vector
+// @param i the position list to process (we then recurse with the next list)
+// @param min the current minimum pos for a found term
+// @param max the current maximum pos for a found term
+// @param sp, ep output: the found area
+// @param minpos bottom of search: this is the highest point of
+//    any previous match. We don't look below this as overlapping matches 
+//    make no sense for highlighting.
+extern bool do_proximity_test(
+    int window, std::vector<const std::vector<int>*>& plists, 
+    unsigned int i, int min, int max, int *sp, int *ep, int minpos);
+
+
+/**** The following is used by plaintorich.cpp for finding zones to
+   highlight and by rclabsfromtext.cpp to choose fragments for the
+   abstract */
+
+struct GroupMatchEntry {
+    // Start/End byte offsets in the document text
+    std::pair<int, int> offs;
+    // Index of the search group this comes from: this is to relate a 
+    // match to the original user input.
+    size_t grpidx;
+    GroupMatchEntry(int sta, int sto, size_t idx) 
+        : offs(sta, sto), grpidx(idx) {
+    }
+};
+
+// Find NEAR matches for one group of terms.
+//
+// @param hldata Data about the user query
+// @param grpidx Index in hldata.groups for the group we process
+// @param inplists Position lists for the the group terms
+// @param gpostobytes Translation of term position to start/end byte offsets
+// @param[out] tboffs Found matches
+extern bool matchGroup(
+    const HighlightData& hldata,
+    unsigned int grpidx,
+    const std::map<std::string, std::vector<int>>& inplists,
+    const std::map<int, std::pair<int,int>>& gpostobytes,
+    std::vector<GroupMatchEntry>& tboffs
+    );
+
 #endif /* _hldata_h_included_ */