Extracted some code from plaintorich, for reuse by the new snippets building code

2018-01-03 15:28:11 +01:00 · 2018-01-03 15:28:11 +01:00 · 175ca9832f
commit 175ca9832f
parent bb810f9ceb
3 changed files with 211 additions and 185 deletions
--- a/src/query/plaintorich.cpp
+++ b/src/query/plaintorich.cpp
@ -40,24 +40,13 @@ using std::set;
 #include "cancelcheck.h"
 #include "unacpp.h"
 /** One matched region of the document text, tagged with its search group. */
 struct MatchEntry {
     /** Build an entry from start/end byte offsets and a search group index. */
     MatchEntry(int sta, int sto, size_t idx)
         : offs{sta, sto}, grpidx{idx}
     {}
     /** (start, end) byte offsets of the match in the document text. */
     pair<int, int> offs;
     /** Index of the search group which produced the match. This lets us
      *  relate the match to the original user input. */
     size_t grpidx;
 };
 // Text splitter used to take note of the position of query terms
 // inside the result text. This is then used to insert highlight tags.
 class TextSplitPTR : public TextSplit {
 public:
    // Out: begin and end byte positions of query terms/groups in text
-    vector<MatchEntry> tboffs;  
+    vector<GroupMatchEntry> m_tboffs;
    TextSplitPTR(const HighlightData& hdata)
        :  m_wcount(0), m_hdata(hdata) {
@ -96,7 +85,7 @@ public:
        // If this word is a search term, remember its byte-offset span. 
        map<string, size_t>::const_iterator it = m_terms.find(dumb);
        if (it != m_terms.end()) {
-            tboffs.push_back(MatchEntry(bts, bte, it->second));
+            m_tboffs.push_back(GroupMatchEntry(bts, bte, it->second));
        }
        // If word is part of a search group, update its positions list
@ -119,8 +108,6 @@ public:
    virtual bool matchGroups();
 private:
    virtual bool matchGroup(unsigned int idx);
    // Word count. Used to call checkCancel from time to time.
    int m_wcount;
@ -138,166 +125,6 @@ private:
 };
 /** Comparison class used to sort position lists by increasing length. */
 class VecIntCmpShorter {
 public:
     /** Return true if and only if a is strictly shorter than b.
      *
      *  The operator is const because it does not touch any state: this
      *  lets the comparator be called through a const object or reference.
      *
      *  @param a, b the vectors to compare. Both are dereferenced, so
      *     neither may be null.
      */
     bool operator()(const vector<int> *a, const vector<int> *b) const {
         return a->size() < b->size();
     }
 };
 // Widen the [STA, STO] interval so that it includes POS. STA and STO must
 // be assignable expressions. POS, STA and STO may be evaluated several
 // times, so do not pass expressions with side effects.
 // The body is wrapped in do/while(0) so that "SETMINMAX(...);" expands to
 // exactly one statement and stays correct inside an unbraced if/else.
 #define SETMINMAX(POS, STA, STO)  do {if ((POS) < (STA)) (STA) = (POS);  \
         if ((POS) > (STO)) (STO) = (POS);} while (0)
 // Recursive window test for a group of terms: look in position list i
 // for an entry which fits inside the window defined by the terms found
 // so far, then do the same for list i+1, until the last list is reached.
 //
 // @param window the search window width
 // @param plists the position lists, one per term
 // @param i index of the list processed by this call
 // @param min lowest position of the terms found so far
 // @param max highest position of the terms found so far
 // @param sp, ep output: widened to include the positions retained in
 //    list i and the following ones when a match is found
 // @param minpos positions below this value are never considered. This is
 //    the highest point of any previous match: overlapping matches make
 //    no sense for highlighting.
 // @return true if a fitting position was found in list i and in all the
 //    following lists
 static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
                               unsigned int i, int min, int max, 
                               int *sp, int *ep, int minpos)
 {
     LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
             min << " max " << max << " minpos " << minpos << "\n");
     vector<int>& positions = *plists[i];
     const bool isLastList = (i + 1 == plists.size());
     // Lowest position worth looking at: the window start, unless this
     // is below the end of the previous match.
     int lowest = max + 1 - window;
     if (lowest < minpos) {
         lowest = minpos;
     }
     // Skip the entries located before this point
     vector<int>::iterator cur = positions.begin();
     const vector<int>::iterator last = positions.end();
     for (; cur != last && *cur < lowest; ++cur) {
     }
     // Try the remaining entries in turn. As soon as one is beyond the
     // window, the following ones can only be further away: give up.
     for (; cur != last; ++cur) {
         const int pos = *cur;
         if (pos > min + window - 1) {
             return false;
         }
         if (isLastList) {
             SETMINMAX(pos, *sp, *ep);
             return true;
         }
         // Note that min and max stay widened by this candidate even if
         // the recursion fails and we go on to the next entry.
         SETMINMAX(pos, min, max);
         if (do_proximity_test(window, plists, i + 1, min, max, sp, ep, minpos)) {
             SETMINMAX(pos, *sp, *ep);
             return true;
         }
     }
     return false;
 }
 // Find NEAR matches for one group of terms, update highlight map.
 //
 // For each match found, the byte offsets of the matched area are appended
 // to tboffs, tagged with the group index.
 //
 // @param grpidx index of the group to process in m_hdata.groups (also
 //    used to fetch the group slack from m_hdata.slacks)
 // @return false if one of the group terms has no position list in
 //    m_plists, or if less than 2 position lists were collected. true
 //    otherwise, whether or not a match was actually found.
 bool TextSplitPTR::matchGroup(unsigned int grpidx)
 {
    const vector<string>& terms = m_hdata.groups[grpidx];
    // The window width is the term count plus the slack for this group
    int window = int(m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx]);
    LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " <<
            stringsToString(terms) << "\n");
    // The position lists we are going to work with. We extract them from the 
    // (string->plist) map
    vector<vector<int>* > plists;
    // A reverse plist->term map. This is so that we can find who is who after
    // sorting the plists by length.
    map<vector<int>*, string> plistToTerm;
    // Find the position list for each term in the group. It is
    // possible that this particular group was not actually matched by
    // the search, so that some terms are not found.
    for (vector<string>::const_iterator it = terms.begin(); 
         it != terms.end(); it++) {
        map<string, vector<int> >::iterator pl = m_plists.find(*it);
        if (pl == m_plists.end()) {
            LOGDEB1("TextSplitPTR::matchGroup: [" << *it <<
                    "] not found in m_plists\n");
            return false;
        }
        plists.push_back(&(pl->second));
        plistToTerm[&(pl->second)] = *it;
    }
    // I think this can't actually happen, was useful when we used to
    // prune the groups, but doesn't hurt.
    if (plists.size() < 2) {
        LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n");
        return false;
    }
    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
    { // Debug: retrieve the term for the shortest list. Note that a
      // failed lookup aborts the whole call.
        map<vector<int>*, string>::iterator it;
        it =  plistToTerm.find(plists[0]);
        if (it == plistToTerm.end()) {
            // SuperWeird
            LOGERR("matchGroup: term for first list not found !?!\n");
            return false;
        }
        LOGDEB1("matchGroup: walking the shortest plist. Term [" <<
                it->second << "], len " << plists[0]->size() << "\n");
    }
    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
    // this, because it does not make sense for highlight regions to
    // overlap
    int minpos = 0;
    // Walk the shortest plist and look for matches
    for (vector<int>::iterator it = plists[0]->begin(); 
         it != plists[0]->end(); it++) {
        int pos = *it;
        // Start from an inverted interval: any position below INT_MAX
        // lowers sta, any position above 0 raises sto.
        int sta = INT_MAX, sto = 0;
        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
        // The first list is handled here: the recursion begins with list 1,
        // and a window reduced to the current position.
        if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
            LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
                    "," << sto << "]\n"); 
            // Maybe extend the window by 1st term position, this was not
            // done by do_prox..
            SETMINMAX(pos, sta, sto);
            // The next searches will begin after the end of this match
            minpos = sto+1;
            // Translate the position window into a byte offset window
            map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
            map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
            if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
                        i1->second.first << " " << i2->second.second << "\n");
                // The area goes from the first byte of the first term to
                // the last byte of the last one
                tboffs.push_back(MatchEntry(i1->second.first, 
                                            i2->second.second, grpidx));
            } else {
                // No byte offsets for one of the ends: the match is logged
                // and dropped
                LOGDEB0("matchGroup: no bpos found for " << sta << " or "
                        << sto << "\n");
            }
        } else {
            LOGDEB1("matchGroup: no group match found at this position\n");
        }
    }
    return true;
 }
 /** Sort match entries by increasing start offset and, for entries with
  *  the same start, by decreasing end offset (so decreasing width). */
 class PairIntCmpFirst {
 public:
     /** Return true if a must be sorted before b.
      *
      *  The operator is const because it does not touch any state: this
      *  lets the comparator be called through a const object or reference.
      */
     bool operator()(const MatchEntry& a, const MatchEntry& b) const {
         // Different starts: the lowest one goes first
         if (a.offs.first != b.offs.first)
             return a.offs.first < b.offs.first;
         // Same start: the entry ending further (the widest) goes first
         return a.offs.second > b.offs.second;
     }
 };
 // Look for matches to PHRASE and NEAR term groups and finalize the
 // matched regions list (sort it by increasing start then decreasing
 // length)
@ -307,12 +134,18 @@ bool TextSplitPTR::matchGroups()
    for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
        if (m_hdata.groups[i].size() <= 1)
            continue;
-        matchGroup(i);
+        matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
    }
    // Sort regions by increasing start and decreasing width.  
    // The output process will skip overlapping entries.
-    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
+    std::sort(m_tboffs.begin(), m_tboffs.end(),
              [](const GroupMatchEntry& a, const GroupMatchEntry& b) -> bool {
                  if (a.offs.first != b.offs.first)
                      return a.offs.first < b.offs.first;
                  return a.offs.second > b.offs.second;
              }
        );
    return true;
 }
@ -357,7 +190,7 @@ bool PlainToRich::plaintorich(const string& in,
    // No term matches. Happens, for example on a snippet selected for
    // a term match when we are actually looking for a group match
    // (the snippet generator does this...).
-    if (splitter.tboffs.empty()) {
+    if (splitter.m_tboffs.empty()) {
        LOGDEB1("plaintorich: no term matches\n");
        ret = false;
    }
@ -365,12 +198,12 @@ bool PlainToRich::plaintorich(const string& in,
    // Iterator for the list of input term positions. We use it to
    // output highlight tags and to compute term positions in the
    // output text
-    vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
+    vector<GroupMatchEntry>::iterator tPosIt = splitter.m_tboffs.begin();
-    vector<MatchEntry>::iterator tPosEnd = splitter.tboffs.end();
+    vector<GroupMatchEntry>::iterator tPosEnd = splitter.m_tboffs.end();
 #if 0
-    for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
+    for (vector<pair<int, int> >::const_iterator it = splitter.m_tboffs.begin();
-         it != splitter.tboffs.end(); it++) {
+         it != splitter.m_tboffs.end(); it++) {
        LOGDEB2("plaintorich: region: " << it->first << " "<<it->second<< "\n");
    }
 #endif
@ -420,7 +253,7 @@ bool PlainToRich::plaintorich(const string& in,
                }
                // Skip all highlight areas that would overlap this one
                int crend = tPosIt->offs.second;
-                while (tPosIt != splitter.tboffs.end() && 
+                while (tPosIt != splitter.m_tboffs.end() && 
                       tPosIt->offs.first < crend)
                    tPosIt++;
                inrcltag = 0;
--- a/src/utils/hldata.cpp
+++ b/src/utils/hldata.cpp
@ -16,12 +16,147 @@
 */
 #include "autoconfig.h"
 #include <stdio.h>
 #include "hldata.h"
 #include <algorithm>
 #include <limits.h>
 #include "log.h"
 using std::string;
 using std::map;
 using std::vector;
 using std::pair;
 // Recursive window test: look in position list i for an entry which fits
 // inside the window defined by the terms found so far, then do the same
 // for list i+1, until the last list is reached. Returns true if a
 // fitting position was found in list i and in all the following ones, in
 // which case *sp and *ep are widened to include the retained positions.
 bool do_proximity_test(int window, vector<const vector<int>*>& plists,
                        unsigned int i, int min, int max, 
                        int *sp, int *ep, int minpos)
 {
     LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
             min << " max " << max << " minpos " << minpos << "\n");
     const vector<int>& positions = *plists[i];
     const bool isLastList = (i + 1 == plists.size());
     // Lowest position worth looking at: the window start, unless this
     // is below the end of the previous match.
     int lowest = max + 1 - window;
     if (lowest < minpos) {
         lowest = minpos;
     }
     // Skip the entries located before this point
     auto cur = positions.begin();
     const auto last = positions.end();
     for (; cur != last && *cur < lowest; ++cur) {
     }
     // Try the remaining entries in turn. As soon as one is beyond the
     // window, the following ones can only be further away: give up.
     for (; cur != last; ++cur) {
         const int pos = *cur;
         if (pos > min + window - 1) {
             return false;
         }
         if (isLastList) {
             setWinMinMax(pos, *sp, *ep);
             return true;
         }
         // Note that min and max stay widened by this candidate even if
         // the recursion fails and we go on to the next entry.
         setWinMinMax(pos, min, max);
         if (do_proximity_test(window, plists, i + 1, min, max, sp, ep, minpos)) {
             setWinMinMax(pos, *sp, *ep);
             return true;
         }
     }
     return false;
 }
 // Find NEAR matches for one group of terms.
 //
 // For each match found, the byte offsets of the matched area are appended
 // to tboffs, tagged with the group index.
 //
 // @param hldata data about the user query: we use the groups and slacks
 //    members
 // @param grpidx index of the group to process in hldata.groups (also
 //    used to fetch the group slack from hldata.slacks)
 // @param inplists term to position list map
 // @param gpostobytes term position to (start, end) byte offsets map
 // @param tboffs output: the found matches are appended to it
 // @return false if one of the group terms has no position list in
 //    inplists, or if less than 2 position lists were collected. true
 //    otherwise, whether or not a match was actually found.
 bool matchGroup(const HighlightData& hldata,
                unsigned int grpidx,
                const map<string, vector<int>>& inplists,
                const map<int, pair<int,int>>& gpostobytes,
                vector<GroupMatchEntry>& tboffs
    )
 {
    const vector<string>& terms = hldata.groups[grpidx];
    // The window width is the term count plus the slack for this group
    int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]);
    LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " <<
            stringsToString(terms) << "\n");
    // The position lists we are going to work with. We extract them from the 
    // (string->plist) map
    vector<const vector<int>*> plists;
    // A reverse plist->term map. This is so that we can find who is who after
    // sorting the plists by length.
    map<const vector<int>*, string> plistToTerm;
    // Find the position list for each term in the group. It is
    // possible that this particular group was not actually matched by
    // the search, so that some terms are not found.
    for (const auto& term : terms) {
        map<string, vector<int> >::const_iterator pl = inplists.find(term);
        if (pl == inplists.end()) {
            LOGDEB1("TextSplitPTR::matchGroup: [" << term <<
                    "] not found in plists\n");
            return false;
        }
        plists.push_back(&(pl->second));
        plistToTerm[&(pl->second)] = term;
    }
    // I think this can't actually happen, was useful when we used to
    // prune the groups, but doesn't hurt.
    if (plists.size() < 2) {
        LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n");
        return false;
    }
    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(),
              [](const vector<int> *a, const vector<int> *b) -> bool {
                  return a->size() < b->size();
              }
        );
    // Disabled debug block: it never runs, so that, inside this function,
    // plistToTerm is currently filled but never read.
    if (0) { // Debug
        auto it = plistToTerm.find(plists[0]);
        if (it == plistToTerm.end()) {
            // SuperWeird
            LOGERR("matchGroup: term for first list not found !?!\n");
            return false;
        }
        LOGDEB1("matchGroup: walking the shortest plist. Term [" <<
                it->second << "], len " << plists[0]->size() << "\n");
    }
    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
    // this, because it does not make sense for highlight regions to
    // overlap
    int minpos = 0;
    // Walk the shortest plist and look for matches
    for (int pos : *(plists[0])) {
        // Start from an inverted interval: any position below INT_MAX
        // lowers sta, any position above 0 raises sto.
        int sta = INT_MAX, sto = 0;
        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
        // The first list is handled here: the recursion begins with list 1,
        // and a window reduced to the current position.
        if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
            LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
                    "," << sto << "]\n"); 
            // Maybe extend the window by 1st term position, this was not
            // done by do_prox..
            setWinMinMax(pos, sta, sto);
            // The next searches will begin after the end of this match
            minpos = sto + 1;
            // Translate the position window into a byte offset window
            auto i1 =  gpostobytes.find(sta);
            auto i2 =  gpostobytes.find(sto);
            if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) {
                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
                        i1->second.first << " " << i2->second.second << "\n");
                // The area goes from the first byte of the first term to
                // the last byte of the last one
                tboffs.push_back(GroupMatchEntry(i1->second.first, 
                                            i2->second.second, grpidx));
            } else {
                // No byte offsets for one of the ends: the match is logged
                // and dropped
                LOGDEB0("matchGroup: no bpos found for " << sta << " or "
                        << sto << "\n");
            }
        } else {
            LOGDEB1("matchGroup: no group match found at this position\n");
        }
    }
    return true;
 }
 void HighlightData::toString(string& out)
 {
--- a/src/utils/hldata.h
+++ b/src/utils/hldata.h
@ -67,4 +67,62 @@ struct HighlightData {
    void toString(std::string& out);
 };
 // Widen the [sta, sto] interval just enough for it to include pos: sta is
 // lowered to pos if pos is below it, sto is raised to pos if pos is above
 // it.
 inline void setWinMinMax(int pos, int& sta, int& sto)
 {
     sta = (pos < sta) ? pos : sta;
     sto = (pos > sto) ? pos : sto;
 }
 // Check that at least an entry from the first position list is inside
 // the window and recurse on next list. The window is readjusted as
 // the successive terms are found. Mostly copied from Xapian code.
 //
 // @param window the search window width
 // @param plists the position list vector
 // @param i the position list to process (we then recurse with the next list)
 // @param min the current minimum pos for a found term
 // @param max the current maximum pos for a found term
 // @param sp, ep output: the found area. These are only widened (see
 //    setWinMinMax()), never reset, so the caller must initialize them,
 //    as matchGroup() does with INT_MAX and 0.
 // @param minpos bottom of search: this is the highest point of
 //    any previous match. We don't look below this as overlapping matches 
 //    make no sense for highlighting.
 // @return true if a position inside the window was found in list i and in
 //    all the following lists, else false.
 extern bool do_proximity_test(
    int window, std::vector<const std::vector<int>*>& plists, 
    unsigned int i, int min, int max, int *sp, int *ep, int minpos);
 /**** The following is used by plaintorich.cpp for finding zones to
   highlight and by rclabsfromtext.cpp to choose fragments for the
   abstract */
 struct GroupMatchEntry {
     /** Build an entry from start/end byte offsets and a search group index. */
     GroupMatchEntry(int sta, int sto, size_t idx)
         : offs{sta, sto}, grpidx{idx}
     {}
     /** (start, end) byte offsets of the match in the document text. */
     std::pair<int, int> offs;
     /** Index of the search group which produced the match. This lets us
      *  relate the match to the original user input. */
     size_t grpidx;
 };
 // Find NEAR matches for one group of terms.
 //
 // @param hldata Data about the user query
 // @param grpidx Index in hldata.groups for the group we process
 // @param inplists Position lists for the group terms
 // @param gpostobytes Translation of term position to start/end byte offsets
 // @param[out] tboffs Found matches. The new entries are appended, the
 //    existing ones are left alone.
 // @return false if one of the group terms has no entry in inplists, or if
 //    less than 2 position lists were collected. true otherwise, whether or
 //    not a match was actually found.
 extern bool matchGroup(
    const HighlightData& hldata,
    unsigned int grpidx,
    const std::map<std::string, std::vector<int>>& inplists,
    const std::map<int, std::pair<int,int>>& gpostobytes,
    std::vector<GroupMatchEntry>& tboffs
    );
 #endif /* _hldata_h_included_ */