hldata: cleanup + support phrases

2019-07-05 11:43:14 +02:00 · 2019-07-05 11:43:14 +02:00 · 0b16935016
commit 0b16935016
parent 262e7260d8
2 changed files with 134 additions and 107 deletions
--- a/src/utils/hldata.cpp
+++ b/src/utils/hldata.cpp
@ -29,42 +29,6 @@ using std::map;
 using std::vector;
 using std::pair;

-bool do_proximity_test(int window, vector<const vector<int>*>& plists,
-                       unsigned int i, int min, int max, 
-                       int *sp, int *ep, int minpos)
-{
-    LOGDEB1("do_prox_test: win " << window << " i " << i << " min " <<
-            min << " max " << max << " minpos " << minpos << "\n");
-    int tmp = max + 1 - window;
-    if (tmp < minpos)
-        tmp = minpos;
-
-    // Find 1st position bigger than window start
-    auto it = plists[i]->begin();
-    while (it != plists[i]->end() && *it < tmp)
-        it++;
-
-    // Look for position inside window. If not found, no match. If
-    // found: if this is the last list we're done, else recurse on
-    // next list after adjusting the window
-    while (it != plists[i]->end()) {
-        int pos = *it;
-        if (pos > min + window - 1) 
-            return false;
-        if (i + 1 == plists.size()) {
-            setWinMinMax(pos, *sp, *ep);
-            return true;
-        }
-        setWinMinMax(pos, min, max);
-        if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
-            setWinMinMax(pos, *sp, *ep);
-            return true;
-        }
-        it++;
-    }
-    return false;
-}
-
 #undef DEBUGGROUPS
 #ifdef DEBUGGROUPS
 #define LOGRP LOGDEB
@ -72,19 +36,91 @@ bool do_proximity_test(int window, vector<const vector<int>*>& plists,
 #define LOGRP LOGDEB1
 #endif

-// Find NEAR matches for one group of terms
+// Widen the [sta, sto] position window just enough to contain pos.
+// Each bound is tested on its own, so one call may move both of them.
+static inline void setWinMinMax(int pos, int& sta, int& sto)
+{
+    if (sta > pos)
+        sta = pos;
+    if (sto < pos)
+        sto = pos;
+}
+
+/*
+ * Recursively look for one position from each list, all inside "window".
+ * @param window the total width for the "near" area, in positions.
+ * @param plists the position vectors for the terms. For a NEAR
+ *    search the array is sorted shortest first for optimization. The
+ *    function does a recursive call on the next array if the match
+ *    is still possible after dealing with the current one.
+ * @param plist_idx the index for the position list we will work with.
+ * @param min, max the current minimum and maximum term positions.
+ * @param[output] sp, ep, the start and end positions of the found match.
+ * @param minpos  Highest end of a found match. While looking for
+ *   further matches, we don't want the search to extend before
+ *   this, because it does not make sense for highlight regions to
+ *   overlap.
+ * @param isphrase if true, the position lists are in term order, and
+ *     we only look for the next match beyond the current window top.
+ * @return true if a match was found (sp and ep were then widened).
+ */
+static bool do_proximity_test(
+    const int window, vector<const vector<int>*>& plists,
+    unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
+    bool isphrase)
+{
+    LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx <<
+          " min " <<  min << " max " << max << " minpos " << minpos <<
+          " isphrase " << isphrase << "\n");
+    // Overlap interdiction: possibly adjust window start by input minpos
+    int actualminpos = isphrase ? max + 1 : max + 1 - window;
+    if (actualminpos < minpos)
+        actualminpos = minpos;
+
+    // Skip the positions which are before the lowest acceptable one
+    auto it = plists[plist_idx]->begin();
+    while (it != plists[plist_idx]->end() && *it < actualminpos)
+        ++it;
+
+    // Look for position inside window. If not found, no match. If
+    // found: if this is the last list we're done, else recurse on the
+    // next list with the adjusted window. Each matched level widens
+    // [*sp, *ep], as a middle term may lie outside the later ones.
+    while (it != plists[plist_idx]->end()) {
+        int pos = *it;
+        if (pos > min + window - 1)
+            return false;
+        if (plist_idx + 1 == plists.size()) {
+            setWinMinMax(pos, *sp, *ep);
+            return true;
+        }
+        setWinMinMax(pos, min, max);
+        if (do_proximity_test(window, plists, plist_idx + 1,
+                              min, max, sp, ep, minpos, isphrase)) {
+            setWinMinMax(pos, *sp, *ep);
+            return true;
+        }
+        ++it;
+    }
+    return false;
+}
+
+
+// Find matches for one group of terms (phrase if isphrase, else NEAR)
 bool matchGroup(const HighlightData& hldata,
                unsigned int grpidx,
                const map<string, vector<int>>& inplists,
                const map<int, pair<int,int>>& gpostobytes,
-                vector<GroupMatchEntry>& tboffs
+                vector<GroupMatchEntry>& tboffs,
+                bool isphrase
    )
 {
-    const vector<string>& terms = hldata.groups[grpidx];
-    int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]);
+    // isphrase is used as passed by the caller (the default is false)
+    const vector<string>& terms = hldata.index_term_groups[grpidx];
+    int window = int(terms.size() + hldata.slacks[grpidx]);

    LOGRP("TextSplitPTR::matchGroup:d " << window << ": " <<
-            stringsToString(terms) << "\n");
+          stringsToString(terms) << "\n");

    // The position lists we are going to work with. We extract them from the 
    // (string->plist) map
@ -100,7 +136,7 @@ bool matchGroup(const HighlightData& hldata,
        map<string, vector<int> >::const_iterator pl = inplists.find(term);
        if (pl == inplists.end()) {
            LOGRP("TextSplitPTR::matchGroup: [" << term <<
-                    "] not found in plists\n");
+                  "] not found in plists\n");
            return false;
        }
        plists.push_back(&(pl->second));
@ -112,13 +148,16 @@ bool matchGroup(const HighlightData& hldata,
        LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
        return false;
    }
-    // Sort the positions lists so that the shorter is first
-    std::sort(plists.begin(), plists.end(),
-              [](const vector<int> *a, const vector<int> *b) -> bool {
-                  return a->size() < b->size();
-              }
-        );

+    if (!isphrase) {
+        // Sort the positions lists so that the shorter is first
+        std::sort(plists.begin(), plists.end(),
+                  [](const vector<int> *a, const vector<int> *b) -> bool {
+                      return a->size() < b->size();
+                  }
+            );
+    }
+    
    if (0) { // Debug
        auto it = plistToTerm.find(plists[0]);
        if (it == plistToTerm.end()) {
@ -127,7 +166,7 @@ bool matchGroup(const HighlightData& hldata,
            return false;
        }
        LOGRP("matchGroup: walking the shortest plist. Term [" <<
-                it->second << "], len " << plists[0]->size() << "\n");
+              it->second << "], len " << plists[0]->size() << "\n");
    }

    // Minpos is the highest end of a found match. While looking for
@ -139,12 +178,11 @@ bool matchGroup(const HighlightData& hldata,
    for (int pos : *(plists[0])) {
        int sta = INT_MAX, sto = 0;
        LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
-        if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
-            LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
-                    "," << sto << "]\n"); 
-            // Maybe extend the window by 1st term position, this was not
-            // done by do_prox..
+        if (do_proximity_test(
+                window, plists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
            setWinMinMax(pos, sta, sto);
+            LOGINF("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
+                   "," << sto << "]\n"); 
            minpos = sto + 1;
            // Translate the position window into a byte offset window
            auto i1 =  gpostobytes.find(sta);
@ -153,7 +191,7 @@ bool matchGroup(const HighlightData& hldata,
                LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " <<
                        i1->second.first << " " << i2->second.second << "\n");
                tboffs.push_back(GroupMatchEntry(i1->second.first, 
-                                            i2->second.second, grpidx));
+                                                 i2->second.second, grpidx));
            } else {
                LOGDEB0("matchGroup: no bpos found for " << sta << " or "
                        << sto << "\n");
@ -169,24 +207,23 @@ bool matchGroup(const HighlightData& hldata,
 void HighlightData::toString(string& out) const
 {
    out.append("\nUser terms (orthograph): ");
-    for (std::set<string>::const_iterator it = uterms.begin();
-            it != uterms.end(); it++) {
-        out.append(" [").append(*it).append("]");
+    for (const auto& term : uterms) {
+        out.append(" [").append(term).append("]");
    }
    out.append("\nUser terms to Query terms:");
-    for (map<string, string>::const_iterator it = terms.begin();
-            it != terms.end(); it++) {
-        out.append("[").append(it->first).append("]->[");
-        out.append(it->second).append("] ");
+    for (const auto& entry: terms) {
+        out.append("[").append(entry.first).append("]->[");
+        out.append(entry.second).append("] ");
    }
    out.append("\nGroups: ");
    char cbuf[200];
    sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
-            int(groups.size()), int(grpsugidx.size()), int(ugroups.size()));
+            int(index_term_groups.size()), int(grpsugidx.size()),
+            int(ugroups.size()));
    out.append(cbuf);

    size_t ugidx = (size_t) - 1;
-    for (unsigned int i = 0; i < groups.size(); i++) {
+    for (unsigned int i = 0; i < index_term_groups.size(); i++) {
        if (ugidx != grpsugidx[i]) {
            ugidx = grpsugidx[i];
            out.append("\n(");
@ -196,8 +233,8 @@ void HighlightData::toString(string& out) const
            out.append(") ->");
        }
        out.append(" {");
-        for (unsigned int j = 0; j < groups[i].size(); j++) {
-            out.append("[").append(groups[i][j]).append("]");
+        for (unsigned int j = 0; j < index_term_groups[i].size(); j++) {
+            out.append("[").append(index_term_groups[i][j]).append("]");
        }
        sprintf(cbuf, "%d", slacks[i]);
        out.append("}").append(cbuf);
@ -212,10 +249,12 @@ void HighlightData::append(const HighlightData& hl)
    size_t ugsz0 = ugroups.size();
    ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());

-    groups.insert(groups.end(), hl.groups.begin(), hl.groups.end());
+    index_term_groups.insert(index_term_groups.end(),
+                             hl.index_term_groups.begin(),
+                             hl.index_term_groups.end());
    slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end());
    for (std::vector<size_t>::const_iterator it = hl.grpsugidx.begin();
-            it != hl.grpsugidx.end(); it++) {
+         it != hl.grpsugidx.end(); it++) {
        grpsugidx.push_back(*it + ugsz0);
    }
 }
--- a/src/utils/hldata.h
+++ b/src/utils/hldata.h
@ -40,8 +40,11 @@ struct HighlightData {
     * (unaccented and lowercased as needed depending on
     * configuration), and the list may include values
     * expanded from the original terms by stem or wildcard expansion.
+     * NEAR clauses are expanded to all possible combinations of the 
+     * stem-expanded member terms. Ex: 
+     * "clean floor"p -> (clean floor) (clean floors) (cleaning floor)...
     */
-    std::vector<std::vector<std::string> > groups;
+    std::vector<std::vector<std::string> > index_term_groups;
    /** Group slacks. Parallel to groups */
    std::vector<int> slacks;

@ -53,11 +56,10 @@ struct HighlightData {
     */
    std::vector<size_t> grpsugidx;

-    void clear()
-    {
+    void clear() {
 	uterms.clear();
 	ugroups.clear();
-	groups.clear();
+	index_term_groups.clear();
 	slacks.clear();
 	grpsugidx.clear();
    }
@ -67,35 +69,7 @@ struct HighlightData {
    void toString(std::string& out) const;
 };

-inline void setWinMinMax(int pos, int& sta, int& sto)
-{
-    if (pos < sta) {
-        sta = pos;
-    }
-    if (pos > sto) {
-        sto = pos;
-    }
-}
-
-// Check that at least an entry from the first position list is inside
-// the window and recurse on next list. The window is readjusted as
-// the successive terms are found. Mostly copied from Xapian code.
-//
-// @param window the search window width
-// @param plists the position list vector
-// @param i the position list to process (we then recurse with the next list)
-// @param min the current minimum pos for a found term
-// @param max the current maximum pos for a found term
-// @param sp, ep output: the found area
-// @param minpos bottom of search: this is the highest point of
-//    any previous match. We don't look below this as overlapping matches 
-//    make no sense for highlighting.
-extern bool do_proximity_test(
-    int window, std::vector<const std::vector<int>*>& plists, 
-    unsigned int i, int min, int max, int *sp, int *ep, int minpos);
-
-
-/**** The following is used by plaintorich.cpp for finding zones to
+/* The following is used by plaintorich.cpp for finding zones to
   highlight and by rclabsfromtext.cpp to choose fragments for the
   abstract */

@ -112,17 +86,31 @@ struct GroupMatchEntry {

 // Find NEAR matches for one group of terms.
 //
-// @param hldata Data about the user query
-// @param grpidx Index in hldata.groups for the group we process
-// @param inplists Position lists for the the group terms
-// @param gpostobytes Translation of term position to start/end byte offsets
-// @param[out] tboffs Found matches
+// @param hldata User query expansion descriptor (see above).
+//
+// @param grpidx Index in hldata.index_term_groups for the group we
+//     process. This is used by us to get the terms and slacks, and
+//     set in the output GroupMatchEntry structures to allow the
+//     caller to link a match with a specific user input (e.g. for
+//     walking the match in the GUI preview)
+//
+// @param inplists Position lists for the group terms. This is the
+//     data used to look for matches.
+//
+// @param gpostobytes Translation of term position to start/end byte
+//     offsets. This is used to translate term positions to byte
+//     positions in the output, for ease of use by caller.
+//
+// @param[out] tboffs Found matches. Each match has a begin and end
+//     byte offset and an index linking to the origin data in the
+//     HighlightData structure.
+//
+// @param isphrase Asks for phrase matching (terms taken in order)
+//     instead of NEAR matching. Defaults to false. NOTE(review):
+//     confirm that the definition honors the value passed here.
 extern bool matchGroup(
    const HighlightData& hldata,
    unsigned int grpidx,
    const std::map<std::string, std::vector<int>>& inplists,
    const std::map<int, std::pair<int,int>>& gpostobytes,
-    std::vector<GroupMatchEntry>& tboffs
+    std::vector<GroupMatchEntry>& tboffs,
+    bool isphrase = false
    );

 #endif /* _hldata_h_included_ */