Highlighting and snippets extraction: reworked to handle phrases properly. Use a compound position list instead of multiplying the OR groups inside a near clause

This commit is contained in:
Jean-Francois Dockes 2019-07-05 18:02:09 +02:00
parent 00eb803f5d
commit f877e7e459
7 changed files with 211 additions and 140 deletions

View File

@ -72,7 +72,7 @@ string PlainToRichQtPreview::PlainToRichQtPreview::header()
string PlainToRichQtPreview::startMatch(unsigned int grpidx)
{
LOGDEB2("startMatch, grpidx " << (grpidx) << "\n" );
grpidx = m_hdata->grpsugidx[grpidx];
grpidx = m_hdata->index_term_groups[grpidx].grpsugidx;
LOGDEB2("startMatch, ugrpidx " << (grpidx) << "\n" );
m_groupanchors[grpidx].push_back(++m_lastanchor);
m_groupcuranchors[grpidx] = 0;

View File

@ -53,15 +53,17 @@ public:
: m_wcount(0), m_hdata(hdata) {
// We separate single terms and groups and extract the group
// terms for computing positions list before looking for group
// matches
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
vit != hdata.groups.end(); vit++) {
if (vit->size() == 1) {
m_terms[vit->front()] = vit - hdata.groups.begin();
} else if (vit->size() > 1) {
for (vector<string>::const_iterator it = vit->begin();
it != vit->end(); it++) {
m_gterms.insert(*it);
// matches. Single terms are stored with a reference to the
// entry they come with.
for (unsigned int i = 0; i < hdata.index_term_groups.size(); i++) {
const HighlightData::TermGroup& tg(hdata.index_term_groups[i]);
if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
m_terms[tg.term] = i;
} else {
for (const auto& group : tg.orgroups) {
for (const auto& term : group) {
m_gterms.insert(term);
}
}
}
}
@ -129,13 +131,13 @@ private:
// Look for matches to PHRASE and NEAR term groups and finalize the
// matched regions list (sort it by increasing start then decreasing
// length)
// Actually, we handle all groups as NEAR (ignore order).
bool TextSplitPTR::matchGroups()
{
for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
if (m_hdata.groups[i].size() <= 1)
continue;
matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) {
if (m_hdata.index_term_groups[i].kind !=
HighlightData::TermGroup::TGK_TERM) {
matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
}
}
// Sort regions by increasing start and decreasing width.

View File

@ -119,10 +119,12 @@ public:
// Take note of the group (phrase/near) terms because we need
// to compute the position lists for them.
for (const auto& group : hdata.groups) {
if (group.size() > 1) {
for (const auto& term: group) {
m_gterms.insert(term);
for (const auto& tg : hdata.index_term_groups) {
if (tg.kind != HighlightData::TermGroup::TGK_TERM) {
for (const auto& group : tg.orgroups) {
for (const auto& term: group) {
m_gterms.insert(term);
}
}
}
}
@ -134,7 +136,9 @@ public:
LOGDEB2("takeword: " << term << endl);
// Limit time taken with monster documents. The resulting
// abstract will be incorrect or nonexistent, but this is
// better than taking forever (the default cutoff is 10E6)
// better than taking forever (the default cutoff value comes
// from the snippetMaxPosWalk configuration parameter, and is
// 10E6)
if (maxtermcount && termcount++ > maxtermcount) {
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
maxtermcount << endl);
@ -276,8 +280,9 @@ public:
// Look for matches to PHRASE and NEAR term groups and finalize
// the matched regions list (sort it by increasing start then
// decreasing length). We process all groups as NEAR (ignore order).
for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
if (m_hdata.groups[i].size() > 1) {
for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) {
if (m_hdata.index_term_groups[i].kind !=
HighlightData::TermGroup::TGK_TERM) {
matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
}
}

View File

@ -163,7 +163,7 @@ double Query::Native::qualityTerms(Xapian::docid docid,
// expanded from (by stemming)
map<string, vector<string> > byRoot;
for (const auto& term: terms) {
map<string, string>::const_iterator eit = hld.terms.find(term);
const auto eit = hld.terms.find(term);
if (eit != hld.terms.end()) {
byRoot[eit->second].push_back(term);
} else {
@ -174,9 +174,7 @@ double Query::Native::qualityTerms(Xapian::docid docid,
#ifdef DEBUGABSTRACT
{
string deb;
hld.toString(deb);
LOGABS("qualityTerms: hld: " << deb << "\n");
LOGABS("qualityTerms: hld: " << hld.toString() << "\n");
string byRootstr;
for (const auto& entry : byRoot) {
byRootstr.append("[").append(entry.first).append("]->");

View File

@ -603,11 +603,11 @@ void SearchDataClauseSimple::processSimpleSpan(
return;
// Set up the highlight data. No prefix should go in there
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
m_hldata.slacks.push_back(0);
m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
for (const auto& term : exp) {
HighlightData::TermGroup tg;
tg.term = term.substr(prefix.size());
tg.grpsugidx = m_hldata.ugroups.size() - 1;
m_hldata.index_term_groups.push_back(tg);
}
// Push either term or OR of stem-expanded set
@ -735,18 +735,16 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
original_term_wqf_booster);
pqueries.push_back(xq);
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
vector<vector<string> > allcombs;
vector<string> comb;
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
// Insert the search groups and slacks in the highlight data, with
// a reference to the user entry that generated them:
m_hldata.groups.insert(m_hldata.groups.end(),
allcombs.begin(), allcombs.end());
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
m_hldata.ugroups.size() - 1);
HighlightData::TermGroup tg;
tg.orgroups = groups;
tg.slack = slack;
tg.grpsugidx = m_hldata.ugroups.size() - 1;
tg.kind = (op == Xapian::Query::OP_PHRASE) ?
HighlightData::TermGroup::TGK_PHRASE :
HighlightData::TermGroup::TGK_NEAR;
m_hldata.index_term_groups.push_back(tg);
}
// Trim string beginning with ^ or ending with $ and convert to flags

View File

@ -31,11 +31,69 @@ using std::pair;
#undef DEBUGGROUPS
#ifdef DEBUGGROUPS
#define LOGRP LOGDEB
#define LOGRP LOGINF
#else
#define LOGRP LOGDEB1
#endif
// Combined position list for or'd terms
struct OrPList {
void addplist(const string& term, const vector<int>* pl) {
terms.push_back(term);
plists.push_back(pl);
indexes.push_back(0);
totalsize += pl->size();
}
// Returns -1 for eof, else the next smallest value in the
// combined lists, according to the current indexes.
int value() {
int minval = INT_MAX;
int minidx = -1;
for (unsigned ii = 0; ii < indexes.size(); ii++) {
const vector<int>& pl(*plists[ii]);
if (indexes[ii] >= pl.size())
continue; // this list done
if (pl[indexes[ii]] < minval) {
minval = pl[indexes[ii]];
minidx = ii;
}
}
if (minidx != -1) {
LOGRP("OrPList::value() -> " << minval << " for " <<
terms[minidx] << "\n");
currentidx = minidx;
return minval;
} else {
LOGRP("OrPList::value(): EOL for " << stringsToString(terms)<<"\n");
return -1;
}
}
int next() {
if (currentidx != -1) {
indexes[currentidx]++;
}
return value();
}
int size() const {
return totalsize;
}
void rewind() {
for (auto& idx : indexes) {
idx = 0;
}
currentidx = -1;
}
vector<const vector<int>*> plists;
vector<unsigned int> indexes;
vector<string> terms;
int currentidx{-1};
int totalsize{0};
};
static inline void setWinMinMax(int pos, int& sta, int& sto)
{
if (pos < sta) {
@ -65,42 +123,44 @@ static inline void setWinMinMax(int pos, int& sta, int& sto)
* we only look for the next match beyond the current window top.
*/
static bool do_proximity_test(
const int window, vector<const vector<int>*>& plists,
const int window, vector<OrPList>& plists,
unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
bool isphrase)
{
LOGINF("do_prox_test: win " << window << " plist_idx " << plist_idx <<
" min " << min << " max " << max << " minpos " << minpos <<
" isphrase " << isphrase << "\n");
// Overlap interdiction: possibly adjust window start by input minpos
int actualminpos = isphrase ? max + 1 : max + 1 - window;
if (actualminpos < minpos)
actualminpos = minpos;
LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx <<
" min " << min << " max " << max << " minpos " << minpos <<
" isphrase " << isphrase << " actualminpos " << actualminpos << "\n");
// Find 1st position bigger than window start
auto it = plists[plist_idx]->begin();
while (it != plists[plist_idx]->end() && *it < actualminpos)
it++;
// Find 1st position bigger than window start. A previous call may
// have advanced the index, so we begin by retrieving the current
// value.
int nextpos = plists[plist_idx].value();
while (nextpos != -1 && nextpos < actualminpos)
nextpos = plists[plist_idx].next();
// Look for position inside window. If not found, no match. If
// found: if this is the last list we're done, else recurse on
// next list after adjusting the window
while (it != plists[plist_idx]->end()) {
int pos = *it;
if (pos > min + window - 1)
while (nextpos != -1) {
if (nextpos > min + window - 1) {
return false;
}
if (plist_idx + 1 == plists.size()) {
// Done: set return values
setWinMinMax(pos, *sp, *ep);
// We already checked pos > min, now we also have pos <
// max, and we are the last list: done: set return values.
setWinMinMax(nextpos, *sp, *ep);
return true;
}
setWinMinMax(pos, min, max);
if (do_proximity_test(window,plists, plist_idx + 1,
min, max, sp, ep, minpos)) {
setWinMinMax(nextpos, min, max);
if (do_proximity_test(window, plists, plist_idx + 1,
min, max, sp, ep, minpos, isphrase)) {
return true;
}
it++;
nextpos = plists[plist_idx].next();
}
return false;
}
@ -111,63 +171,62 @@ bool matchGroup(const HighlightData& hldata,
unsigned int grpidx,
const map<string, vector<int>>& inplists,
const map<int, pair<int,int>>& gpostobytes,
vector<GroupMatchEntry>& tboffs,
bool isphrase
)
vector<GroupMatchEntry>& tboffs)
{
isphrase=true;
const vector<string>& terms = hldata.index_term_groups[grpidx];
int window = int(terms.size() + hldata.slacks[grpidx]);
LOGRP("TextSplitPTR::matchGroup:d " << window << ": " <<
stringsToString(terms) << "\n");
const auto& tg(hldata.index_term_groups[grpidx]);
bool isphrase = tg.kind == HighlightData::TermGroup::TGK_PHRASE;
string allplterms;
for (const auto& entry:inplists) {
allplterms += entry.first + " ";
}
LOGRP("matchGroup: isphrase " << isphrase <<
". Have plists for [" << allplterms << "]\n");
LOGRP("matchGroup: hldata: " << hldata.toString() << std::endl);
int window = int(tg.orgroups.size() + tg.slack);
// The position lists we are going to work with. We extract them from the
// (string->plist) map
vector<const vector<int>*> plists;
// A revert plist->term map. This is so that we can find who is who after
// sorting the plists by length.
map<const vector<int>*, string> plistToTerm;
vector<OrPList> orplists;
// Find the position list for each term in the group. It is
// possible that this particular group was not actually matched by
// the search, so that some terms are not found.
for (const auto& term : terms) {
map<string, vector<int> >::const_iterator pl = inplists.find(term);
if (pl == inplists.end()) {
LOGRP("TextSplitPTR::matchGroup: [" << term <<
"] not found in plists\n");
return false;
// Find the position list for each term in the group and build the
// combined lists for the term or groups (each group is the result
// of the expansion of one user term). It is possible that this
// particular group was not actually matched by the search, so
// that some terms are not found, in which case we bail out.
for (const auto& group : tg.orgroups) {
orplists.push_back(OrPList());
for (const auto& term : group) {
const auto pl = inplists.find(term);
if (pl == inplists.end()) {
LOGRP("TextSplitPTR::matchGroup: term [" << term <<
"] not found in plists\n");
continue;
}
orplists.back().addplist(pl->first, &(pl->second));
}
if (orplists.back().plists.empty()) {
LOGINF("No positions list found for group " <<
stringsToString(group) << std::endl);
orplists.pop_back();
}
plists.push_back(&(pl->second));
plistToTerm[&(pl->second)] = term;
}
// This probably can't happen any more; the check was useful when we
// used to prune the groups, but keeping it doesn't hurt.
if (plists.size() < 2) {
LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
if (orplists.size() < 2) {
LOGINF("TextSplitPTR::matchGroup: no actual groups found\n");
return false;
}
if (!isphrase) {
// Sort the positions lists so that the shorter is first
std::sort(plists.begin(), plists.end(),
[](const vector<int> *a, const vector<int> *b) -> bool {
return a->size() < b->size();
std::sort(orplists.begin(), orplists.end(),
[](const OrPList& a, const OrPList& b) -> bool {
return a.size() < b.size();
}
);
}
if (0) { // Debug
auto it = plistToTerm.find(plists[0]);
if (it == plistToTerm.end()) {
// SuperWeird
LOGERR("matchGroup: term for first list not found !?!\n");
return false;
}
LOGRP("matchGroup: walking the shortest plist. Term [" <<
it->second << "], len " << plists[0]->size() << "\n");
}
// Minpos is the highest end of a found match. While looking for
// further matches, we don't want the search to extend before
@ -175,11 +234,12 @@ bool matchGroup(const HighlightData& hldata,
// overlap
int minpos = 0;
// Walk the shortest plist and look for matches
for (int pos : *(plists[0])) {
int pos;
while ((pos = orplists[0].next()) != -1) {
int sta = INT_MAX, sto = 0;
LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
if (do_proximity_test(
window, plists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
window, orplists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
setWinMinMax(pos, sta, sto);
LOGINF("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
"," << sto << "]\n");
@ -204,8 +264,9 @@ bool matchGroup(const HighlightData& hldata,
return true;
}
void HighlightData::toString(string& out) const
string HighlightData::toString() const
{
string out;
out.append("\nUser terms (orthograph): ");
for (const auto& term : uterms) {
out.append(" [").append(term).append("]");
@ -217,29 +278,37 @@ void HighlightData::toString(string& out) const
}
out.append("\nGroups: ");
char cbuf[200];
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
int(index_term_groups.size()), int(grpsugidx.size()),
int(ugroups.size()));
sprintf(cbuf, "index_term_groups size %d ugroups size %d",
int(index_term_groups.size()), int(ugroups.size()));
out.append(cbuf);
size_t ugidx = (size_t) - 1;
for (unsigned int i = 0; i < index_term_groups.size(); i++) {
if (ugidx != grpsugidx[i]) {
ugidx = grpsugidx[i];
for (HighlightData::TermGroup tg : index_term_groups) {
if (ugidx != tg.grpsugidx) {
ugidx = tg.grpsugidx;
out.append("\n(");
for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) {
out.append("[").append(ugroups[ugidx][j]).append("] ");
}
out.append(") ->");
}
out.append(" {");
for (unsigned int j = 0; j < index_term_groups[i].size(); j++) {
out.append("[").append(index_term_groups[i][j]).append("]");
if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
out.append(" <").append(tg.term).append(">");
} else {
out.append(" {");
for (unsigned int j = 0; j < tg.orgroups.size(); j++) {
out.append(" {");
for (unsigned int k = 0; k < tg.orgroups[j].size(); k++) {
out.append("[").append(tg.orgroups[j][k]).append("]");
}
out.append("}");
}
sprintf(cbuf, "%d", tg.slack);
out.append("}").append(cbuf);
}
sprintf(cbuf, "%d", slacks[i]);
out.append("}").append(cbuf);
}
out.append("\n");
return out;
}
void HighlightData::append(const HighlightData& hl)
@ -249,12 +318,12 @@ void HighlightData::append(const HighlightData& hl)
size_t ugsz0 = ugroups.size();
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
size_t itgsize = index_term_groups.size();
index_term_groups.insert(index_term_groups.end(),
hl.index_term_groups.begin(),
hl.index_term_groups.end());
slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end());
for (std::vector<size_t>::const_iterator it = hl.grpsugidx.begin();
it != hl.grpsugidx.end(); it++) {
grpsugidx.push_back(*it + ugsz0);
// Adjust the grpsugidx values for the newly inserted entries
for (unsigned int idx = itgsize; idx < index_term_groups.size(); idx++) {
index_term_groups[idx].grpsugidx += ugsz0;
}
}

View File

@ -5,6 +5,7 @@
#include <string>
#include <set>
#include <map>
#include <unordered_map>
/** Store data about user search terms and their expansions. This is used
* mostly for highlighting result text and walking the matches, generating
@ -22,7 +23,7 @@ struct HighlightData {
* This is used for aggregating term stats when generating snippets (for
* choosing the best terms, allocating slots, etc. )
*/
std::map<std::string, std::string> terms;
std::unordered_map<std::string, std::string> terms;
/** The original user terms-or-groups. This is for display
* purposes: ie when creating a menu to look for a specific
@ -33,40 +34,39 @@ struct HighlightData {
std::vector<std::vector<std::string> > ugroups;
/** Processed/expanded terms and groups. Used for looking for
* regions to highlight. A group can be a PHRASE or NEAR entry (we
* process everything as NEAR to keep things reasonably
* simple. Terms are just groups with 1 entry. All
* regions to highlight. A group can be a PHRASE or NEAR entry.
* Terms are just groups with 1 entry. All
* terms are transformed to be compatible with index content
* (unaccented and lowercased as needed depending on
* configuration), and the list may include values
* expanded from the original terms by stem or wildcard expansion.
* NEAR clauses are expanded to all possible combinations of the
* stem-expanded member terms. Ex:
* "clean floor"p -> (clean floor) (clean floors) (cleaning floor)...
*/
std::vector<std::vector<std::string> > index_term_groups;
/** Group slacks. Parallel to groups */
std::vector<int> slacks;
struct TermGroup {
// We'd use a union, but can't because of the non-trivial members
std::string term;
std::vector<std::vector<std::string> > orgroups;
int slack{0};
/** Index into ugroups for each group. Parallel to groups. As a
* user term or group may generate many processed/expanded terms
* or groups, this is how we relate an expansion to its source
* (used, e.g. for generating anchors for walking search matches
* in the preview window).
*/
std::vector<size_t> grpsugidx;
/* Index into ugroups. As a user term or group may generate
* many processed/expanded terms or groups, this is how we
* relate an expansion to its source (used, e.g. for
* generating anchors for walking search matches in the
* preview window). */
size_t grpsugidx{0};
enum TGK {TGK_TERM, TGK_NEAR, TGK_PHRASE};
TGK kind{TGK_TERM};
};
std::vector<TermGroup> index_term_groups;
void clear() {
uterms.clear();
ugroups.clear();
index_term_groups.clear();
slacks.clear();
grpsugidx.clear();
}
void append(const HighlightData&);
// Print (debug)
void toString(std::string& out) const;
std::string toString() const;
};
/* The following is used by plaintorich.cpp for finding zones to
@ -109,8 +109,7 @@ extern bool matchGroup(
unsigned int grpidx,
const std::map<std::string, std::vector<int>>& inplists,
const std::map<int, std::pair<int,int>>& gpostobytes,
std::vector<GroupMatchEntry>& tboffs,
bool isphrase = false
std::vector<GroupMatchEntry>& tboffs
);
#endif /* _hldata_h_included_ */