From 175ca9832f8d4700efb75768037a85e8be3c9d70 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 3 Jan 2018 15:28:11 +0100 Subject: [PATCH] Extracted some code from plaintorich, for reuse by the new snippets building code --- src/query/plaintorich.cpp | 199 +++----------------------------------- src/utils/hldata.cpp | 139 +++++++++++++++++++++++++- src/utils/hldata.h | 58 +++++++++++ 3 files changed, 211 insertions(+), 185 deletions(-) diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp index 6e7813f0..7076d344 100644 --- a/src/query/plaintorich.cpp +++ b/src/query/plaintorich.cpp @@ -40,24 +40,13 @@ using std::set; #include "cancelcheck.h" #include "unacpp.h" -struct MatchEntry { - // Start/End byte offsets in the document text - pair offs; - // Index of the search group this comes from: this is to relate a - // match to the original user input. - size_t grpidx; - MatchEntry(int sta, int sto, size_t idx) - : offs(sta, sto), grpidx(idx) { - } -}; - // Text splitter used to take note of the position of query terms // inside the result text. This is then used to insert highlight tags. class TextSplitPTR : public TextSplit { public: // Out: begin and end byte positions of query terms/groups in text - vector tboffs; + vector m_tboffs; TextSplitPTR(const HighlightData& hdata) : m_wcount(0), m_hdata(hdata) { @@ -96,7 +85,7 @@ public: // If this word is a search term, remember its byte-offset span. map::const_iterator it = m_terms.find(dumb); if (it != m_terms.end()) { - tboffs.push_back(MatchEntry(bts, bte, it->second)); + m_tboffs.push_back(GroupMatchEntry(bts, bte, it->second)); } // If word is part of a search group, update its positions list @@ -119,8 +108,6 @@ public: virtual bool matchGroups(); private: - virtual bool matchGroup(unsigned int idx); - // Word count. Used to call checkCancel from time to time. int m_wcount; @@ -138,166 +125,6 @@ private: }; -/** Sort by shorter comparison class */ -class VecIntCmpShorter { -public: - /** Return true if and only if a is strictly shorter than b. */ - bool operator()(const vector *a, const vector *b) { - return a->size() < b->size(); - } -}; - -#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \ - if ((POS) > (STO)) (STO) = (POS);} - -// Check that at least an entry from the first position list is inside -// the window and recurse on next list. The window is readjusted as -// the successive terms are found. -// -// @param window the search window width -// @param plists the position list vector -// @param i the position list to process (we then recurse with the next list) -// @param min the current minimum pos for a found term -// @param max the current maximum pos for a found term -// @param sp, ep output: the found area -// @param minpos bottom of search: this is the highest point of -// any previous match. We don't look below this as overlapping matches -// make no sense for highlighting. -static bool do_proximity_test(int window, vector* >& plists, - unsigned int i, int min, int max, - int *sp, int *ep, int minpos) -{ - LOGDEB1("do_prox_test: win " << window << " i " << i << " min " << - min << " max " << max << " minpos " << minpos << "\n"); - int tmp = max + 1 - window; - if (tmp < minpos) - tmp = minpos; - - // Find 1st position bigger than window start - vector::iterator it = plists[i]->begin(); - while (it != plists[i]->end() && *it < tmp) - it++; - - // Look for position inside window. If not found, no match. If - // found: if this is the last list we're done, else recurse on - // next list after adjusting the window - while (it != plists[i]->end()) { - int pos = *it; - if (pos > min + window - 1) - return false; - if (i + 1 == plists.size()) { - SETMINMAX(pos, *sp, *ep); - return true; - } - SETMINMAX(pos, min, max); - if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) { - SETMINMAX(pos, *sp, *ep); - return true; - } - it++; - } - return false; -} - -// Find NEAR matches for one group of terms, update highlight map -bool TextSplitPTR::matchGroup(unsigned int grpidx) -{ - const vector& terms = m_hdata.groups[grpidx]; - int window = int(m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx]); - - LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " << - stringsToString(terms) << "\n"); - - // The position lists we are going to work with. We extract them from the - // (string->plist) map - vector* > plists; - // A revert plist->term map. This is so that we can find who is who after - // sorting the plists by length. - map*, string> plistToTerm; - - // Find the position list for each term in the group. It is - // possible that this particular group was not actually matched by - // the search, so that some terms are not found. - for (vector::const_iterator it = terms.begin(); - it != terms.end(); it++) { - map >::iterator pl = m_plists.find(*it); - if (pl == m_plists.end()) { - LOGDEB1("TextSplitPTR::matchGroup: [" << *it << - "] not found in m_plists\n"); - return false; - } - plists.push_back(&(pl->second)); - plistToTerm[&(pl->second)] = *it; - } - // I think this can't actually happen, was useful when we used to - // prune the groups, but doesn't hurt. - if (plists.size() < 2) { - LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n"); - return false; - } - // Sort the positions lists so that the shorter is first - std::sort(plists.begin(), plists.end(), VecIntCmpShorter()); - - { // Debug - map*, string>::iterator it; - it = plistToTerm.find(plists[0]); - if (it == plistToTerm.end()) { - // SuperWeird - LOGERR("matchGroup: term for first list not found !?!\n"); - return false; - } - LOGDEB1("matchGroup: walking the shortest plist. Term [" << - it->second << "], len " << plists[0]->size() << "\n"); - } - - // Minpos is the highest end of a found match. While looking for - // further matches, we don't want the search to extend before - // this, because it does not make sense for highlight regions to - // overlap - int minpos = 0; - // Walk the shortest plist and look for matches - for (vector::iterator it = plists[0]->begin(); - it != plists[0]->end(); it++) { - int pos = *it; - int sta = INT_MAX, sto = 0; - LOGDEB2("MatchGroup: Testing at pos " << pos << "\n"); - if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) { - LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta << - "," << sto << "]\n"); - // Maybe extend the window by 1st term position, this was not - // done by do_prox.. - SETMINMAX(pos, sta, sto); - minpos = sto+1; - // Translate the position window into a byte offset window - map >::iterator i1 = m_gpostobytes.find(sta); - map >::iterator i2 = m_gpostobytes.find(sto); - if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { - LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " << - i1->second.first << " " << i2->second.second << "\n"); - tboffs.push_back(MatchEntry(i1->second.first, - i2->second.second, grpidx)); - } else { - LOGDEB0("matchGroup: no bpos found for " << sta << " or " - << sto << "\n"); - } - } else { - LOGDEB1("matchGroup: no group match found at this position\n"); - } - } - - return true; -} - -/** Sort integer pairs by increasing first value and decreasing width */ -class PairIntCmpFirst { -public: - bool operator()(const MatchEntry& a, const MatchEntry& b) { - if (a.offs.first != b.offs.first) - return a.offs.first < b.offs.first; - return a.offs.second > b.offs.second; - } -}; - // Look for matches to PHRASE and NEAR term groups and finalize the // matched regions list (sort it by increasing start then decreasing // length) @@ -307,12 +134,18 @@ bool TextSplitPTR::matchGroups() for (unsigned int i = 0; i < m_hdata.groups.size(); i++) { if (m_hdata.groups[i].size() <= 1) continue; - matchGroup(i); + matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs); } // Sort regions by increasing start and decreasing width. // The output process will skip overlapping entries. - std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst()); + std::sort(m_tboffs.begin(), m_tboffs.end(), + [](const GroupMatchEntry& a, const GroupMatchEntry& b) -> bool { + if (a.offs.first != b.offs.first) + return a.offs.first < b.offs.first; + return a.offs.second > b.offs.second; + } + ); return true; } @@ -357,7 +190,7 @@ bool PlainToRich::plaintorich(const string& in, // No term matches. Happens, for example on a snippet selected for // a term match when we are actually looking for a group match // (the snippet generator does this...). - if (splitter.tboffs.empty()) { + if (splitter.m_tboffs.empty()) { LOGDEB1("plaintorich: no term matches\n"); ret = false; } @@ -365,12 +198,12 @@ bool PlainToRich::plaintorich(const string& in, // Iterator for the list of input term positions. We use it to // output highlight tags and to compute term positions in the // output text - vector::iterator tPosIt = splitter.tboffs.begin(); - vector::iterator tPosEnd = splitter.tboffs.end(); + vector::iterator tPosIt = splitter.m_tboffs.begin(); + vector::iterator tPosEnd = splitter.m_tboffs.end(); #if 0 - for (vector >::const_iterator it = splitter.tboffs.begin(); - it != splitter.tboffs.end(); it++) { + for (vector >::const_iterator it = splitter.m_tboffs.begin(); + it != splitter.m_tboffs.end(); it++) { LOGDEB2("plaintorich: region: " << it->first << " "<second<< "\n"); } #endif @@ -420,7 +253,7 @@ bool PlainToRich::plaintorich(const string& in, } // Skip all highlight areas that would overlap this one int crend = tPosIt->offs.second; - while (tPosIt != splitter.tboffs.end() && + while (tPosIt != splitter.m_tboffs.end() && tPosIt->offs.first < crend) tPosIt++; inrcltag = 0; diff --git a/src/utils/hldata.cpp b/src/utils/hldata.cpp index 44fcef94..ee1d4d0d 100644 --- a/src/utils/hldata.cpp +++ b/src/utils/hldata.cpp @@ -16,12 +16,147 @@ */ #include "autoconfig.h" -#include - #include "hldata.h" +#include +#include + +#include "log.h" + using std::string; using std::map; +using std::vector; +using std::pair; + +bool do_proximity_test(int window, vector*>& plists, + unsigned int i, int min, int max, + int *sp, int *ep, int minpos) +{ + LOGDEB1("do_prox_test: win " << window << " i " << i << " min " << + min << " max " << max << " minpos " << minpos << "\n"); + int tmp = max + 1 - window; + if (tmp < minpos) + tmp = minpos; + + // Find 1st position bigger than window start + auto it = plists[i]->begin(); + while (it != plists[i]->end() && *it < tmp) + it++; + + // Look for position inside window. If not found, no match. If + // found: if this is the last list we're done, else recurse on + // next list after adjusting the window + while (it != plists[i]->end()) { + int pos = *it; + if (pos > min + window - 1) + return false; + if (i + 1 == plists.size()) { + setWinMinMax(pos, *sp, *ep); + return true; + } + setWinMinMax(pos, min, max); + if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) { + setWinMinMax(pos, *sp, *ep); + return true; + } + it++; + } + return false; +} + +// Find NEAR matches for one group of terms +bool matchGroup(const HighlightData& hldata, + unsigned int grpidx, + const map>& inplists, + const map>& gpostobytes, + vector& tboffs + ) +{ + const vector& terms = hldata.groups[grpidx]; + int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]); + + LOGDEB1("TextSplitPTR::matchGroup:d " << window << ": " << + stringsToString(terms) << "\n"); + + // The position lists we are going to work with. We extract them from the + // (string->plist) map + vector*> plists; + // A revert plist->term map. This is so that we can find who is who after + // sorting the plists by length. + map*, string> plistToTerm; + + // Find the position list for each term in the group. It is + // possible that this particular group was not actually matched by + // the search, so that some terms are not found. + for (const auto& term : terms) { + map >::const_iterator pl = inplists.find(term); + if (pl == inplists.end()) { + LOGDEB1("TextSplitPTR::matchGroup: [" << term << + "] not found in plists\n"); + return false; + } + plists.push_back(&(pl->second)); + plistToTerm[&(pl->second)] = term; + } + // I think this can't actually happen, was useful when we used to + // prune the groups, but doesn't hurt. + if (plists.size() < 2) { + LOGDEB1("TextSplitPTR::matchGroup: no actual groups found\n"); + return false; + } + // Sort the positions lists so that the shorter is first + std::sort(plists.begin(), plists.end(), + [](const vector *a, const vector *b) -> bool { + return a->size() < b->size(); + } + ); + + if (0) { // Debug + auto it = plistToTerm.find(plists[0]); + if (it == plistToTerm.end()) { + // SuperWeird + LOGERR("matchGroup: term for first list not found !?!\n"); + return false; + } + LOGDEB1("matchGroup: walking the shortest plist. Term [" << + it->second << "], len " << plists[0]->size() << "\n"); + } + + // Minpos is the highest end of a found match. While looking for + // further matches, we don't want the search to extend before + // this, because it does not make sense for highlight regions to + // overlap + int minpos = 0; + // Walk the shortest plist and look for matches + for (int pos : *(plists[0])) { + int sta = INT_MAX, sto = 0; + LOGDEB2("MatchGroup: Testing at pos " << pos << "\n"); + if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) { + LOGDEB1("TextSplitPTR::matchGroup: MATCH termpos [" << sta << + "," << sto << "]\n"); + // Maybe extend the window by 1st term position, this was not + // done by do_prox.. + setWinMinMax(pos, sta, sto); + minpos = sto + 1; + // Translate the position window into a byte offset window + auto i1 = gpostobytes.find(sta); + auto i2 = gpostobytes.find(sto); + if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) { + LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " << + i1->second.first << " " << i2->second.second << "\n"); + tboffs.push_back(GroupMatchEntry(i1->second.first, + i2->second.second, grpidx)); + } else { + LOGDEB0("matchGroup: no bpos found for " << sta << " or " + << sto << "\n"); + } + } else { + LOGDEB1("matchGroup: no group match found at this position\n"); + } + } + + return true; +} void HighlightData::toString(string& out) { diff --git a/src/utils/hldata.h b/src/utils/hldata.h index d6886c34..6344eb27 100644 --- a/src/utils/hldata.h +++ b/src/utils/hldata.h @@ -67,4 +67,62 @@ struct HighlightData { void toString(std::string& out); }; +inline void setWinMinMax(int pos, int& sta, int& sto) +{ + if (pos < sta) { + sta = pos; + } + if (pos > sto) { + sto = pos; + } +} + +// Check that at least an entry from the first position list is inside +// the window and recurse on next list. The window is readjusted as +// the successive terms are found. Mostly copied from Xapian code. +// +// @param window the search window width +// @param plists the position list vector +// @param i the position list to process (we then recurse with the next list) +// @param min the current minimum pos for a found term +// @param max the current maximum pos for a found term +// @param sp, ep output: the found area +// @param minpos bottom of search: this is the highest point of +// any previous match. We don't look below this as overlapping matches +// make no sense for highlighting. +extern bool do_proximity_test( + int window, std::vector*>& plists, + unsigned int i, int min, int max, int *sp, int *ep, int minpos); + + +/**** The following is used by plaintorich.cpp for finding zones to + highlight and by rclabsfromtext.cpp to choose fragments for the + abstract */ + +struct GroupMatchEntry { + // Start/End byte offsets in the document text + std::pair offs; + // Index of the search group this comes from: this is to relate a + // match to the original user input. + size_t grpidx; + GroupMatchEntry(int sta, int sto, size_t idx) + : offs(sta, sto), grpidx(idx) { + } +}; + +// Find NEAR matches for one group of terms. +// +// @param hldata Data about the user query +// @param grpidx Index in hldata.groups for the group we process +// @param inplists Position lists for the the group terms +// @param gpostobytes Translation of term position to start/end byte offsets +// @param[out] tboffs Found matches +extern bool matchGroup( + const HighlightData& hldata, + unsigned int grpidx, + const std::map>& inplists, + const std::map>& gpostobytes, + std::vector& tboffs + ); + #endif /* _hldata_h_included_ */