diff --git a/src/utils/hldata.cpp b/src/utils/hldata.cpp index 2830c693..b0182940 100644 --- a/src/utils/hldata.cpp +++ b/src/utils/hldata.cpp @@ -29,42 +29,6 @@ using std::map; using std::vector; using std::pair; -bool do_proximity_test(int window, vector*>& plists, - unsigned int i, int min, int max, - int *sp, int *ep, int minpos) -{ - LOGDEB1("do_prox_test: win " << window << " i " << i << " min " << - min << " max " << max << " minpos " << minpos << "\n"); - int tmp = max + 1 - window; - if (tmp < minpos) - tmp = minpos; - - // Find 1st position bigger than window start - auto it = plists[i]->begin(); - while (it != plists[i]->end() && *it < tmp) - it++; - - // Look for position inside window. If not found, no match. If - // found: if this is the last list we're done, else recurse on - // next list after adjusting the window - while (it != plists[i]->end()) { - int pos = *it; - if (pos > min + window - 1) - return false; - if (i + 1 == plists.size()) { - setWinMinMax(pos, *sp, *ep); - return true; - } - setWinMinMax(pos, min, max); - if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) { - setWinMinMax(pos, *sp, *ep); - return true; - } - it++; - } - return false; -} - #undef DEBUGGROUPS #ifdef DEBUGGROUPS #define LOGRP LOGDEB @@ -72,19 +36,91 @@ bool do_proximity_test(int window, vector*>& plists, #define LOGRP LOGDEB1 #endif -// Find NEAR matches for one group of terms +static inline void setWinMinMax(int pos, int& sta, int& sto) +{ + if (pos < sta) { + sta = pos; + } + if (pos > sto) { + sto = pos; + } +} + +/* Recursive proximity check: look for a position in list plist_idx which fits the window, then recurse on the next list. Returns true on match. + * @param window the total width for the "near" area, in positions. + + * @param plists the position vectors for the terms. The array is + * sorted shortest first for optimization. The function does a + * recursive call on the next array if the match is still possible + * after dealing with the current one + + * @param plist_idx the index for the position list we will work with. 
+ * @param min, max the current minimum and maximum term positions. + * @param[output] sp, ep, the start and end positions of the found match. + * @param minpos Highest end of a found match. While looking for + * further matches, we don't want the search to extend before + * this, because it does not make sense for highlight regions to + * overlap. + * @param isphrase if true, the position lists are in term order, and + * we only look for the next match beyond the current window top. + */ +static bool do_proximity_test( + const int window, vector*>& plists, + unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos, + bool isphrase) +{ + LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx << + " min " << min << " max " << max << " minpos " << minpos << + " isphrase " << isphrase << "\n"); + + // Overlap interdiction: possibly adjust window start by input minpos + int actualminpos = isphrase ? max + 1 : max + 1 - window; + if (actualminpos < minpos) + actualminpos = minpos; + + // Find 1st position bigger than window start + auto it = plists[plist_idx]->begin(); + while (it != plists[plist_idx]->end() && *it < actualminpos) + it++; + + // Look for position inside window. If not found, no match. 
If + // found: if this is the last list we're done, else recurse on + // next list after adjusting the window + while (it != plists[plist_idx]->end()) { + int pos = *it; + if (pos > min + window - 1) + return false; + if (plist_idx + 1 == plists.size()) { + // Done: set return values + setWinMinMax(pos, *sp, *ep); + return true; + } + setWinMinMax(pos, min, max); + if (do_proximity_test(window,plists, plist_idx + 1, + min, max, sp, ep, minpos, isphrase)) { + setWinMinMax(pos, *sp, *ep); // widen the result to this list's position too + return true; + } + it++; + } + return false; +} + + +// Find matches for one group of terms bool matchGroup(const HighlightData& hldata, unsigned int grpidx, const map>& inplists, const map>& gpostobytes, - vector& tboffs + vector& tboffs, + bool isphrase ) { - const vector& terms = hldata.groups[grpidx]; - int window = int(hldata.groups[grpidx].size() + hldata.slacks[grpidx]); + const vector& terms = hldata.index_term_groups[grpidx]; + int window = int(terms.size() + hldata.slacks[grpidx]); LOGRP("TextSplitPTR::matchGroup:d " << window << ": " << - stringsToString(terms) << "\n"); + stringsToString(terms) << "\n"); // The position lists we are going to work with. 
We extract them from the // (string->plist) map @@ -100,7 +136,7 @@ bool matchGroup(const HighlightData& hldata, map >::const_iterator pl = inplists.find(term); if (pl == inplists.end()) { LOGRP("TextSplitPTR::matchGroup: [" << term << - "] not found in plists\n"); + "] not found in plists\n"); return false; } plists.push_back(&(pl->second)); @@ -112,13 +148,16 @@ bool matchGroup(const HighlightData& hldata, LOGRP("TextSplitPTR::matchGroup: no actual groups found\n"); return false; } - // Sort the positions lists so that the shorter is first - std::sort(plists.begin(), plists.end(), - [](const vector *a, const vector *b) -> bool { - return a->size() < b->size(); - } - ); + if (!isphrase) { + // Sort the positions lists so that the shorter is first + std::sort(plists.begin(), plists.end(), + [](const vector *a, const vector *b) -> bool { + return a->size() < b->size(); + } + ); + } + if (0) { // Debug auto it = plistToTerm.find(plists[0]); if (it == plistToTerm.end()) { @@ -127,7 +166,7 @@ bool matchGroup(const HighlightData& hldata, return false; } LOGRP("matchGroup: walking the shortest plist. Term [" << - it->second << "], len " << plists[0]->size() << "\n"); + it->second << "], len " << plists[0]->size() << "\n"); } // Minpos is the highest end of a found match. While looking for @@ -139,12 +178,11 @@ bool matchGroup(const HighlightData& hldata, for (int pos : *(plists[0])) { int sta = INT_MAX, sto = 0; LOGDEB2("MatchGroup: Testing at pos " << pos << "\n"); - if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) { - LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta << - "," << sto << "]\n"); - // Maybe extend the window by 1st term position, this was not - // done by do_prox.. 
+ if (do_proximity_test( + window, plists, 1, pos, pos, &sta, &sto, minpos, isphrase)) { setWinMinMax(pos, sta, sto); + LOGRP("TextSplitPTR::matchGroup: MATCH termpos [" << sta << + "," << sto << "]\n"); minpos = sto + 1; // Translate the position window into a byte offset window auto i1 = gpostobytes.find(sta); @@ -153,7 +191,7 @@ bool matchGroup(const HighlightData& hldata, LOGDEB2("TextSplitPTR::matchGroup: pushing bpos " << i1->second.first << " " << i2->second.second << "\n"); tboffs.push_back(GroupMatchEntry(i1->second.first, - i2->second.second, grpidx)); + i2->second.second, grpidx)); } else { LOGDEB0("matchGroup: no bpos found for " << sta << " or " << sto << "\n"); @@ -169,24 +207,23 @@ bool matchGroup(const HighlightData& hldata, void HighlightData::toString(string& out) const { out.append("\nUser terms (orthograph): "); - for (std::set::const_iterator it = uterms.begin(); - it != uterms.end(); it++) { - out.append(" [").append(*it).append("]"); + for (const auto& term : uterms) { + out.append(" [").append(term).append("]"); } out.append("\nUser terms to Query terms:"); - for (map::const_iterator it = terms.begin(); - it != terms.end(); it++) { - out.append("[").append(it->first).append("]->["); - out.append(it->second).append("] "); + for (const auto& entry: terms) { + out.append("[").append(entry.first).append("]->["); + out.append(entry.second).append("] "); } out.append("\nGroups: "); char cbuf[200]; sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d", - int(groups.size()), int(grpsugidx.size()), int(ugroups.size())); + int(index_term_groups.size()), int(grpsugidx.size()), + int(ugroups.size())); out.append(cbuf); size_t ugidx = (size_t) - 1; - for (unsigned int i = 0; i < groups.size(); i++) { + for (unsigned int i = 0; i < index_term_groups.size(); i++) { if (ugidx != grpsugidx[i]) { ugidx = grpsugidx[i]; out.append("\n("); @@ -196,8 +233,8 @@ void HighlightData::toString(string& out) const out.append(") ->"); } out.append(" {"); - 
for (unsigned int j = 0; j < groups[i].size(); j++) { - out.append("[").append(groups[i][j]).append("]"); + for (unsigned int j = 0; j < index_term_groups[i].size(); j++) { + out.append("[").append(index_term_groups[i][j]).append("]"); } sprintf(cbuf, "%d", slacks[i]); out.append("}").append(cbuf); @@ -212,10 +249,12 @@ void HighlightData::append(const HighlightData& hl) size_t ugsz0 = ugroups.size(); ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end()); - groups.insert(groups.end(), hl.groups.begin(), hl.groups.end()); + index_term_groups.insert(index_term_groups.end(), + hl.index_term_groups.begin(), + hl.index_term_groups.end()); slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end()); for (std::vector::const_iterator it = hl.grpsugidx.begin(); - it != hl.grpsugidx.end(); it++) { + it != hl.grpsugidx.end(); it++) { grpsugidx.push_back(*it + ugsz0); } } diff --git a/src/utils/hldata.h b/src/utils/hldata.h index 4c93b673..36f01df3 100644 --- a/src/utils/hldata.h +++ b/src/utils/hldata.h @@ -40,8 +40,11 @@ struct HighlightData { * (unaccented and lowercased as needed depending on * configuration), and the list may include values * expanded from the original terms by stem or wildcard expansion. + * NEAR clauses are expanded to all possible combinations of the + * stem-expanded member terms. Ex: + * "clean floor"p -> (clean floor) (clean floors) (cleaning floor)... */ - std::vector > groups; + std::vector > index_term_groups; /** Group slacks. 
Parallel to groups */ std::vector slacks; @@ -53,11 +56,10 @@ struct HighlightData { */ std::vector grpsugidx; - void clear() - { + void clear() { uterms.clear(); ugroups.clear(); - groups.clear(); + index_term_groups.clear(); slacks.clear(); grpsugidx.clear(); } @@ -67,35 +69,7 @@ struct HighlightData { void toString(std::string& out) const; }; -inline void setWinMinMax(int pos, int& sta, int& sto) -{ - if (pos < sta) { - sta = pos; - } - if (pos > sto) { - sto = pos; - } -} - -// Check that at least an entry from the first position list is inside -// the window and recurse on next list. The window is readjusted as -// the successive terms are found. Mostly copied from Xapian code. -// -// @param window the search window width -// @param plists the position list vector -// @param i the position list to process (we then recurse with the next list) -// @param min the current minimum pos for a found term -// @param max the current maximum pos for a found term -// @param sp, ep output: the found area -// @param minpos bottom of search: this is the highest point of -// any previous match. We don't look below this as overlapping matches -// make no sense for highlighting. -extern bool do_proximity_test( - int window, std::vector*>& plists, - unsigned int i, int min, int max, int *sp, int *ep, int minpos); - - -/**** The following is used by plaintorich.cpp for finding zones to +/* The following is used by plaintorich.cpp for finding zones to highlight and by rclabsfromtext.cpp to choose fragments for the abstract */ @@ -112,17 +86,35 @@ struct GroupMatchEntry { // Find NEAR matches for one group of terms. // -// @param hldata Data about the user query -// @param grpidx Index in hldata.groups for the group we process -// @param inplists Position lists for the the group terms -// @param gpostobytes Translation of term position to start/end byte offsets -// @param[out] tboffs Found matches +// @param hldata User query expansion descriptor (see above). 
+// +// @param grpidx Index in hldata.index_term_groups for the group we +// process. This is used by us to get the terms and slacks, and +// set in the output GroupMatchEntry structures to allow the +// caller to link a match with a specific user input (e.g. for +// walking the match in the GUI preview) +// +// @param inplists Position lists for the group terms. This is the +// data used to look for matches. +// +// @param gpostobytes Translation of term position to start/end byte +// offsets. This is used to translate term positions to byte +// positions in the output, for ease of use by caller. +// +// @param[out] tboffs Found matches. Each match has a begin and end +// byte offset and an index linking to the origin data in the +// HighlightData structure. +// +// @param isphrase If true, the terms are matched as a phrase: the +// position lists are kept in term order instead of being sorted +// shortest first. Defaults to false. extern bool matchGroup( const HighlightData& hldata, unsigned int grpidx, const std::map>& inplists, const std::map>& gpostobytes, - std::vector& tboffs + std::vector& tboffs, + bool isphrase = false ); #endif /* _hldata_h_included_ */