diff --git a/src/qtgui/preview_plaintorich.cpp b/src/qtgui/preview_plaintorich.cpp index 608a854c..30885cf5 100644 --- a/src/qtgui/preview_plaintorich.cpp +++ b/src/qtgui/preview_plaintorich.cpp @@ -72,7 +72,7 @@ string PlainToRichQtPreview::PlainToRichQtPreview::header() string PlainToRichQtPreview::startMatch(unsigned int grpidx) { LOGDEB2("startMatch, grpidx " << (grpidx) << "\n" ); - grpidx = m_hdata->grpsugidx[grpidx]; + grpidx = m_hdata->index_term_groups[grpidx].grpsugidx; LOGDEB2("startMatch, ugrpidx " << (grpidx) << "\n" ); m_groupanchors[grpidx].push_back(++m_lastanchor); m_groupcuranchors[grpidx] = 0; diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp index 2090a07b..f559030b 100644 --- a/src/query/plaintorich.cpp +++ b/src/query/plaintorich.cpp @@ -53,15 +53,17 @@ public: : m_wcount(0), m_hdata(hdata) { // We separate single terms and groups and extract the group // terms for computing positions list before looking for group - // matches - for (vector >::const_iterator vit = hdata.groups.begin(); - vit != hdata.groups.end(); vit++) { - if (vit->size() == 1) { - m_terms[vit->front()] = vit - hdata.groups.begin(); - } else if (vit->size() > 1) { - for (vector::const_iterator it = vit->begin(); - it != vit->end(); it++) { - m_gterms.insert(*it); + // matches. Single terms are stored with a reference to the + // entry they come with. + for (unsigned int i = 0; i < hdata.index_term_groups.size(); i++) { + const HighlightData::TermGroup& tg(hdata.index_term_groups[i]); + if (tg.kind == HighlightData::TermGroup::TGK_TERM) { + m_terms[tg.term] = i; + } else { + for (const auto& group : tg.orgroups) { + for (const auto& term : group) { + m_gterms.insert(term); + } } } } @@ -129,13 +131,13 @@ private: // Look for matches to PHRASE and NEAR term groups and finalize the // matched regions list (sort it by increasing start then decreasing // length) -// Actually, we handle all groups as NEAR (ignore order). 
bool TextSplitPTR::matchGroups() { - for (unsigned int i = 0; i < m_hdata.groups.size(); i++) { - if (m_hdata.groups[i].size() <= 1) - continue; - matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs); + for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) { + if (m_hdata.index_term_groups[i].kind != + HighlightData::TermGroup::TGK_TERM) { + matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs); + } } // Sort regions by increasing start and decreasing width. diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp index d56ad5cc..36f0b030 100644 --- a/src/rcldb/rclabsfromtext.cpp +++ b/src/rcldb/rclabsfromtext.cpp @@ -119,10 +119,12 @@ public: // Take note of the group (phrase/near) terms because we need // to compute the position lists for them. - for (const auto& group : hdata.groups) { - if (group.size() > 1) { - for (const auto& term: group) { - m_gterms.insert(term); + for (const auto& tg : hdata.index_term_groups) { + if (tg.kind != HighlightData::TermGroup::TGK_TERM) { + for (const auto& group : tg.orgroups) { + for (const auto& term: group) { + m_gterms.insert(term); + } } } } @@ -134,7 +136,9 @@ public: LOGDEB2("takeword: " << term << endl); // Limit time taken with monster documents. The resulting // abstract will be incorrect or inexistant, but this is - // better than taking forever (the default cutoff is 10E6) + // better than taking forever (the default cutoff value comes + // from the snippetMaxPosWalk configuration parameter, and is + // 10E6) if (maxtermcount && termcount++ > maxtermcount) { LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<< maxtermcount << endl); @@ -276,8 +280,9 @@ public: // Look for matches to PHRASE and NEAR term groups and finalize // the matched regions list (sort it by increasing start then // decreasing length). We process all groups as NEAR (ignore order). 
- for (unsigned int i = 0; i < m_hdata.groups.size(); i++) { - if (m_hdata.groups[i].size() > 1) { + for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) { + if (m_hdata.index_term_groups[i].kind != + HighlightData::TermGroup::TGK_TERM) { matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs); } } diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index 6a651f7b..b6fb729a 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -163,7 +163,7 @@ double Query::Native::qualityTerms(Xapian::docid docid, // expanded from (by stemming) map > byRoot; for (const auto& term: terms) { - map::const_iterator eit = hld.terms.find(term); + const auto eit = hld.terms.find(term); if (eit != hld.terms.end()) { byRoot[eit->second].push_back(term); } else { @@ -174,9 +174,7 @@ double Query::Native::qualityTerms(Xapian::docid docid, #ifdef DEBUGABSTRACT { - string deb; - hld.toString(deb); - LOGABS("qualityTerms: hld: " << deb << "\n"); + LOGABS("qualityTerms: hld: " << hld.toString() << "\n"); string byRootstr; for (const auto& entry : byRoot) { byRootstr.append("[").append(entry.first).append("]->"); diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index a0c6334f..596036b5 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -603,11 +603,11 @@ void SearchDataClauseSimple::processSimpleSpan( return; // Set up the highlight data. 
No prefix should go in there - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - m_hldata.groups.push_back(vector(1, it->substr(prefix.size()))); - m_hldata.slacks.push_back(0); - m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1); + for (const auto& term : exp) { + HighlightData::TermGroup tg; + tg.term = term.substr(prefix.size()); + tg.grpsugidx = m_hldata.ugroups.size() - 1; + m_hldata.index_term_groups.push_back(tg); } // Push either term or OR of stem-expanded set @@ -735,18 +735,16 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, original_term_wqf_booster); pqueries.push_back(xq); - // Add all combinations of NEAR/PHRASE groups to the highlighting data. - vector > allcombs; - vector comb; - multiply_groups(groups.begin(), groups.end(), comb, allcombs); - // Insert the search groups and slacks in the highlight data, with // a reference to the user entry that generated them: - m_hldata.groups.insert(m_hldata.groups.end(), - allcombs.begin(), allcombs.end()); - m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack); - m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), - m_hldata.ugroups.size() - 1); + HighlightData::TermGroup tg; + tg.orgroups = groups; + tg.slack = slack; + tg.grpsugidx = m_hldata.ugroups.size() - 1; + tg.kind = (op == Xapian::Query::OP_PHRASE) ? 
+ HighlightData::TermGroup::TGK_PHRASE : + HighlightData::TermGroup::TGK_NEAR; + m_hldata.index_term_groups.push_back(tg); } // Trim string beginning with ^ or ending with $ and convert to flags diff --git a/src/utils/hldata.cpp b/src/utils/hldata.cpp index b0182940..da49b4de 100644 --- a/src/utils/hldata.cpp +++ b/src/utils/hldata.cpp @@ -31,11 +31,69 @@ using std::pair; #undef DEBUGGROUPS #ifdef DEBUGGROUPS -#define LOGRP LOGDEB +#define LOGRP LOGINF #else #define LOGRP LOGDEB1 #endif +// Combined position list for or'd terms +struct OrPList { + void addplist(const string& term, const vector* pl) { + terms.push_back(term); + plists.push_back(pl); + indexes.push_back(0); + totalsize += pl->size(); + } + + // Returns -1 for eof, else the next smallest value in the + // combined lists, according to the current indexes. + int value() { + int minval = INT_MAX; + int minidx = -1; + for (unsigned ii = 0; ii < indexes.size(); ii++) { + const vector& pl(*plists[ii]); + if (indexes[ii] >= pl.size()) + continue; // this list done + if (pl[indexes[ii]] < minval) { + minval = pl[indexes[ii]]; + minidx = ii; + } + } + if (minidx != -1) { + LOGRP("OrPList::value() -> " << minval << " for " << + terms[minidx] << "\n"); + currentidx = minidx; + return minval; + } else { + LOGRP("OrPList::value(): EOL for " << stringsToString(terms)<<"\n"); + return -1; + } + } + + int next() { + if (currentidx != -1) { + indexes[currentidx]++; + } + return value(); + } + + int size() const { + return totalsize; + } + void rewind() { + for (auto& idx : indexes) { + idx = 0; + } + currentidx = -1; + } + + vector*> plists; + vector indexes; + vector terms; + int currentidx{-1}; + int totalsize{0}; +}; + static inline void setWinMinMax(int pos, int& sta, int& sto) { if (pos < sta) { @@ -65,42 +123,44 @@ static inline void setWinMinMax(int pos, int& sta, int& sto) * we only look for the next match beyond the current window top. 
*/ static bool do_proximity_test( - const int window, vector*>& plists, + const int window, vector& plists, unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos, bool isphrase) { - LOGINF("do_prox_test: win " << window << " plist_idx " << plist_idx << - " min " << min << " max " << max << " minpos " << minpos << - " isphrase " << isphrase << "\n"); - // Overlap interdiction: possibly adjust window start by input minpos int actualminpos = isphrase ? max + 1 : max + 1 - window; if (actualminpos < minpos) actualminpos = minpos; + LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx << + " min " << min << " max " << max << " minpos " << minpos << + " isphrase " << isphrase << " actualminpos " << actualminpos << "\n"); - // Find 1st position bigger than window start - auto it = plists[plist_idx]->begin(); - while (it != plists[plist_idx]->end() && *it < actualminpos) - it++; + // Find 1st position bigger than window start. A previous call may + // have advanced the index, so we begin by retrieving the current + // value. + int nextpos = plists[plist_idx].value(); + while (nextpos != -1 && nextpos < actualminpos) + nextpos = plists[plist_idx].next(); // Look for position inside window. If not found, no match. If // found: if this is the last list we're done, else recurse on // next list after adjusting the window - while (it != plists[plist_idx]->end()) { - int pos = *it; - if (pos > min + window - 1) + while (nextpos != -1) { + if (nextpos > min + window - 1) { return false; + } if (plist_idx + 1 == plists.size()) { - // Done: set return values - setWinMinMax(pos, *sp, *ep); + // We already checked pos > min, now we also have pos < + // max, and we are the last list: done: set return values. 
+ setWinMinMax(nextpos, *sp, *ep); return true; } - setWinMinMax(pos, min, max); - if (do_proximity_test(window,plists, plist_idx + 1, - min, max, sp, ep, minpos)) { + setWinMinMax(nextpos, min, max); + if (do_proximity_test(window, plists, plist_idx + 1, + min, max, sp, ep, minpos, isphrase)) { return true; } - it++; + nextpos = plists[plist_idx].next(); } return false; } @@ -111,63 +171,62 @@ bool matchGroup(const HighlightData& hldata, unsigned int grpidx, const map>& inplists, const map>& gpostobytes, - vector& tboffs, - bool isphrase - ) + vector& tboffs) { - isphrase=true; - const vector& terms = hldata.index_term_groups[grpidx]; - int window = int(terms.size() + hldata.slacks[grpidx]); - - LOGRP("TextSplitPTR::matchGroup:d " << window << ": " << - stringsToString(terms) << "\n"); + const auto& tg(hldata.index_term_groups[grpidx]); + bool isphrase = tg.kind == HighlightData::TermGroup::TGK_PHRASE; + string allplterms; + for (const auto& entry:inplists) { + allplterms += entry.first + " "; + } + LOGRP("matchGroup: isphrase " << isphrase << + ". Have plists for [" << allplterms << "]\n"); + LOGRP("matchGroup: hldata: " << hldata.toString() << std::endl); + + int window = int(tg.orgroups.size() + tg.slack); // The position lists we are going to work with. We extract them from the // (string->plist) map - vector*> plists; - // A revert plist->term map. This is so that we can find who is who after - // sorting the plists by length. - map*, string> plistToTerm; + vector orplists; - // Find the position list for each term in the group. It is - // possible that this particular group was not actually matched by - // the search, so that some terms are not found. 
- for (const auto& term : terms) { - map >::const_iterator pl = inplists.find(term); - if (pl == inplists.end()) { - LOGRP("TextSplitPTR::matchGroup: [" << term << - "] not found in plists\n"); - return false; + // Find the position list for each term in the group and build the + // combined lists for the term or groups (each group is the result + // of the expansion of one user term). It is possible that this + // particular group was not actually matched by the search, so + // that some terms are not found, in which case the empty or-group + // is dropped (we give up below if fewer than two groups remain). + for (const auto& group : tg.orgroups) { + orplists.push_back(OrPList()); + for (const auto& term : group) { + const auto pl = inplists.find(term); + if (pl == inplists.end()) { + LOGRP("TextSplitPTR::matchGroup: term [" << term << + "] not found in plists\n"); + continue; + } + orplists.back().addplist(pl->first, &(pl->second)); + } + if (orplists.back().plists.empty()) { + LOGINF("No positions list found for group " << + stringsToString(group) << std::endl); + orplists.pop_back(); + } - plists.push_back(&(pl->second)); - plistToTerm[&(pl->second)] = term; } + // I think this can't actually happen, was useful when we used to // prune the groups, but doesn't hurt. - if (plists.size() < 2) { - LOGRP("TextSplitPTR::matchGroup: no actual groups found\n"); + if (orplists.size() < 2) { + LOGINF("TextSplitPTR::matchGroup: no actual groups found\n"); return false; } if (!isphrase) { // Sort the positions lists so that the shorter is first - std::sort(plists.begin(), plists.end(), - [](const vector *a, const vector *b) -> bool { - return a->size() < b->size(); + std::sort(orplists.begin(), orplists.end(), + [](const OrPList& a, const OrPList& b) -> bool { + return a.size() < b.size(); } ); } - - if (0) { // Debug - auto it = plistToTerm.find(plists[0]); - if (it == plistToTerm.end()) { - // SuperWeird - LOGERR("matchGroup: term for first list not found !?!\n"); - return false; - } - LOGRP("matchGroup: walking the shortest plist. 
Term [" << - it->second << "], len " << plists[0]->size() << "\n"); - } // Minpos is the highest end of a found match. While looking for // further matches, we don't want the search to extend before @@ -175,11 +234,12 @@ bool matchGroup(const HighlightData& hldata, // overlap int minpos = 0; // Walk the shortest plist and look for matches - for (int pos : *(plists[0])) { + int pos; + while ((pos = orplists[0].next()) != -1) { int sta = INT_MAX, sto = 0; LOGDEB2("MatchGroup: Testing at pos " << pos << "\n"); if (do_proximity_test( - window, plists, 1, pos, pos, &sta, &sto, minpos, isphrase)) { + window, orplists, 1, pos, pos, &sta, &sto, minpos, isphrase)) { setWinMinMax(pos, sta, sto); LOGINF("TextSplitPTR::matchGroup: MATCH termpos [" << sta << "," << sto << "]\n"); @@ -204,8 +264,9 @@ bool matchGroup(const HighlightData& hldata, return true; } -void HighlightData::toString(string& out) const +string HighlightData::toString() const { + string out; out.append("\nUser terms (orthograph): "); for (const auto& term : uterms) { out.append(" [").append(term).append("]"); @@ -217,29 +278,37 @@ void HighlightData::toString(string& out) const } out.append("\nGroups: "); char cbuf[200]; - sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d", - int(index_term_groups.size()), int(grpsugidx.size()), - int(ugroups.size())); + sprintf(cbuf, "index_term_groups size %d ugroups size %d", + int(index_term_groups.size()), int(ugroups.size())); out.append(cbuf); size_t ugidx = (size_t) - 1; - for (unsigned int i = 0; i < index_term_groups.size(); i++) { - if (ugidx != grpsugidx[i]) { - ugidx = grpsugidx[i]; + for (HighlightData::TermGroup tg : index_term_groups) { + if (ugidx != tg.grpsugidx) { + ugidx = tg.grpsugidx; out.append("\n("); for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) { out.append("[").append(ugroups[ugidx][j]).append("] "); } out.append(") ->"); } - out.append(" {"); - for (unsigned int j = 0; j < index_term_groups[i].size(); j++) { - 
out.append("[").append(index_term_groups[i][j]).append("]"); + if (tg.kind == HighlightData::TermGroup::TGK_TERM) { + out.append(" <").append(tg.term).append(">"); + } else { + out.append(" {"); + for (unsigned int j = 0; j < tg.orgroups.size(); j++) { + out.append(" {"); + for (unsigned int k = 0; k < tg.orgroups[j].size(); k++) { + out.append("[").append(tg.orgroups[j][k]).append("]"); + } + out.append("}"); + } + sprintf(cbuf, "%d", tg.slack); + out.append("}").append(cbuf); } - sprintf(cbuf, "%d", slacks[i]); - out.append("}").append(cbuf); } out.append("\n"); + return out; } void HighlightData::append(const HighlightData& hl) @@ -249,12 +318,12 @@ void HighlightData::append(const HighlightData& hl) size_t ugsz0 = ugroups.size(); ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end()); + size_t itgsize = index_term_groups.size(); index_term_groups.insert(index_term_groups.end(), hl.index_term_groups.begin(), hl.index_term_groups.end()); - slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end()); - for (std::vector::const_iterator it = hl.grpsugidx.begin(); - it != hl.grpsugidx.end(); it++) { - grpsugidx.push_back(*it + ugsz0); + // Adjust the grpsugidx values for the newly inserted entries + for (unsigned int idx = itgsize; idx < index_term_groups.size(); idx++) { + index_term_groups[idx].grpsugidx += ugsz0; } } diff --git a/src/utils/hldata.h b/src/utils/hldata.h index 36f01df3..e9900c95 100644 --- a/src/utils/hldata.h +++ b/src/utils/hldata.h @@ -5,6 +5,7 @@ #include #include #include +#include /** Store data about user search terms and their expansions. This is used * mostly for highlighting result text and walking the matches, generating @@ -22,7 +23,7 @@ struct HighlightData { * This is used for aggregating term stats when generating snippets (for * choosing the best terms, allocating slots, etc. ) */ - std::map terms; + std::unordered_map terms; /** The original user terms-or-groups. 
This is for display * purposes: ie when creating a menu to look for a specific @@ -33,40 +34,39 @@ struct HighlightData { std::vector > ugroups; /** Processed/expanded terms and groups. Used for looking for - * regions to highlight. A group can be a PHRASE or NEAR entry (we - * process everything as NEAR to keep things reasonably - * simple. Terms are just groups with 1 entry. All + * regions to highlight. A group can be a PHRASE or NEAR entry + * Terms are just groups with 1 entry. All * terms are transformed to be compatible with index content * (unaccented and lowercased as needed depending on * configuration), and the list may include values * expanded from the original terms by stem or wildcard expansion. - * NEAR clauses are expanded to all possible combinations of the - * stem-expanded member terms. Ex: - * "clean floor"p -> (clean floor) (clean floors) (cleaning floor)... */ - std::vector > index_term_groups; - /** Group slacks. Parallel to groups */ - std::vector slacks; + struct TermGroup { + // We'd use an union but no can do + std::string term; + std::vector > orgroups; + int slack{0}; - /** Index into ugroups for each group. Parallel to groups. As a - * user term or group may generate many processed/expanded terms - * or groups, this is how we relate an expansion to its source - * (used, e.g. for generating anchors for walking search matches - * in the preview window). - */ - std::vector grpsugidx; + /* Index into ugroups. As a user term or group may generate + * many processed/expanded terms or groups, this is how we + * relate an expansion to its source (used, e.g. for + * generating anchors for walking search matches in the + * preview window). 
*/ + size_t grpsugidx{0}; + enum TGK {TGK_TERM, TGK_NEAR, TGK_PHRASE}; + TGK kind{TGK_TERM}; + }; + std::vector index_term_groups; void clear() { uterms.clear(); ugroups.clear(); index_term_groups.clear(); - slacks.clear(); - grpsugidx.clear(); } void append(const HighlightData&); // Print (debug) - void toString(std::string& out) const; + std::string toString() const; }; /* The following is used by plaintorich.cpp for finding zones to @@ -109,8 +109,7 @@ extern bool matchGroup( unsigned int grpidx, const std::map>& inplists, const std::map>& gpostobytes, - std::vector& tboffs, - bool isphrase = false + std::vector& tboffs ); #endif /* _hldata_h_included_ */