Highlighting and snippets extraction: reworked to handle phrases properly. Use a compound position list instead of multiplying the OR groups inside a near clause
This commit is contained in:
parent
baa6062de1
commit
35ee3f7a13
@ -72,7 +72,7 @@ string PlainToRichQtPreview::PlainToRichQtPreview::header()
|
|||||||
string PlainToRichQtPreview::startMatch(unsigned int grpidx)
|
string PlainToRichQtPreview::startMatch(unsigned int grpidx)
|
||||||
{
|
{
|
||||||
LOGDEB2("startMatch, grpidx " << (grpidx) << "\n" );
|
LOGDEB2("startMatch, grpidx " << (grpidx) << "\n" );
|
||||||
grpidx = m_hdata->grpsugidx[grpidx];
|
grpidx = m_hdata->index_term_groups[grpidx].grpsugidx;
|
||||||
LOGDEB2("startMatch, ugrpidx " << (grpidx) << "\n" );
|
LOGDEB2("startMatch, ugrpidx " << (grpidx) << "\n" );
|
||||||
m_groupanchors[grpidx].push_back(++m_lastanchor);
|
m_groupanchors[grpidx].push_back(++m_lastanchor);
|
||||||
m_groupcuranchors[grpidx] = 0;
|
m_groupcuranchors[grpidx] = 0;
|
||||||
|
|||||||
@ -53,15 +53,17 @@ public:
|
|||||||
: m_wcount(0), m_hdata(hdata) {
|
: m_wcount(0), m_hdata(hdata) {
|
||||||
// We separate single terms and groups and extract the group
|
// We separate single terms and groups and extract the group
|
||||||
// terms for computing positions list before looking for group
|
// terms for computing positions list before looking for group
|
||||||
// matches
|
// matches. Single terms are stored with a reference to the
|
||||||
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
// entry they come with.
|
||||||
vit != hdata.groups.end(); vit++) {
|
for (unsigned int i = 0; i < hdata.index_term_groups.size(); i++) {
|
||||||
if (vit->size() == 1) {
|
const HighlightData::TermGroup& tg(hdata.index_term_groups[i]);
|
||||||
m_terms[vit->front()] = vit - hdata.groups.begin();
|
if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
|
||||||
} else if (vit->size() > 1) {
|
m_terms[tg.term] = i;
|
||||||
for (vector<string>::const_iterator it = vit->begin();
|
} else {
|
||||||
it != vit->end(); it++) {
|
for (const auto& group : tg.orgroups) {
|
||||||
m_gterms.insert(*it);
|
for (const auto& term : group) {
|
||||||
|
m_gterms.insert(term);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -129,13 +131,13 @@ private:
|
|||||||
// Look for matches to PHRASE and NEAR term groups and finalize the
|
// Look for matches to PHRASE and NEAR term groups and finalize the
|
||||||
// matched regions list (sort it by increasing start then decreasing
|
// matched regions list (sort it by increasing start then decreasing
|
||||||
// length)
|
// length)
|
||||||
// Actually, we handle all groups as NEAR (ignore order).
|
|
||||||
bool TextSplitPTR::matchGroups()
|
bool TextSplitPTR::matchGroups()
|
||||||
{
|
{
|
||||||
for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
|
for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) {
|
||||||
if (m_hdata.groups[i].size() <= 1)
|
if (m_hdata.index_term_groups[i].kind !=
|
||||||
continue;
|
HighlightData::TermGroup::TGK_TERM) {
|
||||||
matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
|
matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort regions by increasing start and decreasing width.
|
// Sort regions by increasing start and decreasing width.
|
||||||
|
|||||||
@ -119,10 +119,12 @@ public:
|
|||||||
|
|
||||||
// Take note of the group (phrase/near) terms because we need
|
// Take note of the group (phrase/near) terms because we need
|
||||||
// to compute the position lists for them.
|
// to compute the position lists for them.
|
||||||
for (const auto& group : hdata.groups) {
|
for (const auto& tg : hdata.index_term_groups) {
|
||||||
if (group.size() > 1) {
|
if (tg.kind != HighlightData::TermGroup::TGK_TERM) {
|
||||||
for (const auto& term: group) {
|
for (const auto& group : tg.orgroups) {
|
||||||
m_gterms.insert(term);
|
for (const auto& term: group) {
|
||||||
|
m_gterms.insert(term);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -134,7 +136,9 @@ public:
|
|||||||
LOGDEB2("takeword: " << term << endl);
|
LOGDEB2("takeword: " << term << endl);
|
||||||
// Limit time taken with monster documents. The resulting
|
// Limit time taken with monster documents. The resulting
|
||||||
// abstract will be incorrect or inexistent, but this is
|
// abstract will be incorrect or inexistent, but this is
|
||||||
// better than taking forever (the default cutoff is 10E6)
|
// better than taking forever (the default cutoff value comes
|
||||||
|
// from the snippetMaxPosWalk configuration parameter, and is
|
||||||
|
// 10E6)
|
||||||
if (maxtermcount && termcount++ > maxtermcount) {
|
if (maxtermcount && termcount++ > maxtermcount) {
|
||||||
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
|
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
|
||||||
maxtermcount << endl);
|
maxtermcount << endl);
|
||||||
@ -276,8 +280,9 @@ public:
|
|||||||
// Look for matches to PHRASE and NEAR term groups and finalize
|
// Look for matches to PHRASE and NEAR term groups and finalize
|
||||||
// the matched regions list (sort it by increasing start then
|
// the matched regions list (sort it by increasing start then
|
||||||
// decreasing length). We process all groups as NEAR (ignore order).
|
// decreasing length). We process all groups as NEAR (ignore order).
|
||||||
for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
|
for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) {
|
||||||
if (m_hdata.groups[i].size() > 1) {
|
if (m_hdata.index_term_groups[i].kind !=
|
||||||
|
HighlightData::TermGroup::TGK_TERM) {
|
||||||
matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
|
matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -163,7 +163,7 @@ double Query::Native::qualityTerms(Xapian::docid docid,
|
|||||||
// expanded from (by stemming)
|
// expanded from (by stemming)
|
||||||
map<string, vector<string> > byRoot;
|
map<string, vector<string> > byRoot;
|
||||||
for (const auto& term: terms) {
|
for (const auto& term: terms) {
|
||||||
map<string, string>::const_iterator eit = hld.terms.find(term);
|
const auto eit = hld.terms.find(term);
|
||||||
if (eit != hld.terms.end()) {
|
if (eit != hld.terms.end()) {
|
||||||
byRoot[eit->second].push_back(term);
|
byRoot[eit->second].push_back(term);
|
||||||
} else {
|
} else {
|
||||||
@ -174,9 +174,7 @@ double Query::Native::qualityTerms(Xapian::docid docid,
|
|||||||
|
|
||||||
#ifdef DEBUGABSTRACT
|
#ifdef DEBUGABSTRACT
|
||||||
{
|
{
|
||||||
string deb;
|
LOGABS("qualityTerms: hld: " << hld.toString() << "\n");
|
||||||
hld.toString(deb);
|
|
||||||
LOGABS("qualityTerms: hld: " << deb << "\n");
|
|
||||||
string byRootstr;
|
string byRootstr;
|
||||||
for (const auto& entry : byRoot) {
|
for (const auto& entry : byRoot) {
|
||||||
byRootstr.append("[").append(entry.first).append("]->");
|
byRootstr.append("[").append(entry.first).append("]->");
|
||||||
|
|||||||
@ -603,11 +603,11 @@ void SearchDataClauseSimple::processSimpleSpan(
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
// Set up the highlight data. No prefix should go in there
|
// Set up the highlight data. No prefix should go in there
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
for (const auto& term : exp) {
|
||||||
it != exp.end(); it++) {
|
HighlightData::TermGroup tg;
|
||||||
m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
|
tg.term = term.substr(prefix.size());
|
||||||
m_hldata.slacks.push_back(0);
|
tg.grpsugidx = m_hldata.ugroups.size() - 1;
|
||||||
m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
|
m_hldata.index_term_groups.push_back(tg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Push either term or OR of stem-expanded set
|
// Push either term or OR of stem-expanded set
|
||||||
@ -735,18 +735,16 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
|||||||
original_term_wqf_booster);
|
original_term_wqf_booster);
|
||||||
pqueries.push_back(xq);
|
pqueries.push_back(xq);
|
||||||
|
|
||||||
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
|
|
||||||
vector<vector<string> > allcombs;
|
|
||||||
vector<string> comb;
|
|
||||||
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
|
|
||||||
|
|
||||||
// Insert the search groups and slacks in the highlight data, with
|
// Insert the search groups and slacks in the highlight data, with
|
||||||
// a reference to the user entry that generated them:
|
// a reference to the user entry that generated them:
|
||||||
m_hldata.groups.insert(m_hldata.groups.end(),
|
HighlightData::TermGroup tg;
|
||||||
allcombs.begin(), allcombs.end());
|
tg.orgroups = groups;
|
||||||
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
|
tg.slack = slack;
|
||||||
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
|
tg.grpsugidx = m_hldata.ugroups.size() - 1;
|
||||||
m_hldata.ugroups.size() - 1);
|
tg.kind = (op == Xapian::Query::OP_PHRASE) ?
|
||||||
|
HighlightData::TermGroup::TGK_PHRASE :
|
||||||
|
HighlightData::TermGroup::TGK_NEAR;
|
||||||
|
m_hldata.index_term_groups.push_back(tg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Trim string beginning with ^ or ending with $ and convert to flags
|
// Trim string beginning with ^ or ending with $ and convert to flags
|
||||||
|
|||||||
@ -31,11 +31,69 @@ using std::pair;
|
|||||||
|
|
||||||
#undef DEBUGGROUPS
|
#undef DEBUGGROUPS
|
||||||
#ifdef DEBUGGROUPS
|
#ifdef DEBUGGROUPS
|
||||||
#define LOGRP LOGDEB
|
#define LOGRP LOGINF
|
||||||
#else
|
#else
|
||||||
#define LOGRP LOGDEB1
|
#define LOGRP LOGDEB1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Combined position list for or'd terms
|
||||||
|
struct OrPList {
|
||||||
|
void addplist(const string& term, const vector<int>* pl) {
|
||||||
|
terms.push_back(term);
|
||||||
|
plists.push_back(pl);
|
||||||
|
indexes.push_back(0);
|
||||||
|
totalsize += pl->size();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns -1 for eof, else the next smallest value in the
|
||||||
|
// combined lists, according to the current indexes.
|
||||||
|
int value() {
|
||||||
|
int minval = INT_MAX;
|
||||||
|
int minidx = -1;
|
||||||
|
for (unsigned ii = 0; ii < indexes.size(); ii++) {
|
||||||
|
const vector<int>& pl(*plists[ii]);
|
||||||
|
if (indexes[ii] >= pl.size())
|
||||||
|
continue; // this list done
|
||||||
|
if (pl[indexes[ii]] < minval) {
|
||||||
|
minval = pl[indexes[ii]];
|
||||||
|
minidx = ii;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (minidx != -1) {
|
||||||
|
LOGRP("OrPList::value() -> " << minval << " for " <<
|
||||||
|
terms[minidx] << "\n");
|
||||||
|
currentidx = minidx;
|
||||||
|
return minval;
|
||||||
|
} else {
|
||||||
|
LOGRP("OrPList::value(): EOL for " << stringsToString(terms)<<"\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int next() {
|
||||||
|
if (currentidx != -1) {
|
||||||
|
indexes[currentidx]++;
|
||||||
|
}
|
||||||
|
return value();
|
||||||
|
}
|
||||||
|
|
||||||
|
int size() const {
|
||||||
|
return totalsize;
|
||||||
|
}
|
||||||
|
void rewind() {
|
||||||
|
for (auto& idx : indexes) {
|
||||||
|
idx = 0;
|
||||||
|
}
|
||||||
|
currentidx = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<const vector<int>*> plists;
|
||||||
|
vector<unsigned int> indexes;
|
||||||
|
vector<string> terms;
|
||||||
|
int currentidx{-1};
|
||||||
|
int totalsize{0};
|
||||||
|
};
|
||||||
|
|
||||||
static inline void setWinMinMax(int pos, int& sta, int& sto)
|
static inline void setWinMinMax(int pos, int& sta, int& sto)
|
||||||
{
|
{
|
||||||
if (pos < sta) {
|
if (pos < sta) {
|
||||||
@ -65,42 +123,44 @@ static inline void setWinMinMax(int pos, int& sta, int& sto)
|
|||||||
* we only look for the next match beyond the current window top.
|
* we only look for the next match beyond the current window top.
|
||||||
*/
|
*/
|
||||||
static bool do_proximity_test(
|
static bool do_proximity_test(
|
||||||
const int window, vector<const vector<int>*>& plists,
|
const int window, vector<OrPList>& plists,
|
||||||
unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
|
unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
|
||||||
bool isphrase)
|
bool isphrase)
|
||||||
{
|
{
|
||||||
LOGINF("do_prox_test: win " << window << " plist_idx " << plist_idx <<
|
|
||||||
" min " << min << " max " << max << " minpos " << minpos <<
|
|
||||||
" isphrase " << isphrase << "\n");
|
|
||||||
|
|
||||||
// Overlap interdiction: possibly adjust window start by input minpos
|
// Overlap interdiction: possibly adjust window start by input minpos
|
||||||
int actualminpos = isphrase ? max + 1 : max + 1 - window;
|
int actualminpos = isphrase ? max + 1 : max + 1 - window;
|
||||||
if (actualminpos < minpos)
|
if (actualminpos < minpos)
|
||||||
actualminpos = minpos;
|
actualminpos = minpos;
|
||||||
|
LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx <<
|
||||||
|
" min " << min << " max " << max << " minpos " << minpos <<
|
||||||
|
" isphrase " << isphrase << " actualminpos " << actualminpos << "\n");
|
||||||
|
|
||||||
// Find 1st position bigger than window start
|
// Find 1st position bigger than window start. A previous call may
|
||||||
auto it = plists[plist_idx]->begin();
|
// have advanced the index, so we begin by retrieving the current
|
||||||
while (it != plists[plist_idx]->end() && *it < actualminpos)
|
// value.
|
||||||
it++;
|
int nextpos = plists[plist_idx].value();
|
||||||
|
while (nextpos != -1 && nextpos < actualminpos)
|
||||||
|
nextpos = plists[plist_idx].next();
|
||||||
|
|
||||||
// Look for position inside window. If not found, no match. If
|
// Look for position inside window. If not found, no match. If
|
||||||
// found: if this is the last list we're done, else recurse on
|
// found: if this is the last list we're done, else recurse on
|
||||||
// next list after adjusting the window
|
// next list after adjusting the window
|
||||||
while (it != plists[plist_idx]->end()) {
|
while (nextpos != -1) {
|
||||||
int pos = *it;
|
if (nextpos > min + window - 1) {
|
||||||
if (pos > min + window - 1)
|
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
if (plist_idx + 1 == plists.size()) {
|
if (plist_idx + 1 == plists.size()) {
|
||||||
// Done: set return values
|
// We already checked pos > min, now we also have pos <
|
||||||
setWinMinMax(pos, *sp, *ep);
|
// max, and we are the last list: done: set return values.
|
||||||
|
setWinMinMax(nextpos, *sp, *ep);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
setWinMinMax(pos, min, max);
|
setWinMinMax(nextpos, min, max);
|
||||||
if (do_proximity_test(window,plists, plist_idx + 1,
|
if (do_proximity_test(window, plists, plist_idx + 1,
|
||||||
min, max, sp, ep, minpos)) {
|
min, max, sp, ep, minpos, isphrase)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
it++;
|
nextpos = plists[plist_idx].next();
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -111,75 +171,75 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
unsigned int grpidx,
|
unsigned int grpidx,
|
||||||
const map<string, vector<int>>& inplists,
|
const map<string, vector<int>>& inplists,
|
||||||
const map<int, pair<int,int>>& gpostobytes,
|
const map<int, pair<int,int>>& gpostobytes,
|
||||||
vector<GroupMatchEntry>& tboffs,
|
vector<GroupMatchEntry>& tboffs)
|
||||||
bool isphrase
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
isphrase=true;
|
|
||||||
const vector<string>& terms = hldata.index_term_groups[grpidx];
|
|
||||||
int window = int(terms.size() + hldata.slacks[grpidx]);
|
|
||||||
|
|
||||||
LOGRP("TextSplitPTR::matchGroup:d " << window << ": " <<
|
const auto& tg(hldata.index_term_groups[grpidx]);
|
||||||
stringsToString(terms) << "\n");
|
bool isphrase = tg.kind == HighlightData::TermGroup::TGK_PHRASE;
|
||||||
|
string allplterms;
|
||||||
|
for (const auto& entry:inplists) {
|
||||||
|
allplterms += entry.first + " ";
|
||||||
|
}
|
||||||
|
LOGRP("matchGroup: isphrase " << isphrase <<
|
||||||
|
". Have plists for [" << allplterms << "]\n");
|
||||||
|
LOGRP("matchGroup: hldata: " << hldata.toString() << std::endl);
|
||||||
|
|
||||||
|
int window = int(tg.orgroups.size() + tg.slack);
|
||||||
// The position lists we are going to work with. We extract them from the
|
// The position lists we are going to work with. We extract them from the
|
||||||
// (string->plist) map
|
// (string->plist) map
|
||||||
vector<const vector<int>*> plists;
|
vector<OrPList> orplists;
|
||||||
// A revert plist->term map. This is so that we can find who is who after
|
|
||||||
// sorting the plists by length.
|
|
||||||
map<const vector<int>*, string> plistToTerm;
|
|
||||||
|
|
||||||
// Find the position list for each term in the group. It is
|
// Find the position list for each term in the group and build the
|
||||||
// possible that this particular group was not actually matched by
|
// combined lists for the term or groups (each group is the result
|
||||||
// the search, so that some terms are not found.
|
// of the expansion of one user term). It is possible that this
|
||||||
for (const auto& term : terms) {
|
// particular group was not actually matched by the search, so
|
||||||
map<string, vector<int> >::const_iterator pl = inplists.find(term);
|
// that some terms are not found, in which case we bail out.
|
||||||
if (pl == inplists.end()) {
|
for (const auto& group : tg.orgroups) {
|
||||||
LOGRP("TextSplitPTR::matchGroup: [" << term <<
|
orplists.push_back(OrPList());
|
||||||
"] not found in plists\n");
|
for (const auto& term : group) {
|
||||||
return false;
|
const auto pl = inplists.find(term);
|
||||||
|
if (pl == inplists.end()) {
|
||||||
|
LOGRP("TextSplitPTR::matchGroup: term [" << term <<
|
||||||
|
"] not found in plists\n");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
orplists.back().addplist(pl->first, &(pl->second));
|
||||||
|
}
|
||||||
|
if (orplists.back().plists.empty()) {
|
||||||
|
LOGINF("No positions list found for group " <<
|
||||||
|
stringsToString(group) << std::endl);
|
||||||
|
orplists.pop_back();
|
||||||
}
|
}
|
||||||
plists.push_back(&(pl->second));
|
|
||||||
plistToTerm[&(pl->second)] = term;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// I think this can't actually happen, was useful when we used to
|
// I think this can't actually happen, was useful when we used to
|
||||||
// prune the groups, but doesn't hurt.
|
// prune the groups, but doesn't hurt.
|
||||||
if (plists.size() < 2) {
|
if (orplists.size() < 2) {
|
||||||
LOGRP("TextSplitPTR::matchGroup: no actual groups found\n");
|
LOGINF("TextSplitPTR::matchGroup: no actual groups found\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isphrase) {
|
if (!isphrase) {
|
||||||
// Sort the positions lists so that the shorter is first
|
// Sort the positions lists so that the shorter is first
|
||||||
std::sort(plists.begin(), plists.end(),
|
std::sort(orplists.begin(), orplists.end(),
|
||||||
[](const vector<int> *a, const vector<int> *b) -> bool {
|
[](const OrPList& a, const OrPList& b) -> bool {
|
||||||
return a->size() < b->size();
|
return a.size() < b.size();
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (0) { // Debug
|
|
||||||
auto it = plistToTerm.find(plists[0]);
|
|
||||||
if (it == plistToTerm.end()) {
|
|
||||||
// SuperWeird
|
|
||||||
LOGERR("matchGroup: term for first list not found !?!\n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
LOGRP("matchGroup: walking the shortest plist. Term [" <<
|
|
||||||
it->second << "], len " << plists[0]->size() << "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Minpos is the highest end of a found match. While looking for
|
// Minpos is the highest end of a found match. While looking for
|
||||||
// further matches, we don't want the search to extend before
|
// further matches, we don't want the search to extend before
|
||||||
// this, because it does not make sense for highlight regions to
|
// this, because it does not make sense for highlight regions to
|
||||||
// overlap
|
// overlap
|
||||||
int minpos = 0;
|
int minpos = 0;
|
||||||
// Walk the shortest plist and look for matches
|
// Walk the shortest plist and look for matches
|
||||||
for (int pos : *(plists[0])) {
|
int pos;
|
||||||
|
while ((pos = orplists[0].next()) != -1) {
|
||||||
int sta = INT_MAX, sto = 0;
|
int sta = INT_MAX, sto = 0;
|
||||||
LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
|
LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
|
||||||
if (do_proximity_test(
|
if (do_proximity_test(
|
||||||
window, plists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
|
window, orplists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
|
||||||
setWinMinMax(pos, sta, sto);
|
setWinMinMax(pos, sta, sto);
|
||||||
LOGINF("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
|
LOGINF("TextSplitPTR::matchGroup: MATCH termpos [" << sta <<
|
||||||
"," << sto << "]\n");
|
"," << sto << "]\n");
|
||||||
@ -204,8 +264,9 @@ bool matchGroup(const HighlightData& hldata,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HighlightData::toString(string& out) const
|
string HighlightData::toString() const
|
||||||
{
|
{
|
||||||
|
string out;
|
||||||
out.append("\nUser terms (orthograph): ");
|
out.append("\nUser terms (orthograph): ");
|
||||||
for (const auto& term : uterms) {
|
for (const auto& term : uterms) {
|
||||||
out.append(" [").append(term).append("]");
|
out.append(" [").append(term).append("]");
|
||||||
@ -217,29 +278,37 @@ void HighlightData::toString(string& out) const
|
|||||||
}
|
}
|
||||||
out.append("\nGroups: ");
|
out.append("\nGroups: ");
|
||||||
char cbuf[200];
|
char cbuf[200];
|
||||||
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
|
sprintf(cbuf, "index_term_groups size %d ugroups size %d",
|
||||||
int(index_term_groups.size()), int(grpsugidx.size()),
|
int(index_term_groups.size()), int(ugroups.size()));
|
||||||
int(ugroups.size()));
|
|
||||||
out.append(cbuf);
|
out.append(cbuf);
|
||||||
|
|
||||||
size_t ugidx = (size_t) - 1;
|
size_t ugidx = (size_t) - 1;
|
||||||
for (unsigned int i = 0; i < index_term_groups.size(); i++) {
|
for (HighlightData::TermGroup tg : index_term_groups) {
|
||||||
if (ugidx != grpsugidx[i]) {
|
if (ugidx != tg.grpsugidx) {
|
||||||
ugidx = grpsugidx[i];
|
ugidx = tg.grpsugidx;
|
||||||
out.append("\n(");
|
out.append("\n(");
|
||||||
for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) {
|
for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) {
|
||||||
out.append("[").append(ugroups[ugidx][j]).append("] ");
|
out.append("[").append(ugroups[ugidx][j]).append("] ");
|
||||||
}
|
}
|
||||||
out.append(") ->");
|
out.append(") ->");
|
||||||
}
|
}
|
||||||
out.append(" {");
|
if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
|
||||||
for (unsigned int j = 0; j < index_term_groups[i].size(); j++) {
|
out.append(" <").append(tg.term).append(">");
|
||||||
out.append("[").append(index_term_groups[i][j]).append("]");
|
} else {
|
||||||
|
out.append(" {");
|
||||||
|
for (unsigned int j = 0; j < tg.orgroups.size(); j++) {
|
||||||
|
out.append(" {");
|
||||||
|
for (unsigned int k = 0; k < tg.orgroups[j].size(); k++) {
|
||||||
|
out.append("[").append(tg.orgroups[j][k]).append("]");
|
||||||
|
}
|
||||||
|
out.append("}");
|
||||||
|
}
|
||||||
|
sprintf(cbuf, "%d", tg.slack);
|
||||||
|
out.append("}").append(cbuf);
|
||||||
}
|
}
|
||||||
sprintf(cbuf, "%d", slacks[i]);
|
|
||||||
out.append("}").append(cbuf);
|
|
||||||
}
|
}
|
||||||
out.append("\n");
|
out.append("\n");
|
||||||
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HighlightData::append(const HighlightData& hl)
|
void HighlightData::append(const HighlightData& hl)
|
||||||
@ -249,12 +318,12 @@ void HighlightData::append(const HighlightData& hl)
|
|||||||
size_t ugsz0 = ugroups.size();
|
size_t ugsz0 = ugroups.size();
|
||||||
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
|
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
|
||||||
|
|
||||||
|
size_t itgsize = index_term_groups.size();
|
||||||
index_term_groups.insert(index_term_groups.end(),
|
index_term_groups.insert(index_term_groups.end(),
|
||||||
hl.index_term_groups.begin(),
|
hl.index_term_groups.begin(),
|
||||||
hl.index_term_groups.end());
|
hl.index_term_groups.end());
|
||||||
slacks.insert(slacks.end(), hl.slacks.begin(), hl.slacks.end());
|
// Adjust the grpsugidx values for the newly inserted entries
|
||||||
for (std::vector<size_t>::const_iterator it = hl.grpsugidx.begin();
|
for (unsigned int idx = itgsize; idx < index_term_groups.size(); idx++) {
|
||||||
it != hl.grpsugidx.end(); it++) {
|
index_term_groups[idx].grpsugidx += ugsz0;
|
||||||
grpsugidx.push_back(*it + ugsz0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,6 +5,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
/** Store data about user search terms and their expansions. This is used
|
/** Store data about user search terms and their expansions. This is used
|
||||||
* mostly for highlighting result text and walking the matches, generating
|
* mostly for highlighting result text and walking the matches, generating
|
||||||
@ -22,7 +23,7 @@ struct HighlightData {
|
|||||||
* This is used for aggregating term stats when generating snippets (for
|
* This is used for aggregating term stats when generating snippets (for
|
||||||
* choosing the best terms, allocating slots, etc. )
|
* choosing the best terms, allocating slots, etc. )
|
||||||
*/
|
*/
|
||||||
std::map<std::string, std::string> terms;
|
std::unordered_map<std::string, std::string> terms;
|
||||||
|
|
||||||
/** The original user terms-or-groups. This is for display
|
/** The original user terms-or-groups. This is for display
|
||||||
* purposes: ie when creating a menu to look for a specific
|
* purposes: ie when creating a menu to look for a specific
|
||||||
@ -33,40 +34,39 @@ struct HighlightData {
|
|||||||
std::vector<std::vector<std::string> > ugroups;
|
std::vector<std::vector<std::string> > ugroups;
|
||||||
|
|
||||||
/** Processed/expanded terms and groups. Used for looking for
|
/** Processed/expanded terms and groups. Used for looking for
|
||||||
* regions to highlight. A group can be a PHRASE or NEAR entry (we
|
* regions to highlight. A group can be a PHRASE or NEAR entry
|
||||||
* process everything as NEAR to keep things reasonably
|
* Terms are just groups with 1 entry. All
|
||||||
* simple. Terms are just groups with 1 entry. All
|
|
||||||
* terms are transformed to be compatible with index content
|
* terms are transformed to be compatible with index content
|
||||||
* (unaccented and lowercased as needed depending on
|
* (unaccented and lowercased as needed depending on
|
||||||
* configuration), and the list may include values
|
* configuration), and the list may include values
|
||||||
* expanded from the original terms by stem or wildcard expansion.
|
* expanded from the original terms by stem or wildcard expansion.
|
||||||
* NEAR clauses are expanded to all possible combinations of the
|
|
||||||
* stem-expanded member terms. Ex:
|
|
||||||
* "clean floor"p -> (clean floor) (clean floors) (cleaning floor)...
|
|
||||||
*/
|
*/
|
||||||
std::vector<std::vector<std::string> > index_term_groups;
|
struct TermGroup {
|
||||||
/** Group slacks. Parallel to groups */
|
// We'd use a union but no can do
|
||||||
std::vector<int> slacks;
|
std::string term;
|
||||||
|
std::vector<std::vector<std::string> > orgroups;
|
||||||
|
int slack{0};
|
||||||
|
|
||||||
/** Index into ugroups for each group. Parallel to groups. As a
|
/* Index into ugroups. As a user term or group may generate
|
||||||
* user term or group may generate many processed/expanded terms
|
* many processed/expanded terms or groups, this is how we
|
||||||
* or groups, this is how we relate an expansion to its source
|
* relate an expansion to its source (used, e.g. for
|
||||||
* (used, e.g. for generating anchors for walking search matches
|
* generating anchors for walking search matches in the
|
||||||
* in the preview window).
|
* preview window). */
|
||||||
*/
|
size_t grpsugidx{0};
|
||||||
std::vector<size_t> grpsugidx;
|
enum TGK {TGK_TERM, TGK_NEAR, TGK_PHRASE};
|
||||||
|
TGK kind{TGK_TERM};
|
||||||
|
};
|
||||||
|
std::vector<TermGroup> index_term_groups;
|
||||||
|
|
||||||
void clear() {
|
void clear() {
|
||||||
uterms.clear();
|
uterms.clear();
|
||||||
ugroups.clear();
|
ugroups.clear();
|
||||||
index_term_groups.clear();
|
index_term_groups.clear();
|
||||||
slacks.clear();
|
|
||||||
grpsugidx.clear();
|
|
||||||
}
|
}
|
||||||
void append(const HighlightData&);
|
void append(const HighlightData&);
|
||||||
|
|
||||||
// Print (debug)
|
// Print (debug)
|
||||||
void toString(std::string& out) const;
|
std::string toString() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* The following is used by plaintorich.cpp for finding zones to
|
/* The following is used by plaintorich.cpp for finding zones to
|
||||||
@ -109,8 +109,7 @@ extern bool matchGroup(
|
|||||||
unsigned int grpidx,
|
unsigned int grpidx,
|
||||||
const std::map<std::string, std::vector<int>>& inplists,
|
const std::map<std::string, std::vector<int>>& inplists,
|
||||||
const std::map<int, std::pair<int,int>>& gpostobytes,
|
const std::map<int, std::pair<int,int>>& gpostobytes,
|
||||||
std::vector<GroupMatchEntry>& tboffs,
|
std::vector<GroupMatchEntry>& tboffs
|
||||||
bool isphrase = false
|
|
||||||
);
|
);
|
||||||
|
|
||||||
#endif /* _hldata_h_included_ */
|
#endif /* _hldata_h_included_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user