Building abstract/snippets from the doc text: process phrase/group terms
commit 567401233a
parent 175ca9832f
@@ -36,43 +36,74 @@
 using namespace std;
 
 // We now let plaintorich do the highlight tags insertions, which is
 // wasteful because we have most of the information (but the perf hit
 // is small because it's only called on the output fragments, not on
 // the whole text). The highlight zone computation code has been left
 // around just in case I change my mind.
 #undef COMPUTE_HLZONES
 
 namespace Rcl {
 
-#warning NEAR and PHRASE
+// Chars we turn to spaces in the Snippets
+static const string cstr_nc("\n\r\x0c\\");
 
-// Text splitter for finding the match terms in the doc text.
+// Fragment descriptor. A fragment is a text area with one or several
+// matched terms and some context. It is ranked according to the
+// matched term weights, and near/phrase matches get a boost.
+struct MatchFragment {
+    // Start/End byte offsets of the fragment in the document text
+    int start;
+    int stop;
+    // Weight for this fragment (bigger is better)
+    double coef;
+#ifdef COMPUTE_HLZONES
+    // Highlight areas (each is one or several contiguous match
+    // terms). Because a fragment extends around a match, there
+    // can be several contiguous or separate matches in a given
+    // fragment.
+    vector<pair<int,int>> hlzones;
+#endif
+    // Position of the first matched term (for page number computations)
+    unsigned int hitpos;
+    // "Best" term for this match (e.g. for use as an external app search term)
+    string term;
+
+    MatchFragment(int sta, int sto, double c,
+#ifdef COMPUTE_HLZONES
+                  vector<pair<int,int>>& hl,
+#endif
+                  unsigned int pos, string& trm)
+        : start(sta), stop(sto), coef(c), hitpos(pos) {
+#ifdef COMPUTE_HLZONES
+        hlzones.swap(hl);
+#endif
+        term.swap(trm);
+    }
+};
+
+
+// Text splitter for finding the match areas in the document text.
 class TextSplitABS : public TextSplit {
 public:
 
-    struct MatchEntry {
-        // Start/End byte offsets of fragment in the document text
-        int start;
-        int stop;
-        double coef;
-        // Position of the first matched term.
-        unsigned int hitpos;
-        // "best term" for this match
-        string term;
-        // Highlight areas (each is one or several contiguous match terms).
-        vector<pair<int,int>> hlzones;
-
-        MatchEntry(int sta, int sto, double c, vector<pair<int,int>>& hl,
-                   unsigned int pos, string& trm)
-            : start(sta), stop(sto), coef(c), hitpos(pos) {
-            hlzones.swap(hl);
-            term.swap(trm);
-        }
-    };
-
-
     TextSplitABS(const vector<string>& matchTerms,
+                 const HighlightData& hdata,
                  unordered_map<string, double>& wordcoefs,
                  unsigned int ctxwords,
                  Flags flags = TXTS_NONE)
-        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
-          m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
-        LOGDEB("TextSplitABS: ctxwords " << ctxwords << endl);
+        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
+          m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
+
+        // Take note of the group (phrase/near) terms because we need
+        // to compute the position lists for them.
+        for (const auto& group : hdata.groups) {
+            if (group.size() > 1) {
+                for (const auto& term : group) {
+                    m_gterms.insert(term);
+                }
+            }
+        }
     }
 
     // Accept a word and its position. If the word is a matched term,
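The constructor's group-term collection is easy to check in isolation. Below is a minimal standalone sketch using a simplified stand-in for HighlightData (only a 'groups' member is modeled; the type and names are illustrative, not the real Recoll ones):

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

struct ToyHighlightData {
    // Each inner vector is one phrase/near group of query terms.
    std::vector<std::vector<std::string>> groups;
};

int main()
{
    ToyHighlightData hdata{{{"single"}, {"near", "phrase", "terms"}}};
    std::unordered_set<std::string> gterms;

    // Same logic as the constructor: only multi-term groups matter;
    // single terms are handled by the normal term list.
    for (const auto& group : hdata.groups) {
        if (group.size() > 1) {
            for (const auto& term : group) {
                gterms.insert(term);
            }
        }
    }
    for (const auto& t : gterms) {
        std::cout << t << "\n"; // prints near/phrase/terms, order unspecified
    }
    return 0;
}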
@@ -80,7 +111,7 @@ public:
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         LOGDEB2("takeword: " << term << endl);
 
-        // Recent past
+        // Remember recent past
         m_prevterms.push_back(pair<int,int>(bts,bte));
         if (m_prevterms.size() > m_ctxwords+1) {
             m_prevterms.pop_front();
@@ -103,28 +134,35 @@ public:
                     m_remainingWords << endl);
             double coef = m_wordcoefs[dumb];
             if (!m_remainingWords) {
-                // No current fragment
+                // No current fragment. Start one
                 m_curhitpos = baseTextPosition + pos;
                 m_curfrag.first = m_prevterms.front().first;
                 m_curfrag.second = m_prevterms.back().second;
+#ifdef COMPUTE_HLZONES
                 m_curhlzones.push_back(pair<int,int>(bts, bte));
+#endif
                 m_curterm = term;
                 m_curtermcoef = coef;
             } else {
                 LOGDEB2("Extending current fragment: " << m_remainingWords <<
                         " -> " << m_ctxwords << endl);
                 m_extcount++;
+#ifdef COMPUTE_HLZONES
                 if (m_prevwordhit) {
                     m_curhlzones.back().second = bte;
                 } else {
                     m_curhlzones.push_back(pair<int,int>(bts, bte));
                 }
+#endif
                 if (coef > m_curtermcoef) {
                     m_curterm = term;
                     m_curtermcoef = coef;
                 }
             }
+
+#ifdef COMPUTE_HLZONES
             m_prevwordhit = true;
+#endif
             m_curfragcoef += coef;
             m_remainingWords = m_ctxwords + 1;
             if (m_extcount > 3) {
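The fragment window mechanics above (start ctxwords back using the recent-terms deque, keep the fragment open for ctxwords more words after each hit) can be shown with a toy, self-contained version. The tokenizer and all names below are simplified stand-ins, not Recoll code, and the extension limit is left out:

#include <deque>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    const std::string text = "one two hit three four five six hit seven eight";
    const std::set<std::string> hits{"hit"};
    const unsigned int ctxwords = 2;

    std::deque<std::pair<int,int>> prevterms; // recent (start,end) offsets
    std::vector<std::pair<int,int>> fragments;
    std::pair<int,int> cur{0, 0};
    unsigned int remaining = 0; // words left before the fragment closes

    std::istringstream in(text);
    std::string word;
    int pos = 0;
    while (in >> word) {
        int bts = (int)text.find(word, pos), bte = bts + (int)word.size();
        pos = bte;
        prevterms.push_back({bts, bte});
        if (prevterms.size() > ctxwords + 1)
            prevterms.pop_front();
        if (hits.count(word)) {
            if (!remaining) // no open fragment: start one ctxwords back
                cur.first = prevterms.front().first;
            remaining = ctxwords + 1; // keep open for ctxwords more words
        }
        if (remaining) {
            cur.second = bte; // extend the fragment to the current word
            if (--remaining == 0)
                fragments.push_back(cur);
        }
    }
    if (remaining)
        fragments.push_back(cur); // text ended with a fragment open
    for (auto& f : fragments)
        std::cout << '[' << text.substr(f.first, f.second - f.first) << "]\n";
    // prints [one two hit three four] and [five six hit seven eight]
    return 0;
}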
@@ -134,10 +172,24 @@ public:
                 m_remainingWords = 1;
                 m_extcount = 0;
             }
         } else {
+            // If the term is part of a near/phrase group, update its
+            // positions list
+            if (m_gterms.find(dumb) != m_gterms.end()) {
+                // Term group (phrase/near) handling
+                m_plists[dumb].push_back(pos);
+                m_gpostobytes[pos] = pair<int,int>(bts, bte);
+                LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
+                        bte << "\n");
+            }
+        }
+#ifdef COMPUTE_HLZONES
+        else {
             // Not a matched term
             m_prevwordhit = false;
         }
+
+#endif
 
         if (m_remainingWords) {
             // Fragment currently open. Time to close ?
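These position lists are what makes the later group matching possible. matchGroup() itself is not part of this diff; the sketch below only illustrates the general NEAR idea under simple assumptions (a group matches if every term has a position inside a fixed-width window around one of the first term's positions):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// True if, for some position w of the first group term, every other term
// has at least one position within 'width' of w.
static bool nearMatch(const std::map<std::string, std::vector<int>>& plists,
                      const std::vector<std::string>& group, int width)
{
    if (group.empty() || plists.find(group[0]) == plists.end())
        return false;
    for (int w : plists.at(group[0])) {
        bool all = true;
        for (size_t i = 1; i < group.size(); i++) {
            auto it = plists.find(group[i]);
            if (it == plists.end())
                return false;
            const auto& pl = it->second;
            // Any position of this term inside the window?
            if (std::none_of(pl.begin(), pl.end(), [w, width](int p) {
                    return p >= w - width && p <= w + width;
                })) {
                all = false;
                break;
            }
        }
        if (all)
            return true;
    }
    return false;
}

int main()
{
    std::map<std::string, std::vector<int>> plists{
        {"quick", {2, 40}}, {"fox", {4, 90}}};
    std::cout << nearMatch(plists, {"quick", "fox"}, 5) << "\n"; // 1 (2 and 4)
    std::cout << nearMatch(plists, {"quick", "fox"}, 1) << "\n"; // 0
    return 0;
}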
@@ -146,10 +198,12 @@ public:
             if (m_remainingWords == 0) {
                 if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                     // Don't push bad fragments if we have a lot already
-                    m_fragments.push_back(MatchEntry(m_curfrag.first,
+                    m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                      m_curfrag.second,
                                                      m_curfragcoef,
+#ifdef COMPUTE_HLZONES
                                                      m_curhlzones,
+#endif
                                                      m_curhitpos,
                                                      m_curterm
                                                      ));
@@ -161,10 +215,67 @@ public:
             }
         }
         return true;
     }
-    const vector<MatchEntry>& getFragments() {
+
+    const vector<MatchFragment>& getFragments() {
         return m_fragments;
     }
+
+
+    // After the text is split: use the group terms' position lists to
+    // find the group matches. We process everything as NEAR (no
+    // PHRASE-specific processing).
+    void updgroups() {
+        vector<GroupMatchEntry> tboffs;
+
+        // Look for matches to PHRASE and NEAR term groups and finalize
+        // the matched regions list (sort it by increasing start, then
+        // decreasing length). We process all groups as NEAR (ignore order).
+        for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
+            if (m_hdata.groups[i].size() > 1) {
+                matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
+            }
+        }
+
+        // Sort the fragments by increasing start and decreasing width
+        std::sort(m_fragments.begin(), m_fragments.end(),
+                  [](const MatchFragment& a, const MatchFragment& b) -> bool {
+                      if (a.start != b.start)
+                          return a.start < b.start;
+                      return a.stop - a.start > b.stop - b.start;
+                  }
+            );
+
+        // Sort the group regions by increasing start and decreasing width.
+        std::sort(tboffs.begin(), tboffs.end(),
+                  [](const GroupMatchEntry& a, const GroupMatchEntry& b)
+                  -> bool {
+                      if (a.offs.first != b.offs.first)
+                          return a.offs.first < b.offs.first;
+                      return a.offs.second > b.offs.second;
+                  }
+            );
+
+        // Give a boost to fragments which contain a group match
+        // (phrase/near): they are dear to the user's heart. The lists are
+        // sorted, so we never go back in the fragment list (we can
+        // always start the search where we previously stopped).
+        auto fragit = m_fragments.begin();
+        for (const auto& grpmatch : tboffs) {
+            while (fragit->start > grpmatch.offs.first) {
+                fragit++;
+                if (fragit == m_fragments.end()) {
+                    return;
+                }
+            }
+            if (fragit->stop >= grpmatch.offs.second) {
+                // grp in frag
+                fragit->coef += 10.0;
+            }
+        }
+
+        return;
+    }
+
 private:
     // Past terms because we need to go back for context before a hit
     deque<pair<int,int>> m_prevterms;
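The boost pass relies on both interval lists being sorted by increasing start, so a single forward scan over the fragments suffices. Here is a self-contained sketch of that sorted-merge idea, with simplified stand-in types and the advance condition written as "skip fragments that end before the group region starts":

#include <algorithm>
#include <iostream>
#include <vector>

struct Frag { int start, stop; double coef; };
struct Grp  { int start, stop; };

static void boostContaining(std::vector<Frag>& frags, std::vector<Grp>& grps)
{
    auto bystart = [](auto& a, auto& b) { return a.start < b.start; };
    std::sort(frags.begin(), frags.end(), bystart);
    std::sort(grps.begin(), grps.end(), bystart);

    auto fragit = frags.begin();
    for (const auto& g : grps) {
        // Skip fragments that end before this group region starts; since
        // both lists are sorted we never need to move backwards.
        while (fragit != frags.end() && fragit->stop < g.start)
            ++fragit;
        if (fragit == frags.end())
            return;
        if (fragit->start <= g.start && fragit->stop >= g.stop)
            fragit->coef += 10.0; // group match inside fragment: boost
    }
}

int main()
{
    std::vector<Frag> frags{{0, 50, 1.0}, {100, 160, 1.0}};
    std::vector<Grp> grps{{110, 130}};
    boostContaining(frags, grps);
    for (auto& f : frags)
        std::cout << f.start << "-" << f.stop << " coef " << f.coef << "\n";
    // 0-50 keeps coef 1; 100-160 contains the group match and becomes 11.
    return 0;
}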
@@ -173,8 +284,10 @@ private:
     double m_curfragcoef{0.0};
     unsigned int m_remainingWords{0};
     unsigned int m_extcount{0};
+#ifdef COMPUTE_HLZONES
     vector<pair<int,int>> m_curhlzones;
     bool m_prevwordhit{false};
+#endif
     // Current sum of fragment weights
     double m_totalcoef{0.0};
     // Position of 1st term match (for page number computations)
@@ -182,14 +295,21 @@ private:
     // "best" term
     string m_curterm;
     double m_curtermcoef{0.0};
 
+    // Group terms, extracted from m_hdata
+    unordered_set<string> m_gterms;
+    // Group/near terms' word positions.
+    map<string, vector<int> > m_plists;
+    map<int, pair<int, int> > m_gpostobytes;
+
     // Input
-    set<string> m_terms;
+    unordered_set<string> m_terms;
+    const HighlightData& m_hdata;
     unordered_map<string, double>& m_wordcoefs;
     unsigned int m_ctxwords;
 
     // Result: begin and end byte positions of query terms/groups in text
-    vector<MatchEntry> m_fragments;
+    vector<MatchFragment> m_fragments;
 };
 
 int Query::Native::abstractFromText(
@@ -256,26 +376,45 @@ int Query::Native::abstractFromText(
             wordcoefs[word] = mment.first;
         }
     }
-    TextSplitABS splitter(matchTerms, wordcoefs, ctxwords,
+
+    // Note: getTerms() was already called by qualityTerms, so this is
+    // a bit wasteful. I guess that the performance impact is
+    // negligible, though. To be checked? We need the highlight data for
+    // the phrase/near groups.
+    HighlightData hld;
+    if (m_q->m_sd) {
+        m_q->m_sd->getTerms(hld);
+    }
+
+    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
                           TextSplit::TXTS_ONLYSPANS);
     splitter.text_to_words(rawtext);
-    const vector<TextSplitABS::MatchEntry>& res1 = splitter.getFragments();
-    vector<TextSplitABS::MatchEntry> result(res1.begin(), res1.end());
+    splitter.updgroups();
+
+    // Sort the fragments by decreasing weight
+    const vector<MatchFragment>& res1 = splitter.getFragments();
+    vector<MatchFragment> result(res1.begin(), res1.end());
     std::sort(result.begin(), result.end(),
-              [](const TextSplitABS::MatchEntry& a,
-                 const TextSplitABS::MatchEntry& b) -> bool {
+              [](const MatchFragment& a,
+                 const MatchFragment& b) -> bool {
                   return a.coef > b.coef;
               }
         );
 
-    static const string cstr_nc("\n\r\x0c\\");
-
     vector<int> vpbreaks;
     ndb->getPagePositions(docid, vpbreaks);
 
     // Build the output snippets array by merging the fragments, their
    // main term and the page positions.
     unsigned int count = 0;
     for (const auto& entry : result) {
         string frag = neutchars(
             rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);
 
-#if 0
+#ifdef COMPUTE_HLZONES
         // This would need to be modified to take tag parameters
         // instead of the const strings
         static const string starthit("<span style='color: blue;'>");
         static const string endhit("</span>");
         size_t inslen = 0;
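As an aside on hitpos: it is kept "for page number computations", and with a sorted list of page-start word positions (which is what getPagePositions() is assumed to provide here) the lookup is a single upper_bound. An illustrative sketch, not the actual Recoll helper:

#include <algorithm>
#include <iostream>
#include <vector>

// Return the 1-based page containing word position 'hitpos', given the
// word position at which each page starts (sorted, page 1 starts at 0).
static int pageOfHit(const std::vector<int>& vpbreaks, unsigned int hitpos)
{
    auto it = std::upper_bound(vpbreaks.begin(), vpbreaks.end(), (int)hitpos);
    return (int)(it - vpbreaks.begin()); // count of page starts <= hitpos
}

int main()
{
    std::vector<int> vpbreaks{0, 250, 500}; // three pages
    std::cout << pageOfHit(vpbreaks, 10) << "\n";  // 1
    std::cout << pageOfHit(vpbreaks, 250) << "\n"; // 2
    std::cout << pageOfHit(vpbreaks, 700) << "\n"; // 3
    return 0;
}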