highlighting: avoid trying to match groups in area already highlighted: this could prevent an effective match further in the text

This commit is contained in:
Jean-Francois Dockes 2012-08-14 08:33:52 +02:00
parent ed4a98c8b3
commit ebdd6faaf5

View File

@ -77,7 +77,9 @@ class TextSplitPTR : public TextSplit {
}
}
// Callback called by the text-to-words breaker for each word
// Accept word and its position. If word is search term, add
// highlight zone definition. If word is part of search group
// (phrase or near), update positions list.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
string dumb;
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
@ -93,14 +95,18 @@ class TextSplitPTR : public TextSplit {
tboffs.push_back(pair<int, int>(bts, bte));
}
// If word is part of a search group, update its positions list
if (m_gterms.find(dumb) != m_gterms.end()) {
// Term group (phrase/near) handling
m_plists[dumb].push_back(pos);
m_gpostobytes[pos] = pair<int,int>(bts, bte);
//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
}
// Check for cancellation request
if ((m_wcount++ & 0xfff) == 0)
CancelCheck::instance().checkCancel();
return true;
}
@ -140,21 +146,28 @@ class VecIntCmpShorter {
if ((POS) > (STO)) (STO) = (POS);}
// Recursively check that each term is inside the window (which is
// readjusted as the successive terms are found). i is the index for
// the next position list to use (initially 1)
// readjusted as the successive terms are found).
// @param window the search window width
// @param plists the position list vector
// @param i the position list to process (we then recurse with the next list)
// @param min the current minimum pos for a found term
// @param max the current maximum pos for a found term
// @param sp, ep output: the found area
// @param minpos bottom of search: this is the highest point of
// any previous match. We don't look below this as overlapping matches
// make no sense for highlighting.
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
unsigned int i, int min, int max,
int *sp, int *ep)
int *sp, int *ep, int minpos)
{
int tmp = max + 1;
// take care to avoid underflow
if (window <= tmp)
tmp -= window;
else
tmp = 0;
vector<int>::iterator it = plists[i]->begin();
LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
window, i, min, max, minpos));
int tmp = max + 1 - window;
if (tmp < minpos)
tmp = minpos;
// Find 1st position bigger than window start
vector<int>::iterator it = plists[i]->begin();
while (it != plists[i]->end() && *it < tmp)
it++;
@ -167,12 +180,8 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
SETMINMAX(pos, *sp, *ep);
return true;
}
if (pos < min) {
min = pos;
} else if (pos > max) {
max = pos;
}
if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {
SETMINMAX(pos, min, max);
if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
SETMINMAX(pos, *sp, *ep);
return true;
}
@ -181,7 +190,7 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
return false;
}
// Check if there is a NEAR match for the group of terms
// Find NEAR matches for the input group of terms, update highlight map
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
{
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
@ -232,18 +241,24 @@ bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
it->second.c_str(), plists[0]->size()));
}
// Minpos is the highest end of a found match. While looking for
// further matches, we don't want the search to extend before
// this, because it does not make sense for highlight regions to
// overlap
int minpos = 0;
// Walk the shortest plist and look for matches
for (vector<int>::iterator it = plists[0]->begin();
it != plists[0]->end(); it++) {
int pos = *it;
int sta = int(10E9), sto = 0;
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
sta, sto));
// Maybe extend the window by 1st term position, this was not
// done by do_prox..
SETMINMAX(pos, sta, sto);
minpos = sto+1;
// Translate the position window into a byte offset window
int bs = 0;
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
@ -257,6 +272,8 @@ bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
} else {
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
}
} else {
LOGDEB0(("matchGroup: no group match found at this position\n"));
}
}
@ -273,7 +290,8 @@ public:
}
};
// Do the phrase match thing, then merge the highlight lists
// Look for matches to PHRASE and NEAR term groups. Actually, we
// handle all groups as NEAR (ignore order).
bool TextSplitPTR::matchGroups()
{
vector<vector<string> >::const_iterator vit = m_groups.begin();
@ -282,8 +300,8 @@ bool TextSplitPTR::matchGroups()
matchGroup(*vit, *sit + (*vit).size());
}
// Sort by start and end offsets. The merging of overlapping entries
// will be handled during output.
// Sort regions by increasing start and decreasing width.
// The output process will skip overlapping entries.
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
return true;
}
@ -291,15 +309,12 @@ bool TextSplitPTR::matchGroups()
// Fix result text for display inside the gui text window.
//
// To compute the term character positions in the output text, we used
// to emulate how qt's textedit counts chars (ignoring tags and
// duplicate whitespace etc...). This was tricky business, dependant
// on qtextedit internals, and we don't do it any more, so we finally
// don't know the term par/car positions in the editor text.
// Instead, we now mark the search term positions with html anchors
// We call overridden functions to output header data, beginnings and ends of
// matches etc.
//
// We output the result in chunks, arranging not to cut in the middle of
// a tag, which would confuse qtextedit.
// If the input is text, we output the result in chunks, arranging not
// to cut in the middle of a tag, which would confuse qtextedit. If
// the input is html, the body is always a single output chunk.
bool PlainToRich::plaintorich(const string& in,
list<string>& out, // Output chunk list
const HiliteData& hdata,
@ -311,18 +326,16 @@ bool PlainToRich::plaintorich(const string& in,
const vector<int>& slacks(hdata.gslks);
if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
LOGDEB0(("plaintorich: terms: \n"));
string sterms = vecStringToString(terms);
LOGDEB0((" %s\n", sterms.c_str()));
sterms = "\n";
LOGDEB0(("plaintorich: groups: \n"));
LOGDEB0(("plaintorich: terms: %s\n", sterms.c_str()));
sterms.clear();
for (vector<vector<string> >::const_iterator vit = groups.begin();
vit != groups.end(); vit++) {
sterms += "GROUP: ";
sterms += vecStringToString(*vit);
sterms += "\n";
}
LOGDEB0((" %s", sterms.c_str()));
LOGDEB0(("plaintorich: groups:\n %s", sterms.c_str()));
LOGDEB2((" TEXT:[%s]\n", in.c_str()));
}
@ -394,7 +407,7 @@ bool PlainToRich::plaintorich(const string& in,
if (tPosIt != tPosEnd) {
int ibyteidx = chariter.getBpos();
if (ibyteidx == tPosIt->first) {
if (!intag && ibyteidx > (int)headend) {
if (!intag && ibyteidx >= (int)headend) {
*olit += startAnchor(anchoridx);
*olit += startMatch();
}