highlighting: avoid trying to match groups in area already highlighted: this could prevent an effective match further in the text

This commit is contained in:
Jean-Francois Dockes 2012-08-14 08:33:52 +02:00
parent ed4a98c8b3
commit ebdd6faaf5

View File

@ -77,7 +77,9 @@ class TextSplitPTR : public TextSplit {
} }
} }
// Callback called by the text-to-words breaker for each word // Accept word and its position. If word is search term, add
// highlight zone definition. If word is part of search group
// (phrase or near), update positions list.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) { virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
string dumb; string dumb;
if (!unacmaybefold(term, dumb, "UTF-8", true)) { if (!unacmaybefold(term, dumb, "UTF-8", true)) {
@ -93,14 +95,18 @@ class TextSplitPTR : public TextSplit {
tboffs.push_back(pair<int, int>(bts, bte)); tboffs.push_back(pair<int, int>(bts, bte));
} }
// If word is part of a search group, update its positions list
if (m_gterms.find(dumb) != m_gterms.end()) { if (m_gterms.find(dumb) != m_gterms.end()) {
// Term group (phrase/near) handling // Term group (phrase/near) handling
m_plists[dumb].push_back(pos); m_plists[dumb].push_back(pos);
m_gpostobytes[pos] = pair<int,int>(bts, bte); m_gpostobytes[pos] = pair<int,int>(bts, bte);
//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte)); //LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
} }
// Check for cancellation request
if ((m_wcount++ & 0xfff) == 0) if ((m_wcount++ & 0xfff) == 0)
CancelCheck::instance().checkCancel(); CancelCheck::instance().checkCancel();
return true; return true;
} }
@ -140,21 +146,28 @@ class VecIntCmpShorter {
if ((POS) > (STO)) (STO) = (POS);} if ((POS) > (STO)) (STO) = (POS);}
// Recursively check that each term is inside the window (which is // Recursively check that each term is inside the window (which is
// readjusted as the successive terms are found). i is the index for // readjusted as the successive terms are found).
// the next position list to use (initially 1) // @param window the search window width
// @param plists the position list vector
// @param i the position list to process (we then recurse with the next list)
// @param min the current minimum pos for a found term
// @param max the current maximum pos for a found term
// @param sp, ep output: the found area
// @param minpos bottom of search: this is the highest point of
// any previous match. We don't look below this as overlapping matches
// make no sense for highlighting.
static bool do_proximity_test(int window, vector<vector<int>* >& plists, static bool do_proximity_test(int window, vector<vector<int>* >& plists,
unsigned int i, int min, int max, unsigned int i, int min, int max,
int *sp, int *ep) int *sp, int *ep, int minpos)
{ {
int tmp = max + 1; LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
// take care to avoid underflow window, i, min, max, minpos));
if (window <= tmp) int tmp = max + 1 - window;
tmp -= window; if (tmp < minpos)
else tmp = minpos;
tmp = 0;
vector<int>::iterator it = plists[i]->begin();
// Find 1st position bigger than window start // Find 1st position bigger than window start
vector<int>::iterator it = plists[i]->begin();
while (it != plists[i]->end() && *it < tmp) while (it != plists[i]->end() && *it < tmp)
it++; it++;
@ -167,12 +180,8 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
SETMINMAX(pos, *sp, *ep); SETMINMAX(pos, *sp, *ep);
return true; return true;
} }
if (pos < min) { SETMINMAX(pos, min, max);
min = pos; if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
} else if (pos > max) {
max = pos;
}
if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {
SETMINMAX(pos, *sp, *ep); SETMINMAX(pos, *sp, *ep);
return true; return true;
} }
@ -181,7 +190,7 @@ static bool do_proximity_test(int window, vector<vector<int>* >& plists,
return false; return false;
} }
// Check if there is a NEAR match for the group of terms // Find NEAR matches for the input group of terms, update highlight map
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window) bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
{ {
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window, LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
@ -232,18 +241,24 @@ bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
it->second.c_str(), plists[0]->size())); it->second.c_str(), plists[0]->size()));
} }
// Minpos is the highest end of a found match. While looking for
// further matches, we don't want the search to extend before
// this, because it does not make sense for highlight regions to
// overlap
int minpos = 0;
// Walk the shortest plist and look for matches // Walk the shortest plist and look for matches
for (vector<int>::iterator it = plists[0]->begin(); for (vector<int>::iterator it = plists[0]->begin();
it != plists[0]->end(); it++) { it != plists[0]->end(); it++) {
int pos = *it; int pos = *it;
int sta = int(10E9), sto = 0; int sta = int(10E9), sto = 0;
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos)); LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) { if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
sta, sto)); sta, sto));
// Maybe extend the window by 1st term position, this was not // Maybe extend the window by 1st term position, this was not
// done by do_prox.. // done by do_prox..
SETMINMAX(pos, sta, sto); SETMINMAX(pos, sta, sto);
minpos = sto+1;
// Translate the position window into a byte offset window // Translate the position window into a byte offset window
int bs = 0; int bs = 0;
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta); map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
@ -257,6 +272,8 @@ bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
} else { } else {
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto)); LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
} }
} else {
LOGDEB0(("matchGroup: no group match found at this position\n"));
} }
} }
@ -273,7 +290,8 @@ public:
} }
}; };
// Do the phrase match thing, then merge the highlight lists // Look for matches to PHRASE and NEAR term groups. Actually, we
// handle all groups as NEAR (ignore order).
bool TextSplitPTR::matchGroups() bool TextSplitPTR::matchGroups()
{ {
vector<vector<string> >::const_iterator vit = m_groups.begin(); vector<vector<string> >::const_iterator vit = m_groups.begin();
@ -282,8 +300,8 @@ bool TextSplitPTR::matchGroups()
matchGroup(*vit, *sit + (*vit).size()); matchGroup(*vit, *sit + (*vit).size());
} }
// Sort by start and end offsets. The merging of overlapping entries // Sort regions by increasing start and decreasing width.
// will be handled during output. // The output process will skip overlapping entries.
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst()); std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
return true; return true;
} }
@ -291,15 +309,12 @@ bool TextSplitPTR::matchGroups()
// Fix result text for display inside the gui text window. // Fix result text for display inside the gui text window.
// //
// To compute the term character positions in the output text, we used // We call overridden functions to output header data, beginnings and ends of
// to emulate how qt's textedit counts chars (ignoring tags and // matches etc.
// duplicate whitespace etc...). This was tricky business, dependant
// on qtextedit internals, and we don't do it any more, so we finally
// don't know the term par/car positions in the editor text.
// Instead, we now mark the search term positions with html anchors
// //
// We output the result in chunks, arranging not to cut in the middle of // If the input is text, we output the result in chunks, arranging not
// a tag, which would confuse qtextedit. // to cut in the middle of a tag, which would confuse qtextedit. If
// the input is html, the body is always a single output chunk.
bool PlainToRich::plaintorich(const string& in, bool PlainToRich::plaintorich(const string& in,
list<string>& out, // Output chunk list list<string>& out, // Output chunk list
const HiliteData& hdata, const HiliteData& hdata,
@ -311,18 +326,16 @@ bool PlainToRich::plaintorich(const string& in,
const vector<int>& slacks(hdata.gslks); const vector<int>& slacks(hdata.gslks);
if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) { if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
LOGDEB0(("plaintorich: terms: \n"));
string sterms = vecStringToString(terms); string sterms = vecStringToString(terms);
LOGDEB0((" %s\n", sterms.c_str())); LOGDEB0(("plaintorich: terms: %s\n", sterms.c_str()));
sterms = "\n"; sterms.clear();
LOGDEB0(("plaintorich: groups: \n"));
for (vector<vector<string> >::const_iterator vit = groups.begin(); for (vector<vector<string> >::const_iterator vit = groups.begin();
vit != groups.end(); vit++) { vit != groups.end(); vit++) {
sterms += "GROUP: "; sterms += "GROUP: ";
sterms += vecStringToString(*vit); sterms += vecStringToString(*vit);
sterms += "\n"; sterms += "\n";
} }
LOGDEB0((" %s", sterms.c_str())); LOGDEB0(("plaintorich: groups:\n %s", sterms.c_str()));
LOGDEB2((" TEXT:[%s]\n", in.c_str())); LOGDEB2((" TEXT:[%s]\n", in.c_str()));
} }
@ -394,7 +407,7 @@ bool PlainToRich::plaintorich(const string& in,
if (tPosIt != tPosEnd) { if (tPosIt != tPosEnd) {
int ibyteidx = chariter.getBpos(); int ibyteidx = chariter.getBpos();
if (ibyteidx == tPosIt->first) { if (ibyteidx == tPosIt->first) {
if (!intag && ibyteidx > (int)headend) { if (!intag && ibyteidx >= (int)headend) {
*olit += startAnchor(anchoridx); *olit += startAnchor(anchoridx);
*olit += startMatch(); *olit += startMatch();
} }