// (source listing metadata: 447 lines, 13 KiB, C++)
#ifndef lint
|
|
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.19 2006-12-11 14:56:38 dockes Exp $ (C) 2005 J.F.Dockes";
|
|
#endif
|
|
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
|
|
#include <string>
|
|
#include <utility>
|
|
#include <list>
|
|
#include <set>
|
|
#include <vector>
|
|
#include <map>
|
|
#include <algorithm>
|
|
|
|
#ifndef NO_NAMESPACES
|
|
using std::vector;
|
|
using std::list;
|
|
using std::pair;
|
|
using std::set;
|
|
#endif /* NO_NAMESPACES */
|
|
|
|
#include "rcldb.h"
|
|
#include "rclconfig.h"
|
|
#include "debuglog.h"
|
|
#include "textsplit.h"
|
|
#include "utf8iter.h"
|
|
#include "transcode.h"
|
|
#include "smallut.h"
|
|
#include "plaintorich.h"
|
|
#include "cancelcheck.h"
|
|
|
|
// Debug helper: render a list of terms as a single string, each term
// enclosed in square brackets and followed by one space ("[a] [b] ").
static string vecStringToString(const vector<string>& t)
{
    string rendered;
    for (vector<string>::size_type idx = 0; idx < t.size(); ++idx) {
        rendered.append("[");
        rendered.append(t[idx]);
        rendered.append("] ");
    }
    return rendered;
}
|
|
|
|
// Text splitter callback used to take note of the position of query terms
|
|
// inside the result text. This is then used to insert highlight tags.
|
|
class myTextSplitCB : public TextSplitCB {
|
|
public:
|
|
|
|
// Out: first query term found in text
|
|
string firstTerm;
|
|
int firstTermOcc;
|
|
int m_firstTermPos;
|
|
int m_firstTermBPos;
|
|
|
|
// Out: begin and end byte positions of query terms/groups in text
|
|
vector<pair<int, int> > tboffs;
|
|
|
|
myTextSplitCB(const vector<string>& its, vector<vector<string> >&groups,
|
|
vector<int>& slacks)
|
|
: firstTermOcc(1), m_wcount(0), m_groups(groups), m_slacks(slacks)
|
|
{
|
|
for (vector<string>::const_iterator it = its.begin();
|
|
it != its.end(); it++) {
|
|
m_terms.insert(*it);
|
|
}
|
|
for (vector<vector<string> >::const_iterator vit = m_groups.begin();
|
|
vit != m_groups.end(); vit++) {
|
|
for (vector<string>::const_iterator it = (*vit).begin();
|
|
it != (*vit).end(); it++) {
|
|
m_gterms.insert(*it);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Callback called by the text-to-words breaker for each word
|
|
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
|
string dumb;
|
|
Rcl::dumb_string(term, dumb);
|
|
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
|
// pos, bts, bte));
|
|
|
|
// If this word is a search term, remember its byte-offset span.
|
|
if (m_terms.find(dumb) != m_terms.end()) {
|
|
tboffs.push_back(pair<int, int>(bts, bte));
|
|
if (firstTerm.empty()) {
|
|
firstTerm = term;
|
|
m_firstTermPos = pos;
|
|
m_firstTermBPos = bts;
|
|
}
|
|
}
|
|
|
|
if (m_gterms.find(dumb) != m_gterms.end()) {
|
|
// Term group (phrase/near) handling
|
|
m_plists[dumb].push_back(pos);
|
|
m_gpostobytes[pos] = pair<int,int>(bts, bte);
|
|
//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
|
|
}
|
|
if ((m_wcount++ & 0xfff) == 0)
|
|
CancelCheck::instance().checkCancel();
|
|
return true;
|
|
}
|
|
|
|
// Must be called after the split to find the phrase/near match positions
|
|
virtual bool matchGroups();
|
|
|
|
private:
|
|
virtual bool matchGroup(const vector<string>& terms, int dist);
|
|
|
|
int m_wcount;
|
|
|
|
// In: user query terms
|
|
set<string> m_terms;
|
|
|
|
// In: user query groups, for near/phrase searches.
|
|
const vector<vector<string> >& m_groups;
|
|
const vector<int>& m_slacks;
|
|
set<string> m_gterms;
|
|
|
|
// group/near terms word positions.
|
|
map<string, vector<int> > m_plists;
|
|
map<int, pair<int, int> > m_gpostobytes;
|
|
};
|
|
|
|
|
|
/** Comparison functor: orders position lists by increasing length */
class VecIntCmpShorter {
public:
    /** Return true if and only if a is strictly shorter than b.
     *  Both pointers are dereferenced and must be valid. The operator is
     *  const so that the functor can also be used through a const object.
     */
    bool operator()(const vector<int> *a, const vector<int> *b) const {
        return a->size() < b->size();
    }
};
|
|
|
|
// Widen the [STA, STO] interval so that it includes POS. STA and STO must be
// assignable expressions; every argument may be evaluated more than once.
// The do/while(0) wrapper makes the expansion a single statement, so that
// "if (c) SETMINMAX(p, a, b); else ..." compiles and behaves as expected.
#define SETMINMAX(POS, STA, STO) do {                   \
        if ((POS) < (STA)) (STA) = (POS);               \
        if ((POS) > (STO)) (STO) = (POS);               \
    } while (0)
|
|
|
|
// Recursively check that each term is inside the window (which is readjusted
// as the successive terms are found)
//
// window:   maximum width, in word positions, of the span holding all terms
// plists:   one position list per term. The code relies on each list being
//           in increasing order (it stops at the first position beyond the
//           window).
// i:        index of the list to examine, must be < plists.size(). The
//           lists before it are considered already matched.
// min, max: lowest and highest positions of the terms matched so far
// sp, ep:   in/out span: on success, widened to include the positions chosen
//           for lists i and following. Left untouched on failure.
// Returns true if a position could be chosen in list i and in all following
// lists so that everything fits inside the window.
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
                              unsigned int i, int min, int max, int *sp, int *ep)
{
    // Lowest acceptable position: anything smaller would be too far from
    // max. Take care to avoid underflow.
    int winstart = max + 1;
    if (window <= winstart)
        winstart -= window;
    else
        winstart = 0;

    // Find 1st position bigger than window start
    const vector<int>& positions = *plists[i];
    vector<int>::const_iterator it = positions.begin();
    while (it != positions.end() && *it < winstart)
        ++it;

    const bool lastlist = (i + 1 == plists.size());

    // Try each position inside window in turn for match with other lists
    for (; it != positions.end(); ++it) {
        const int pos = *it;
        // Too far from min: the following positions can only be worse
        if (pos > min + window - 1)
            return false;

        if (!lastlist) {
            // Bounds for this candidate only. They must not be carried over
            // to the next candidate: doing so would shrink the window after
            // a failed attempt and make us miss valid matches.
            const int nmin = pos < min ? pos : min;
            const int nmax = pos > max ? pos : max;
            if (!do_proximity_test(window, plists, i + 1, nmin, nmax, sp, ep))
                continue;
        }

        // This position and all the following lists fit: widen the span
        if (pos < *sp)
            *sp = pos;
        if (pos > *ep)
            *ep = pos;
        return true;
    }
    return false;
}
|
|
|
|
// Check if there is a NEAR match for the group of terms
|
|
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|
{
|
|
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
|
vecStringToString(terms).c_str()));
|
|
|
|
// The position lists we are going to work with. We extract them from the
|
|
// (string->plist) map
|
|
vector<vector<int>* > plists;
|
|
// A revert plist->term map. This is so that we can find who is who after
|
|
// sorting the plists by length.
|
|
map<vector<int>*, string> plistToTerm;
|
|
// For traces
|
|
vector<string> realgroup;
|
|
|
|
// Find the position list for each term in the group. Not all
|
|
// necessarily exist (esp for NEAR where terms have been
|
|
// stem-expanded: we don't know which matched)
|
|
for (vector<string>::const_iterator it = terms.begin();
|
|
it != terms.end(); it++) {
|
|
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
|
if (pl == m_plists.end()) {
|
|
LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
|
(*it).c_str()));
|
|
continue;
|
|
}
|
|
plists.push_back(&(pl->second));
|
|
plistToTerm[&(pl->second)] = *it;
|
|
realgroup.push_back(*it);
|
|
}
|
|
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
|
|
vecStringToString(realgroup).c_str()));
|
|
if (plists.size() < 2)
|
|
return false;
|
|
// Sort the positions lists so that the shorter is first
|
|
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
|
|
|
// Walk the shortest plist and look for matches
|
|
int sta = int(10E9), sto = 0;
|
|
int pos;
|
|
// Occurrences are from 1->N
|
|
firstTermOcc = 0;
|
|
vector<int>::iterator it = plists[0]->begin();
|
|
do {
|
|
if (it == plists[0]->end())
|
|
return false;
|
|
pos = *it++;
|
|
firstTermOcc++;
|
|
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
|
|
SETMINMAX(pos, sta, sto);
|
|
|
|
LOGDEB0(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto));
|
|
|
|
// Translate the position window into a byte offset window
|
|
int bs = 0;
|
|
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
|
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
|
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
|
LOGDEB1(("myTextSplitCB::matchGroup: pushing %d %d\n",
|
|
i1->second.first, i2->second.second));
|
|
tboffs.push_back(pair<int, int>(i1->second.first, i2->second.second));
|
|
bs = i1->second.first;
|
|
} else {
|
|
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
|
|
sta, sto));
|
|
}
|
|
|
|
if (firstTerm.empty() || m_firstTermPos > sta) {
|
|
// firsTerm is used to try an position the preview window over
|
|
// the match. As it's difficult to divine byte/word positions
|
|
// in qtextedit, we use a string search. Use the
|
|
// shortest plist for this, which hopefully gives a better
|
|
// chance for the group to be found (it's hopeless to try and
|
|
// match the whole group)
|
|
map<vector<int>*, string>::iterator it =
|
|
plistToTerm.find(plists.front());
|
|
if (it != plistToTerm.end())
|
|
firstTerm = it->second;
|
|
LOGDEB0(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
|
|
firstTerm.c_str(), firstTermOcc));
|
|
m_firstTermPos = sta;
|
|
m_firstTermBPos = bs;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/** Sort integer pairs by increasing first value and decreasing width */
class PairIntCmpFirst {
public:
    /** Return true if a must be sorted before b: smaller first value, or
     *  same first value and bigger second value (the wider of two regions
     *  with the same start comes first).
     *  Const and taking references, so that the functor can be used through
     *  a const object and does not copy its arguments.
     */
    bool operator()(const pair<int, int>& a, const pair<int, int>& b) const {
        if (a.first != b.first)
            return a.first < b.first;
        return a.second > b.second;
    }
};
|
|
|
|
// Do the phrase match thing, then merge the highlight lists
|
|
bool myTextSplitCB::matchGroups()
|
|
{
|
|
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
|
vector<int>::const_iterator sit = m_slacks.begin();
|
|
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
|
|
matchGroup(*vit, *sit + (*vit).size());
|
|
}
|
|
|
|
// Sort by start and end offsets. The merging of overlapping entries
|
|
// will be handled during output.
|
|
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
|
|
return true;
|
|
}
|
|
|
|
// Name of the HTML anchor which plaintorich() inserts at the byte offset of
// the first matched term when it is called with fft set.
const char *firstTermAnchorName = "FIRSTTERM";
|
|
|
|
#ifdef QT_SCROLL_TO_ANCHOR_BUG
|
|
// Marker text which plaintorich() emits, in white font, inside the first
// term anchor when QT_SCROLL_TO_ANCHOR_BUG is defined. NOTE(review):
// presumably an unlikely byte sequence that the caller can search for
// instead of scrolling to the anchor -- confirm against the caller.
const char *firstTermBeacon = "\xe2\xa0\x91\xe2\x96\x9f\x20\x01\x9a";
|
|
#endif
|
|
|
|
// Fix result text for display inside the gui text window.
|
|
//
|
|
// To compute the term character positions in the output text, we used
|
|
// to emulate how qt's textedit counts chars (ignoring tags and
|
|
// duplicate whitespace etc...). This was tricky business, dependant
|
|
// on qtextedit internals, and we don't do it any more, so we finally
|
|
// don't know the term par/car positions in the editor text. Instead,
|
|
// we return the first term encountered, and the caller will use the
|
|
// editor's find() function to position on it
|
|
bool plaintorich(const string& in, string& out,
|
|
RefCntr<Rcl::SearchData> sdata,
|
|
bool noHeader, bool fft)
|
|
{
|
|
Chrono chron;
|
|
out.erase();
|
|
vector<string> terms;
|
|
vector<vector<string> > groups;
|
|
vector<int> slacks;
|
|
|
|
sdata->getTerms(terms, groups, slacks);
|
|
|
|
if (DebugLog::getdbl()->getlevel() >= DEBDEB0) {
|
|
LOGDEB0(("plaintorich: terms: \n"));
|
|
string sterms = vecStringToString(terms);
|
|
LOGDEB0((" %s\n", sterms.c_str()));
|
|
sterms = "\n";
|
|
LOGDEB0(("plaintorich: groups: \n"));
|
|
for (vector<vector<string> >::iterator vit = groups.begin();
|
|
vit != groups.end(); vit++) {
|
|
sterms += vecStringToString(*vit);
|
|
sterms += "\n";
|
|
}
|
|
LOGDEB0((" %s", sterms.c_str()));
|
|
}
|
|
|
|
// We first use the text splitter to break the text into words,
|
|
// and compare the words to the search terms, which yields the
|
|
// query terms positions inside the text
|
|
myTextSplitCB cb(terms, groups, slacks);
|
|
TextSplit splitter(&cb);
|
|
// Note that splitter returns the term locations in byte, not
|
|
// character offset
|
|
splitter.text_to_words(in);
|
|
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
|
|
|
|
cb.matchGroups();
|
|
|
|
// Rich text output
|
|
if (noHeader)
|
|
out = "";
|
|
else
|
|
out = "<qt><head><title></title></head><body><p>";
|
|
|
|
// Iterator for the list of input term positions. We use it to
|
|
// output highlight tags and to compute term positions in the
|
|
// output text
|
|
vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
|
|
vector<pair<int, int> >::iterator tboffsend = cb.tboffs.end();
|
|
|
|
#if 0
|
|
for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
|
|
it != cb.tboffs.end(); it++) {
|
|
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
|
}
|
|
#endif
|
|
|
|
// Input character iterator
|
|
Utf8Iter chariter(in);
|
|
// State variable used to limitate the number of consecutive empty lines
|
|
int ateol = 0;
|
|
// State variable to update the char pos only for the first of
|
|
// consecutive blank chars
|
|
int atblank = 0;
|
|
for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
|
|
if ((pos & 0xfff) == 0) {
|
|
CancelCheck::instance().checkCancel();
|
|
}
|
|
// If we still have terms positions, check (byte) position
|
|
if (tPosIt != tboffsend) {
|
|
int ibyteidx = chariter.getBpos();
|
|
|
|
if (fft && ibyteidx == cb.m_firstTermBPos) {
|
|
out += string("<a name=\"") + firstTermAnchorName + "\"> "
|
|
#ifdef QT_SCROLL_TO_ANCHOR_BUG
|
|
+ "<font color=\"white\"> " + firstTermBeacon + " </font> "
|
|
#endif
|
|
+ "</a>";
|
|
}
|
|
|
|
if (ibyteidx == tPosIt->first) {
|
|
out += "<termtag>";
|
|
} else if (ibyteidx == tPosIt->second) {
|
|
// Output end tag, then skip all highlight areas that
|
|
// would overlap this one
|
|
out += "</termtag>";
|
|
int crend = tPosIt->second;
|
|
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
|
|
tPosIt++;
|
|
}
|
|
}
|
|
switch(*chariter) {
|
|
case '\n':
|
|
if (ateol < 2) {
|
|
out += "<br>\n";
|
|
ateol++;
|
|
}
|
|
break;
|
|
case '\r':
|
|
break;
|
|
case '<':
|
|
ateol = 0;
|
|
out += "<";
|
|
break;
|
|
case '&':
|
|
ateol = 0;
|
|
out += "&";
|
|
break;
|
|
default:
|
|
// We don't change the eol status for whitespace, want a real line
|
|
if (*chariter == ' ' || *chariter == '\t') {
|
|
atblank = 1;
|
|
} else {
|
|
ateol = 0;
|
|
atblank = 0;
|
|
}
|
|
chariter.appendchartostring(out);
|
|
}
|
|
}
|
|
#if 0
|
|
{
|
|
FILE *fp = fopen("/tmp/debugplaintorich", "a");
|
|
fprintf(fp, "%s\n", out.c_str());
|
|
fclose(fp);
|
|
}
|
|
#endif
|
|
LOGDEB0(("plaintorich: done %d mS\n", chron.millis()));
|
|
return true;
|
|
}
|