cleaned up plaintorich. Now a proper subclassable class + highlights multiple groups, not just the first

This commit is contained in:
dockes 2008-07-01 08:27:58 +00:00
parent c27cf93d8c
commit 5856df2230
5 changed files with 172 additions and 142 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.30 2007-11-15 18:05:32 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.31 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -43,6 +43,8 @@ using std::set;
#include "plaintorich.h"
#include "cancelcheck.h"
const string PlainToRich::snull = "";
// For debug printing
static string vecStringToString(const vector<string>& t)
{
@ -58,19 +60,13 @@ static string vecStringToString(const vector<string>& t)
class myTextSplitCB : public TextSplitCB {
public:
// Out: first query term found in text
string firstTerm;
int firstTermOcc;
int m_firstTermPos;
int m_firstTermBPos;
// Out: begin and end byte positions of query terms/groups in text
vector<pair<int, int> > tboffs;
myTextSplitCB(const vector<string>& its,
const vector<vector<string> >&groups,
const vector<int>& slacks)
: firstTermOcc(1), m_wcount(0), m_groups(groups), m_slacks(slacks)
: m_wcount(0), m_groups(groups), m_slacks(slacks)
{
for (vector<string>::const_iterator it = its.begin();
it != its.end(); it++) {
@ -95,11 +91,6 @@ class myTextSplitCB : public TextSplitCB {
// If this word is a search term, remember its byte-offset span.
if (m_terms.find(dumb) != m_terms.end()) {
tboffs.push_back(pair<int, int>(bts, bte));
if (firstTerm.empty()) {
firstTerm = term;
m_firstTermPos = pos;
m_firstTermBPos = bts;
}
}
if (m_gterms.find(dumb) != m_gterms.end()) {
@ -148,10 +139,12 @@ class VecIntCmpShorter {
#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \
if ((POS) > (STO)) (STO) = (POS);}
// Recursively check that each term is inside the window (which is readjusted
// as the successive terms are found)
// Recursively check that each term is inside the window (which is
// readjusted as the successive terms are found). i is the index for
// the next position list to use (initially 1)
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
unsigned int i, int min, int max, int *sp, int *ep)
unsigned int i, int min, int max,
int *sp, int *ep)
{
int tmp = max + 1;
// take care to avoid underflow
@ -210,7 +203,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
it != terms.end(); it++) {
map<string, vector<int> >::iterator pl = m_plists.find(*it);
if (pl == m_plists.end()) {
LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
(*it).c_str()));
continue;
}
@ -218,58 +211,53 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
plistToTerm[&(pl->second)] = *it;
realgroup.push_back(*it);
}
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
vecStringToString(realgroup).c_str()));
if (plists.size() < 2)
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n",
window, vecStringToString(realgroup).c_str()));
if (plists.size() < 2) {
LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
return false;
}
// Sort the positions lists so that the shorter is first
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
// Walk the shortest plist and look for matches
int sta = int(10E9), sto = 0;
int pos;
// Occurrences are from 1->N
firstTermOcc = 0;
vector<int>::iterator it = plists[0]->begin();
do {
if (it == plists[0]->end())
{ // Debug
map<vector<int>*, string>::iterator it;
it = plistToTerm.find(plists[0]);
if (it == plistToTerm.end()) {
// SuperWeird
LOGERR(("matchGroup: term for first list not found !?!\n"));
return false;
pos = *it++;
firstTermOcc++;
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
SETMINMAX(pos, sta, sto);
LOGDEB0(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto));
// Translate the position window into a byte offset window
int bs = 0;
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
LOGDEB1(("myTextSplitCB::matchGroup: pushing %d %d\n",
i1->second.first, i2->second.second));
tboffs.push_back(pair<int, int>(i1->second.first, i2->second.second));
bs = i1->second.first;
} else {
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
sta, sto));
}
LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
it->second.c_str(), plists[0]->size()));
}
if (firstTerm.empty() || m_firstTermPos > sta) {
// firsTerm is used to try an position the preview window over
// the match. As it's difficult to divine byte/word positions
// in qtextedit, we use a string search. Use the
// shortest plist for this, which hopefully gives a better
// chance for the group to be found (it's hopeless to try and
// match the whole group)
map<vector<int>*, string>::iterator it =
plistToTerm.find(plists.front());
if (it != plistToTerm.end())
firstTerm = it->second;
LOGDEB0(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
firstTerm.c_str(), firstTermOcc));
m_firstTermPos = sta;
m_firstTermBPos = bs;
// Walk the shortest plist and look for matches
for (vector<int>::iterator it = plists[0]->begin();
it != plists[0]->end(); it++) {
int pos = *it;
int sta = int(10E9), sto = 0;
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n",
sta, sto));
// Maybe extend the window by 1st term position, this was not
// done by do_prox..
SETMINMAX(pos, sta, sto);
// Translate the position window into a byte offset window
int bs = 0;
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
i1->second.first, i2->second.second));
tboffs.push_back(pair<int, int>(i1->second.first,
i2->second.second));
bs = i1->second.first;
} else {
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
}
}
}
return true;
@ -300,20 +288,6 @@ bool myTextSplitCB::matchGroups()
return true;
}
// Setting searchable beacons in the text to walk the term list.
// Common prefix of all term anchor names. Declared as an array rather
// than a pointer so that sizeof() yields the prefix length (terminating
// NUL included) instead of the size of a pointer.
static const char termAnchorNameBase[] = "TRM";

// Build the name of the anchor for term match number i: the prefix
// followed by the decimal value of i (ie: "TRM12").
string termAnchorName(int i)
{
    // Prefix + NUL, plus 20 bytes which is more than enough for the
    // decimal representation of any int, sign included.
    char acname[sizeof(termAnchorNameBase) + 20];
    // Bounded formatting: can never write past the end of acname
    snprintf(acname, sizeof(acname), "%s%d", termAnchorNameBase, i);
    return string(acname);
}
// Produce the opening html anchor tag used as a beacon for term match
// number i. The anchor name comes from termAnchorName().
static string termBeacon(int i)
{
    string beacon("<a name=\"");
    beacon += termAnchorName(i);
    beacon += "\">";
    return beacon;
}
// Fix result text for display inside the gui text window.
//
@ -325,9 +299,9 @@ static string termBeacon(int i)
// Instead, we mark the search term positions either with html anchor
// (qt currently has problems with them), or a special string, and the
// caller will use the editor's find() function to position on it
bool plaintorich(const string& in, list<string>& out,
const HiliteData& hdata,
bool noHeader, int *lastAnchor, int chunksize)
bool PlainToRich::plaintorich(const string& in, list<string>& out,
const HiliteData& hdata,
int chunksize)
{
Chrono chron;
const vector<string>& terms(hdata.terms);
@ -342,6 +316,7 @@ bool plaintorich(const string& in, list<string>& out,
LOGDEB0(("plaintorich: groups: \n"));
for (vector<vector<string> >::const_iterator vit = groups.begin();
vit != groups.end(); vit++) {
sterms += "GROUP: ";
sterms += vecStringToString(*vit);
sterms += "\n";
}
@ -362,13 +337,10 @@ bool plaintorich(const string& in, list<string>& out,
out.clear();
out.push_back("");
list<string>::iterator sit = out.begin();
list<string>::iterator olit = out.begin();
// Rich text output
if (noHeader)
*sit = "";
else
*sit = "<qt><head><title></title></head><body><p>";
*olit = header();
// Iterator for the list of input term positions. We use it to
// output highlight tags and to compute term positions in the
@ -388,10 +360,11 @@ bool plaintorich(const string& in, list<string>& out,
// State variable used to limit the number of consecutive empty lines
int ateol = 0;
// Stuff for numbered anchors at each term match
// Value for numbered anchors at each term match
int anchoridx = 1;
for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
// Check from time to time if we need to stop
if ((pos & 0xfff) == 0) {
CancelCheck::instance().checkCancel();
}
@ -401,20 +374,20 @@ bool plaintorich(const string& in, list<string>& out,
if (tPosIt != tboffsend) {
int ibyteidx = chariter.getBpos();
if (ibyteidx == tPosIt->first) {
if (lastAnchor)
*sit += termBeacon(anchoridx++);
*sit += "<termtag>";
*olit += startAnchor(anchoridx++);
*olit += startMatch();
} else if (ibyteidx == tPosIt->second) {
// Output end tag, then skip all highlight areas that
// would overlap this one
*sit += "</termtag>";
*olit += endMatch();
*olit += endAnchor();
int crend = tPosIt->second;
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
tPosIt++;
// Maybe end chunk
if (sit->size() > (unsigned int)chunksize) {
// Maybe end this chunk, begin next
if (olit->size() > (unsigned int)chunksize) {
out.push_back("");
sit++;
olit++;
}
}
}
@ -422,33 +395,29 @@ bool plaintorich(const string& in, list<string>& out,
switch(*chariter) {
case '\n':
if (ateol < 2) {
*sit += "<br>\n";
*olit += "<br>\n";
ateol++;
}
break;
case '\r':
break;
case '\007': // used as anchor char, strip other instances
break;
case '<':
ateol = 0;
*sit += "&lt;";
*olit += "&lt;";
break;
case '&':
ateol = 0;
*sit += "&amp;";
*olit += "&amp;";
break;
default:
// We don't change the eol status for whitespace, want a real line
if (!(*chariter == ' ' || *chariter == '\t')) {
ateol = 0;
}
chariter.appendchartostring(*sit);
chariter.appendchartostring(*olit);
}
}
if (lastAnchor)
*lastAnchor = anchoridx - 1;
#if 0
#if 1
{
FILE *fp = fopen("/tmp/debugplaintorich", "a");
fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");

View File

@ -16,42 +16,65 @@
*/
#ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_
/* @(#$Id: plaintorich.h,v 1.16 2007-11-15 18:05:32 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: plaintorich.h,v 1.17 2008-07-01 08:27:58 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
using std::list;
using std::string;
// A data struct to hold words and groups of words to be highlighted
/// Holder for plaintorich() input data: words and groups of words to
/// be highlighted
struct HiliteData {
// Single terms
vector<string> terms;
// NEAR and PHRASE elements
vector<vector<string> > groups;
vector<int> gslks; // group slacks (number of permitted non-matched words)
// Group slacks (number of permitted non-matched words).
// Parallel vector to the above 'groups'
vector<int> gslks;
};
/**
* Transform plain text into qt rich text for the preview window.
*
* We escape characters like < or &, and add qt rich text tags to
* colorize the query terms. The latter is a quite complicated matter because
* of phrase/near searches. We treat all such searches as "near", not "phrase"
*
* @param in raw text out of internfile.
* @param out rich text output, divided in chunks (to help our caller
* avoid inserting half tags into textedit which doesnt like it)
* @param hdata terms and groups to be highlighted. These are
* lowercase and unaccented.
* @param noHeader if true don't output header (<qt><title>...)
* @param needBeacons Need to navigate highlighted terms, mark them,return last
/**
* A class for highlighting search results. Overridable methods allow
* for different styles
*/
extern bool plaintorich(const string &in, list<string> &out,
const HiliteData& hdata,
bool noHeader,
int *needBeacons,
int chunksize = 50000
);
class PlainToRich {
public:
// Empty string: this is what the default implementations of the tag
// methods below return.
static const string snull;
virtual ~PlainToRich() {}
/**
* Transform plain text for highlighting search terms, ie in the
* preview window or result list entries.
*
* The actual tags used for highlighting and anchoring are
* determined by deriving from this class which handles the searching for
* terms and groups, but there is an assumption that the output will be
* html-like: we escape characters like < or &
*
* Finding the search terms is relatively complicated because of
* phrase/near searches, which need group highlights. As a matter
* of simplification, we handle "phrase" as "near", not filtering
* on word order.
*
* @param in raw text out of internfile.
* @param out rich text output, divided in chunks (to help our caller
* avoid inserting half tags into textedit which doesn't like it)
* @param hdata terms and groups to be highlighted. These are
* lowercase and unaccented.
* @param chunksize max size of chunks in output list
*/
virtual bool plaintorich(const string &in, list<string> &out,
const HiliteData& hdata,
int chunksize = 50000
);
extern string termAnchorName(int i);
/* Methods to output headers, highlighting and marking tags. The
* default implementations all return the empty string: derived
* classes override the ones they need. */
// Text stored at the very beginning of the first output chunk
virtual string header() {return snull;}
// Appended at the start of a highlighted area, right after startAnchor()
virtual string startMatch() {return snull;}
// Appended at the end of a highlighted area, right before endAnchor()
virtual string endMatch() {return snull;}
// Appended at the start of a highlighted area, before startMatch(). The
// parameter is the anchor number: plaintorich() counts them from 1.
virtual string startAnchor(int) {return snull;}
// Appended at the end of a highlighted area, after endMatch()
virtual string endAnchor() {return snull;}
};
#endif /* _PLAINTORICH_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.34 2008-05-21 07:21:37 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.35 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -345,17 +345,17 @@ void Preview::doSearch(const QString &_text, bool next, bool reverse,
return;
if (reverse) {
if (m_curAnchor == 1)
m_curAnchor = m_lastAnchor;
m_curAnchor = m_plaintorich.lastanchor;
else
m_curAnchor--;
} else {
if (m_curAnchor == m_lastAnchor)
if (m_curAnchor == m_plaintorich.lastanchor)
m_curAnchor = 1;
else
m_curAnchor++;
}
QString aname =
QString::fromUtf8(termAnchorName(m_curAnchor).c_str());
QString::fromUtf8(m_plaintorich.termAnchorName(m_curAnchor).c_str());
edit->moveToAnchor(aname);
return;
}
@ -552,7 +552,7 @@ void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
}
LOGDEB(("Doc.url: [%s]\n", doc.url.c_str()));
string url;
printableUrl(doc.url, url);
printableUrl(rclconfig->getDefCharset(), doc.url, url);
string tiptxt = url + string("\n");
tiptxt += doc.mimetype + " " + string(datebuf) + "\n";
if (meta_it != doc.meta.end() && !meta_it->second.empty())
@ -670,7 +670,7 @@ class LoadThread : public QThread {
*statusp = -1;
return;
}
FileInterner interner(filename, &st, rclconfig, tmpdir, mtype);
try {
FileInterner::Status ret = interner.internfile(*out, ipath);
@ -699,11 +699,11 @@ class ToRichThread : public QThread {
const HiliteData &hdata;
list<string> &out;
int loglevel;
int *lastanchor;
PlainToRichQtPreview& ptr;
public:
ToRichThread(string &i, const HiliteData& hd, list<string> &o,
int *lsta)
: in(i), hdata(hd), out(o), lastanchor(lsta)
PlainToRichQtPreview& _ptr)
: in(i), hdata(hd), out(o), ptr(_ptr)
{
loglevel = DebugLog::getdbl()->getlevel();
}
@ -711,7 +711,7 @@ class ToRichThread : public QThread {
{
DebugLog::getdbl()->setloglevel(loglevel);
try {
plaintorich(in, out, hdata, false, lastanchor);
ptr.plaintorich(in, out, hdata);
} catch (CancelExcept) {
}
}
@ -828,7 +828,7 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
progress.setLabelText(tr("Creating preview text"));
qApp->processEvents();
list<string> richlst;
ToRichThread rthr(fdoc.text, m_hData, richlst, &m_lastAnchor);
ToRichThread rthr(fdoc.text, m_hData, richlst, m_plaintorich);
rthr.start();
for (;;prog++) {
@ -911,7 +911,7 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
progress.close();
m_haveAnchors = m_lastAnchor != 0;
m_haveAnchors = m_plaintorich.lastanchor != 0;
if (searchTextLine->text().length() != 0) {
// If there is a current search string, perform the search
m_canBeep = true;
@ -919,7 +919,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
} else {
// Position to the first query term
if (m_haveAnchors) {
QString aname = QString::fromUtf8(termAnchorName(1).c_str());
QString aname =
QString::fromUtf8(m_plaintorich.termAnchorName(1).c_str());
LOGDEB2(("Call movetoanchor(%s)\n", (const char *)aname.utf8()));
editor->moveToAnchor(aname);
m_curAnchor = 1;

View File

@ -1,6 +1,6 @@
#ifndef _PREVIEW_W_H_INCLUDED_
#define _PREVIEW_W_H_INCLUDED_
/* @(#$Id: preview_w.h,v 1.17 2007-11-15 18:34:49 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: preview_w.h,v 1.18 2008-07-01 08:27:58 dockes Exp $ (C) 2006 J.F.Dockes */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -65,6 +65,33 @@ class TabData {
{}
};
// Subclass plainToRich to add <termtag>s and anchors to the preview text
class PlainToRichQtPreview : public PlainToRich {
public:
int lastanchor;
PlainToRichQtPreview() {
lastanchor = 0;
}
virtual ~PlainToRichQtPreview() {}
virtual string header() {
return string("<qt><head><title></title></head><body><p>");
}
virtual string startMatch() {return string("<termtag>");}
virtual string endMatch() {return string("</termtag>");}
virtual string termAnchorName(int i) {
static const char *termAnchorNameBase = "TRM";
char acname[sizeof(termAnchorNameBase) + 20];
sprintf(acname, "%s%d", termAnchorNameBase, i);
if (i > lastanchor)
lastanchor = i;
return string(acname);
}
virtual string startAnchor(int i) {
return string("<a name=\"") + termAnchorName(i) + "\">";
}
};
class Preview : public QWidget {
Q_OBJECT
@ -116,6 +143,7 @@ private:
QWidget *m_currentW;
HiliteData m_hData;
bool m_justCreated; // First tab create is different
PlainToRichQtPreview m_plaintorich;
bool m_haveAnchors; // Search terms are marked in text
int m_lastAnchor; // Number of last anchor. Then rewind to 1
int m_curAnchor;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.39 2008-05-21 07:21:37 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.40 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <time.h>
@ -49,6 +49,14 @@ static char rcsid[] = "@(#$Id: reslist.cpp,v 1.39 2008-05-21 07:21:37 dockes Exp
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#endif
// Highlighter for the result list: only the match tags are redefined,
// so that the header and anchor methods keep the base class behaviour
// (they return an empty string).
class PlainToRichQtReslist : public PlainToRich {
public:
    // Tag opening a highlighted area
    virtual string startMatch()
    {
        return "<termtag>";
    }
    // Tag closing a highlighted area
    virtual string endMatch()
    {
        return "</termtag>";
    }
    virtual ~PlainToRichQtReslist() {}
};
ResList::ResList(QWidget* parent, const char* name)
: QTEXTBROWSER(parent, name)
{
@ -430,7 +438,7 @@ void ResList::resultPageNext()
// Printable url: either utf-8 if transcoding succeeds, or url-encoded
string url;
printableUrl(doc.url, url);
printableUrl(rclconfig->getDefCharset(), doc.url, url);
// Make title out of file name if none yet
if (doc.meta["title"].empty()) {
@ -480,7 +488,8 @@ void ResList::resultPageNext()
}
// No need to call escapeHtml(), plaintorich handles it
list<string> lr;
plaintorich(abstract, lr, hdata, true, 0, 100000);
PlainToRichQtReslist ptr;
ptr.plaintorich(abstract, lr, hdata);
string richabst = lr.front();
// Links;