Let plaintorich do the chunking: it is easier that way to make sure we don't confuse textedit by cutting inside a tag

This commit is contained in:
dockes 2007-10-18 10:39:41 +00:00
parent df1817414f
commit 607da9bb5e
4 changed files with 102 additions and 84 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.28 2007-10-17 16:12:38 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.29 2007-10-18 10:39:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -301,7 +301,7 @@ bool myTextSplitCB::matchGroups()
}
// Setting searchable beacons in the text to walk the term list.
static const char *termAnchorNameBase = "FIRSTTERM";
static const char *termAnchorNameBase = "TRM";
string termAnchorName(int i)
{
char acname[sizeof(termAnchorNameBase) + 20];
@ -314,8 +314,9 @@ string termAnchorName(int i)
// search hit positions does not work well. So we mark the positions with
// a special string which we then use with the find() function for positioning.
// We used to use some weird utf8 char for this, but this was displayed
// inconsistently depending on system, font, etc. We now use a good ole bel
// char which doesn't seem to cause any trouble.
// inconsistently depending on system, font, etc. We now use a good ole ctl
// char which doesn't seem to cause any trouble. We wanted to use ^L, but it
// can't be searched for, so we use ^G
const char *firstTermBeacon = "\007";
#endif
@ -339,12 +340,11 @@ static string termBeacon(int i)
// Instead, we mark the search term positions either with html anchors
// (qt currently has problems with them), or a special string, and the
// caller will use the editor's find() function to position on it
bool plaintorich(const string& in, string& out,
bool plaintorich(const string& in, list<string>& out,
const HiliteData& hdata,
bool noHeader, bool needBeacons)
bool noHeader, bool needBeacons, int chunksize)
{
Chrono chron;
out.erase();
const vector<string>& terms(hdata.terms);
const vector<vector<string> >& groups(hdata.groups);
const vector<int>& slacks(hdata.gslks);
@ -375,11 +375,15 @@ bool plaintorich(const string& in, string& out,
cb.matchGroups();
out.clear();
out.push_back("");
list<string>::iterator sit = out.begin();
// Rich text output
if (noHeader)
out = "";
*sit = "";
else
out = "<qt><head><title></title></head><body><p>";
*sit = "<qt><head><title></title></head><body><p>";
// Iterator for the list of input term positions. We use it to
// output highlight tags and to compute term positions in the
@ -413,47 +417,61 @@ bool plaintorich(const string& in, string& out,
int ibyteidx = chariter.getBpos();
if (ibyteidx == tPosIt->first) {
if (needBeacons)
out += termBeacon(anchoridx++);
out += "<termtag>";
*sit += termBeacon(anchoridx++);
*sit += "<termtag>";
} else if (ibyteidx == tPosIt->second) {
// Output end tag, then skip all highlight areas that
// would overlap this one
out += "</termtag>";
*sit += "</termtag>";
int crend = tPosIt->second;
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
tPosIt++;
// Maybe end chunk
if (sit->size() > (unsigned int)chunksize) {
out.push_back("");
sit++;
}
}
}
switch(*chariter) {
case '\n':
if (ateol < 2) {
out += "<br>\n";
*sit += "<br>\n";
ateol++;
}
break;
case '\r':
break;
case '\007': // used as anchor char, strip other instances
break;
case '<':
ateol = 0;
out += "&lt;";
*sit += "&lt;";
break;
case '&':
ateol = 0;
out += "&amp;";
*sit += "&amp;";
break;
default:
// We don't change the eol status for whitespace, want a real line
if (!(*chariter == ' ' || *chariter == '\t')) {
ateol = 0;
}
chariter.appendchartostring(out);
chariter.appendchartostring(*sit);
}
}
#if 1
#if 0
{
FILE *fp = fopen("/tmp/debugplaintorich", "a");
fprintf(fp, "%s\n", out.c_str());
fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
for (list<string>::iterator it = out.begin();
it != out.end(); it++) {
fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n");
fprintf(fp, "%s", it->c_str());
fprintf(fp, "ENDOFPLAINTORICHCHUNK\n");
}
fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
fclose(fp);
}
#endif

View File

@ -16,9 +16,12 @@
*/
#ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_
/* @(#$Id: plaintorich.h,v 1.14 2007-06-25 10:13:40 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: plaintorich.h,v 1.15 2007-10-18 10:39:41 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
using std::list;
using std::string;
// A data struct to hold words and groups of words to be highlighted
struct HiliteData {
@ -35,23 +38,26 @@ struct HiliteData {
* of phrase/near searches. We treat all such searches as "near", not "phrase"
*
* @param in raw text out of internfile.
* @param out rich text output
* @param out rich text output, divided in chunks (to help our caller
* avoid inserting half tags into textedit, which doesn't like it)
* @param hdata terms and groups to be highlighted. These are
* lowercase and unaccented.
* @param noHeader if true don't output header (<qt><title>...)
* @param needBeacons Need to navigate highlighted terms, mark them.
*/
extern bool plaintorich(const string &in, string &out,
extern bool plaintorich(const string &in, list<string> &out,
const HiliteData& hdata,
bool noHeader = false,
bool needBeacons = true);
bool noHeader,
bool needBeacons,
int chunksize = 50000
);
extern string termAnchorName(int i);
#define QT_SCROLL_TO_ANCHOR_BUG
#ifdef QT_SCROLL_TO_ANCHOR_BUG
// For some reason, can't get scrollToAnchor() to work. We use a string made
// of a few rare utf8 chars as a beacon for the match area.
// For some reason, can't get scrollToAnchor() to work. We use a special
// string as a beacon for the match area.
extern const char *firstTermBeacon;
#endif

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.27 2007-09-08 17:25:49 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.28 2007-10-18 10:39:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -35,10 +35,12 @@ using std::pair;
#if (QT_VERSION < 0x040000)
#include <qtextedit.h>
#include <qprogressdialog.h>
#define THRFINISHED finished
#else
#include <q3textedit.h>
#include <q3progressdialog.h>
#include <q3stylesheet.h>
#define THRFINISHED isFinished
#endif
#include <qevent.h>
#include <qlabel.h>
@ -581,10 +583,10 @@ class LoadThread : public QThread {
class ToRichThread : public QThread {
string &in;
const HiliteData &hdata;
QString &out;
list<string> &out;
int loglevel;
public:
ToRichThread(string &i, const HiliteData& hd, QString &o)
ToRichThread(string &i, const HiliteData& hd, list<string> &o)
: in(i), hdata(hd), out(o)
{
loglevel = DebugLog::getdbl()->getlevel();
@ -592,12 +594,10 @@ class ToRichThread : public QThread {
virtual void run()
{
DebugLog::getdbl()->setloglevel(loglevel);
string rich;
try {
plaintorich(in, rich, hdata, false, true);
plaintorich(in, out, hdata, false, true);
} catch (CancelExcept) {
}
out = QString::fromUtf8(rich.c_str(), rich.length());
}
};
@ -665,13 +665,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
for (prog = 1;;prog++) {
waiter.start();
waiter.wait();
#if (QT_VERSION < 0x040000)
if (lthr.finished())
if (lthr.THRFINISHED ())
break;
#else
if (lthr.isFinished())
break;
#endif
progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1);
qApp->processEvents();
if (progress.wasCanceled()) {
@ -703,29 +698,27 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
// Reset config just in case.
rclconfig->setKeyDir("");
// Create preview text: highlight search terms (if not too big):
QString richTxt;
// Create preview text: highlight search terms
// We don't do the highlighting for very big texts: too long. We
// should at least do special char escaping, in case a '&' or '<'
// somehow slipped through previous processing.
bool highlightTerms = fdoc.text.length() < (unsigned long)prefs.maxhltextmbs * 1024 * 1024;
int beaconPos = -1;
bool highlightTerms = fdoc.text.length() <
(unsigned long)prefs.maxhltextmbs * 1024 * 1024;
// Final text is produced in chunks so that we can display the top
// while still inserting at bottom
list<QString> qrichlst;
if (highlightTerms) {
progress.setLabelText(tr("Creating preview text"));
qApp->processEvents();
ToRichThread rthr(fdoc.text, m_hData, richTxt);
list<string> richlst;
ToRichThread rthr(fdoc.text, m_hData, richlst);
rthr.start();
for (;;prog++) {
waiter.start(); waiter.wait();
#if (QT_VERSION < 0x040000)
if (rthr.finished())
break;
#else
if (rthr.isFinished())
break;
#endif
if (rthr.THRFINISHED ())
break;
progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1);
qApp->processEvents();
if (progress.wasCanceled()) {
@ -737,32 +730,36 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
// Conversion to rich text done
if (CancelCheck::instance().cancelState()) {
if (richTxt.length() == 0) {
if (richlst.size() == 0 || richlst.front().length() == 0) {
// We can't call closeCurrentTab here as it might delete
// the object which would be a nasty surprise to our
// caller.
return false;
} else {
richTxt += "<b>Cancelled !</b>";
richlst.back() += "<b>Cancelled !</b>";
}
}
beaconPos = richTxt.find(QString::fromUtf8(firstTermBeacon));
// Convert to QString list
for (list<string>::iterator it = richlst.begin();
it != richlst.end(); it++) {
qrichlst.push_back(QString::fromUtf8(it->c_str(), it->length()));
}
} else {
// Note that in the case where we don't call plaintorich, the
// text will not be identified as richtxt/html (no <html> or
// <qt> etc. at the beginning), and there is no need to escape
// special characters
richTxt = QString::fromUtf8(fdoc.text.c_str(), fdoc.text.length());
// No plaintorich() call.
// In this case, the text will not be identified as
// richtxt/html (no <html> or <qt> etc. at the beginning), and
// there is no need to escape special characters.
// We also need to split it in chunks (so that the top is displayed faster),
// and we must do it on a QString (to avoid utf8 issues).
QString qr = QString::fromUtf8(fdoc.text.c_str(), fdoc.text.length());
int l = 0;
for (int pos = 0; pos < (int)qr.length(); pos += l) {
l = MIN(CHUNKL, qr.length() - pos);
qrichlst.push_back(qr.mid(pos, l));
}
}
m_haveAnchors = (beaconPos != -1);
LOGDEB(("LoadFileInCurrentTab: rich: cancel %d txtln %d, hasAnchors %d "
"(beaconPos %d)\n",
CancelCheck::instance().cancelState(), richTxt.length(),
m_haveAnchors, beaconPos));
// Load into editor
// Do it in several chunks
QTextEdit *editor = getCurrentEditor();
editor->setText("");
if (highlightTerms) {
@ -775,24 +772,18 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
prog = 2 * nsteps / 3;
progress.setLabelText(tr("Loading preview text into editor"));
qApp->processEvents();
int l = 0;
for (int pos = 0; pos < (int)richTxt.length(); pos += l, prog++) {
int instep = 0;
for (list<QString>::iterator it = qrichlst.begin();
it != qrichlst.end(); it++, prog++, instep++) {
progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1);
qApp->processEvents();
l = MIN(CHUNKL, richTxt.length() - pos);
// Avoid breaking inside a tag. Our tags are short (ie: <br>)
if (pos + l != (int)richTxt.length()) {
for (int i = -15; i < 0; i++) {
if (richTxt[pos+l+i] == '<') {
l = l+i;
break;
}
}
}
editor->append(richTxt.mid(pos, l));
if (it->find(QString::fromUtf8(firstTermBeacon)) != -1)
m_haveAnchors = true;
editor->append(*it);
// Stay at top
if (pos < 5) {
if (instep < 5) {
editor->setCursorPosition(0,0);
editor->ensureCursorVisible();
}
@ -803,6 +794,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
break;
}
}
progress.close();
if (searchTextLine->text().length() != 0) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.34 2007-08-07 08:42:47 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.35 2007-10-18 10:39:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <time.h>
@ -478,8 +478,9 @@ void ResList::resultPageNext()
abstract = doc.meta["abstract"];
}
// No need to call escapeHtml(), plaintorich handles it
string richabst;
plaintorich(abstract, richabst, hdata, true, false);
list<string> lr;
plaintorich(abstract, lr, hdata, true, false, 100000);
string richabst = lr.front();
// Links;
string linksbuf;