Let plaintorich do the chunking: this makes it easier to ensure we don't confuse textedit by cutting inside a tag

This commit is contained in:
dockes 2007-10-18 10:39:41 +00:00
parent df1817414f
commit 607da9bb5e
4 changed files with 102 additions and 84 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.28 2007-10-17 16:12:38 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.29 2007-10-18 10:39:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -301,7 +301,7 @@ bool myTextSplitCB::matchGroups()
} }
// Setting searchable beacons in the text to walk the term list. // Setting searchable beacons in the text to walk the term list.
static const char *termAnchorNameBase = "FIRSTTERM"; static const char *termAnchorNameBase = "TRM";
string termAnchorName(int i) string termAnchorName(int i)
{ {
char acname[sizeof(termAnchorNameBase) + 20]; char acname[sizeof(termAnchorNameBase) + 20];
@ -314,8 +314,9 @@ string termAnchorName(int i)
// search hit positions does not work well. So we mark the positions with // search hit positions does not work well. So we mark the positions with
// a special string which we then use with the find() function for positionning // a special string which we then use with the find() function for positionning
// We used to use some weird utf8 char for this, but this was displayed // We used to use some weird utf8 char for this, but this was displayed
// inconsistently depending of system, font, etc. We now use a good ole bel // inconsistently depending of system, font, etc. We now use a good ole ctl
// char which doesnt' seem to cause any trouble. // char which doesnt' seem to cause any trouble. Wanted to use ^L, but can't
// be searched, so ^G
const char *firstTermBeacon = "\007"; const char *firstTermBeacon = "\007";
#endif #endif
@ -339,12 +340,11 @@ static string termBeacon(int i)
// Instead, we mark the search term positions either with html anchor // Instead, we mark the search term positions either with html anchor
// (qt currently has problems with them), or a special string, and the // (qt currently has problems with them), or a special string, and the
// caller will use the editor's find() function to position on it // caller will use the editor's find() function to position on it
bool plaintorich(const string& in, string& out, bool plaintorich(const string& in, list<string>& out,
const HiliteData& hdata, const HiliteData& hdata,
bool noHeader, bool needBeacons) bool noHeader, bool needBeacons, int chunksize)
{ {
Chrono chron; Chrono chron;
out.erase();
const vector<string>& terms(hdata.terms); const vector<string>& terms(hdata.terms);
const vector<vector<string> >& groups(hdata.groups); const vector<vector<string> >& groups(hdata.groups);
const vector<int>& slacks(hdata.gslks); const vector<int>& slacks(hdata.gslks);
@ -375,11 +375,15 @@ bool plaintorich(const string& in, string& out,
cb.matchGroups(); cb.matchGroups();
out.clear();
out.push_back("");
list<string>::iterator sit = out.begin();
// Rich text output // Rich text output
if (noHeader) if (noHeader)
out = ""; *sit = "";
else else
out = "<qt><head><title></title></head><body><p>"; *sit = "<qt><head><title></title></head><body><p>";
// Iterator for the list of input term positions. We use it to // Iterator for the list of input term positions. We use it to
// output highlight tags and to compute term positions in the // output highlight tags and to compute term positions in the
@ -413,47 +417,61 @@ bool plaintorich(const string& in, string& out,
int ibyteidx = chariter.getBpos(); int ibyteidx = chariter.getBpos();
if (ibyteidx == tPosIt->first) { if (ibyteidx == tPosIt->first) {
if (needBeacons) if (needBeacons)
out += termBeacon(anchoridx++); *sit += termBeacon(anchoridx++);
out += "<termtag>"; *sit += "<termtag>";
} else if (ibyteidx == tPosIt->second) { } else if (ibyteidx == tPosIt->second) {
// Output end tag, then skip all highlight areas that // Output end tag, then skip all highlight areas that
// would overlap this one // would overlap this one
out += "</termtag>"; *sit += "</termtag>";
int crend = tPosIt->second; int crend = tPosIt->second;
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend) while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
tPosIt++; tPosIt++;
// Maybe end chunk
if (sit->size() > (unsigned int)chunksize) {
out.push_back("");
sit++;
}
} }
} }
switch(*chariter) { switch(*chariter) {
case '\n': case '\n':
if (ateol < 2) { if (ateol < 2) {
out += "<br>\n"; *sit += "<br>\n";
ateol++; ateol++;
} }
break; break;
case '\r': case '\r':
break; break;
case '\007': // used as anchor char, strip other instances
break;
case '<': case '<':
ateol = 0; ateol = 0;
out += "&lt;"; *sit += "&lt;";
break; break;
case '&': case '&':
ateol = 0; ateol = 0;
out += "&amp;"; *sit += "&amp;";
break; break;
default: default:
// We don't change the eol status for whitespace, want a real line // We don't change the eol status for whitespace, want a real line
if (!(*chariter == ' ' || *chariter == '\t')) { if (!(*chariter == ' ' || *chariter == '\t')) {
ateol = 0; ateol = 0;
} }
chariter.appendchartostring(out); chariter.appendchartostring(*sit);
} }
} }
#if 1 #if 0
{ {
FILE *fp = fopen("/tmp/debugplaintorich", "a"); FILE *fp = fopen("/tmp/debugplaintorich", "a");
fprintf(fp, "%s\n", out.c_str()); fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
for (list<string>::iterator it = out.begin();
it != out.end(); it++) {
fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n");
fprintf(fp, "%s", it->c_str());
fprintf(fp, "ENDOFPLAINTORICHCHUNK\n");
}
fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
fclose(fp); fclose(fp);
} }
#endif #endif

View File

@ -16,9 +16,12 @@
*/ */
#ifndef _PLAINTORICH_H_INCLUDED_ #ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_ #define _PLAINTORICH_H_INCLUDED_
/* @(#$Id: plaintorich.h,v 1.14 2007-06-25 10:13:40 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: plaintorich.h,v 1.15 2007-10-18 10:39:41 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list>
using std::list;
using std::string;
// A data struct to hold words and groups of words to be highlighted // A data struct to hold words and groups of words to be highlighted
struct HiliteData { struct HiliteData {
@ -35,23 +38,26 @@ struct HiliteData {
* of phrase/near searches. We treat all such searches as "near", not "phrase" * of phrase/near searches. We treat all such searches as "near", not "phrase"
* *
* @param in raw text out of internfile. * @param in raw text out of internfile.
* @param out rich text output * @param out rich text output, divided in chunks (to help our caller
* avoid inserting half tags into textedit, which doesn't like it)
* @param hdata terms and groups to be highlighted. These are * @param hdata terms and groups to be highlighted. These are
* lowercase and unaccented. * lowercase and unaccented.
* @param noHeader if true don't output header (<qt><title>...) * @param noHeader if true don't output header (<qt><title>...)
* @param needBeacons Need to navigate highlighted terms, mark them. * @param needBeacons Need to navigate highlighted terms, mark them.
*/ */
extern bool plaintorich(const string &in, string &out, extern bool plaintorich(const string &in, list<string> &out,
const HiliteData& hdata, const HiliteData& hdata,
bool noHeader = false, bool noHeader,
bool needBeacons = true); bool needBeacons,
int chunksize = 50000
);
extern string termAnchorName(int i); extern string termAnchorName(int i);
#define QT_SCROLL_TO_ANCHOR_BUG #define QT_SCROLL_TO_ANCHOR_BUG
#ifdef QT_SCROLL_TO_ANCHOR_BUG #ifdef QT_SCROLL_TO_ANCHOR_BUG
// For some reason, can't get scrollToAnchor() to work. We use a string made // For some reason, can't get scrollToAnchor() to work. We use a special
// of a few rare utf8 chars as a beacon for the match area. // string as a beacon for the match area.
extern const char *firstTermBeacon; extern const char *firstTermBeacon;
#endif #endif

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.27 2007-09-08 17:25:49 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.28 2007-10-18 10:39:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -35,10 +35,12 @@ using std::pair;
#if (QT_VERSION < 0x040000) #if (QT_VERSION < 0x040000)
#include <qtextedit.h> #include <qtextedit.h>
#include <qprogressdialog.h> #include <qprogressdialog.h>
#define THRFINISHED finished
#else #else
#include <q3textedit.h> #include <q3textedit.h>
#include <q3progressdialog.h> #include <q3progressdialog.h>
#include <q3stylesheet.h> #include <q3stylesheet.h>
#define THRFINISHED isFinished
#endif #endif
#include <qevent.h> #include <qevent.h>
#include <qlabel.h> #include <qlabel.h>
@ -581,10 +583,10 @@ class LoadThread : public QThread {
class ToRichThread : public QThread { class ToRichThread : public QThread {
string &in; string &in;
const HiliteData &hdata; const HiliteData &hdata;
QString &out; list<string> &out;
int loglevel; int loglevel;
public: public:
ToRichThread(string &i, const HiliteData& hd, QString &o) ToRichThread(string &i, const HiliteData& hd, list<string> &o)
: in(i), hdata(hd), out(o) : in(i), hdata(hd), out(o)
{ {
loglevel = DebugLog::getdbl()->getlevel(); loglevel = DebugLog::getdbl()->getlevel();
@ -592,12 +594,10 @@ class ToRichThread : public QThread {
virtual void run() virtual void run()
{ {
DebugLog::getdbl()->setloglevel(loglevel); DebugLog::getdbl()->setloglevel(loglevel);
string rich;
try { try {
plaintorich(in, rich, hdata, false, true); plaintorich(in, out, hdata, false, true);
} catch (CancelExcept) { } catch (CancelExcept) {
} }
out = QString::fromUtf8(rich.c_str(), rich.length());
} }
}; };
@ -665,13 +665,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
for (prog = 1;;prog++) { for (prog = 1;;prog++) {
waiter.start(); waiter.start();
waiter.wait(); waiter.wait();
#if (QT_VERSION < 0x040000) if (lthr.THRFINISHED ())
if (lthr.finished())
break; break;
#else
if (lthr.isFinished())
break;
#endif
progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1); progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1);
qApp->processEvents(); qApp->processEvents();
if (progress.wasCanceled()) { if (progress.wasCanceled()) {
@ -703,29 +698,27 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
// Reset config just in case. // Reset config just in case.
rclconfig->setKeyDir(""); rclconfig->setKeyDir("");
// Create preview text: highlight search terms (if not too big): // Create preview text: highlight search terms
QString richTxt;
// We don't do the highlighting for very big texts: too long. We // We don't do the highlighting for very big texts: too long. We
// should at least do special char escaping, in case a '&' or '<' // should at least do special char escaping, in case a '&' or '<'
// somehow slipped through previous processing. // somehow slipped through previous processing.
bool highlightTerms = fdoc.text.length() < (unsigned long)prefs.maxhltextmbs * 1024 * 1024; bool highlightTerms = fdoc.text.length() <
int beaconPos = -1; (unsigned long)prefs.maxhltextmbs * 1024 * 1024;
// Final text is produced in chunks so that we can display the top
// while still inserting at bottom
list<QString> qrichlst;
if (highlightTerms) { if (highlightTerms) {
progress.setLabelText(tr("Creating preview text")); progress.setLabelText(tr("Creating preview text"));
qApp->processEvents(); qApp->processEvents();
ToRichThread rthr(fdoc.text, m_hData, richTxt); list<string> richlst;
ToRichThread rthr(fdoc.text, m_hData, richlst);
rthr.start(); rthr.start();
for (;;prog++) { for (;;prog++) {
waiter.start(); waiter.wait(); waiter.start(); waiter.wait();
#if (QT_VERSION < 0x040000) if (rthr.THRFINISHED ())
if (rthr.finished()) break;
break;
#else
if (rthr.isFinished())
break;
#endif
progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1); progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1);
qApp->processEvents(); qApp->processEvents();
if (progress.wasCanceled()) { if (progress.wasCanceled()) {
@ -737,32 +730,36 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
// Conversion to rich text done // Conversion to rich text done
if (CancelCheck::instance().cancelState()) { if (CancelCheck::instance().cancelState()) {
if (richTxt.length() == 0) { if (richlst.size() == 0 || richlst.front().length() == 0) {
// We cant call closeCurrentTab here as it might delete // We cant call closeCurrentTab here as it might delete
// the object which would be a nasty surprise to our // the object which would be a nasty surprise to our
// caller. // caller.
return false; return false;
} else { } else {
richTxt += "<b>Cancelled !</b>"; richlst.back() += "<b>Cancelled !</b>";
} }
} }
beaconPos = richTxt.find(QString::fromUtf8(firstTermBeacon)); // Convert to QString list
for (list<string>::iterator it = richlst.begin();
it != richlst.end(); it++) {
qrichlst.push_back(QString::fromUtf8(it->c_str(), it->length()));
}
} else { } else {
// Note that in the case were we don't call plaintorich, the // No plaintorich() call.
// text will no be identified as richtxt/html (no <html> or // In this case, the text will no be identified as
// <qt> etc. at the beginning), and there is no need to escape // richtxt/html (no <html> or <qt> etc. at the beginning), and
// special characters // there is no need to escape special characters.
richTxt = QString::fromUtf8(fdoc.text.c_str(), fdoc.text.length()); // Also we need to split in chunks (so that the top is displayed faster),
// and we must do it on a QString (to avoid utf8 issues).
QString qr = QString::fromUtf8(fdoc.text.c_str(), fdoc.text.length());
int l = 0;
for (int pos = 0; pos < (int)qr.length(); pos += l) {
l = MIN(CHUNKL, qr.length() - pos);
qrichlst.push_back(qr.mid(pos, l));
}
} }
m_haveAnchors = (beaconPos != -1);
LOGDEB(("LoadFileInCurrentTab: rich: cancel %d txtln %d, hasAnchors %d "
"(beaconPos %d)\n",
CancelCheck::instance().cancelState(), richTxt.length(),
m_haveAnchors, beaconPos));
// Load into editor // Load into editor
// Do it in several chunks
QTextEdit *editor = getCurrentEditor(); QTextEdit *editor = getCurrentEditor();
editor->setText(""); editor->setText("");
if (highlightTerms) { if (highlightTerms) {
@ -775,24 +772,18 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
prog = 2 * nsteps / 3; prog = 2 * nsteps / 3;
progress.setLabelText(tr("Loading preview text into editor")); progress.setLabelText(tr("Loading preview text into editor"));
qApp->processEvents(); qApp->processEvents();
int l = 0; int instep = 0;
for (int pos = 0; pos < (int)richTxt.length(); pos += l, prog++) { for (list<QString>::iterator it = qrichlst.begin();
it != qrichlst.end(); it++, prog++, instep++) {
progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1); progress.setProgress(prog , prog <= nsteps-1 ? nsteps : prog+1);
qApp->processEvents(); qApp->processEvents();
if (it->find(QString::fromUtf8(firstTermBeacon)) != -1)
l = MIN(CHUNKL, richTxt.length() - pos); m_haveAnchors = true;
// Avoid breaking inside a tag. Our tags are short (ie: <br>)
if (pos + l != (int)richTxt.length()) { editor->append(*it);
for (int i = -15; i < 0; i++) {
if (richTxt[pos+l+i] == '<') {
l = l+i;
break;
}
}
}
editor->append(richTxt.mid(pos, l));
// Stay at top // Stay at top
if (pos < 5) { if (instep < 5) {
editor->setCursorPosition(0,0); editor->setCursorPosition(0,0);
editor->ensureCursorVisible(); editor->ensureCursorVisible();
} }
@ -803,6 +794,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
break; break;
} }
} }
progress.close(); progress.close();
if (searchTextLine->text().length() != 0) { if (searchTextLine->text().length() != 0) {

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.34 2007-08-07 08:42:47 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: reslist.cpp,v 1.35 2007-10-18 10:39:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
#include <time.h> #include <time.h>
@ -478,8 +478,9 @@ void ResList::resultPageNext()
abstract = doc.meta["abstract"]; abstract = doc.meta["abstract"];
} }
// No need to call escapeHtml(), plaintorich handles it // No need to call escapeHtml(), plaintorich handles it
string richabst; list<string> lr;
plaintorich(abstract, richabst, hdata, true, false); plaintorich(abstract, lr, hdata, true, false, 100000);
string richabst = lr.front();
// Links; // Links;
string linksbuf; string linksbuf;