From 5856df2230c292b8d8592ac86e543081e1be2d5a Mon Sep 17 00:00:00 2001 From: dockes Date: Tue, 1 Jul 2008 08:27:58 +0000 Subject: [PATCH] cleaned up plaintorich. Now a proper subclassable class + highlights multiple groups, not just the first --- src/qtgui/plaintorich.cpp | 171 ++++++++++++++++---------------------- src/qtgui/plaintorich.h | 71 ++++++++++------ src/qtgui/preview_w.cpp | 27 +++--- src/qtgui/preview_w.h | 30 ++++++- src/qtgui/reslist.cpp | 15 +++- 5 files changed, 172 insertions(+), 142 deletions(-) diff --git a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp index 7210ace1..91298380 100644 --- a/src/qtgui/plaintorich.cpp +++ b/src/qtgui/plaintorich.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.30 2007-11-15 18:05:32 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.31 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -43,6 +43,8 @@ using std::set; #include "plaintorich.h" #include "cancelcheck.h" +const string PlainToRich::snull = ""; + // For debug printing static string vecStringToString(const vector& t) { @@ -58,19 +60,13 @@ static string vecStringToString(const vector& t) class myTextSplitCB : public TextSplitCB { public: - // Out: first query term found in text - string firstTerm; - int firstTermOcc; - int m_firstTermPos; - int m_firstTermBPos; - // Out: begin and end byte positions of query terms/groups in text vector > tboffs; myTextSplitCB(const vector& its, const vector >&groups, const vector& slacks) - : firstTermOcc(1), m_wcount(0), m_groups(groups), m_slacks(slacks) + : m_wcount(0), m_groups(groups), m_slacks(slacks) { for (vector::const_iterator it = its.begin(); it != its.end(); it++) { @@ -95,11 +91,6 @@ class myTextSplitCB : public TextSplitCB { // If this word is a search term, remember its byte-offset span. if (m_terms.find(dumb) != m_terms.end()) { tboffs.push_back(pair(bts, bte)); - if (firstTerm.empty()) { - firstTerm = term; - m_firstTermPos = pos; - m_firstTermBPos = bts; - } } if (m_gterms.find(dumb) != m_gterms.end()) { @@ -148,10 +139,12 @@ class VecIntCmpShorter { #define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \ if ((POS) > (STO)) (STO) = (POS);} -// Recursively check that each term is inside the window (which is readjusted -// as the successive terms are found) +// Recursively check that each term is inside the window (which is +// readjusted as the successive terms are found). i is the index for +// the next position list to use (initially 1) static bool do_proximity_test(int window, vector* >& plists, - unsigned int i, int min, int max, int *sp, int *ep) + unsigned int i, int min, int max, + int *sp, int *ep) { int tmp = max + 1; // take care to avoid underflow @@ -210,7 +203,7 @@ bool myTextSplitCB::matchGroup(const vector& terms, int window) it != terms.end(); it++) { map >::iterator pl = m_plists.find(*it); if (pl == m_plists.end()) { - LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n", + LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n", (*it).c_str())); continue; } @@ -218,58 +211,53 @@ bool myTextSplitCB::matchGroup(const vector& terms, int window) plistToTerm[&(pl->second)] = *it; realgroup.push_back(*it); } - LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window, - vecStringToString(realgroup).c_str())); - if (plists.size() < 2) + LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n", + window, vecStringToString(realgroup).c_str())); + if (plists.size() < 2) { + LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n")); return false; + } // Sort the positions lists so that the shorter is first std::sort(plists.begin(), plists.end(), VecIntCmpShorter()); - // Walk the shortest plist and look for matches - int sta = int(10E9), sto = 0; - int pos; - // Occurrences are from 1->N - firstTermOcc = 0; - vector::iterator it = plists[0]->begin(); - do { - if (it == plists[0]->end()) + { // Debug + map*, string>::iterator it; + it = plistToTerm.find(plists[0]); + if (it == plistToTerm.end()) { + // SuperWeird + LOGERR(("matchGroup: term for first list not found !?!\n")); return false; - pos = *it++; - firstTermOcc++; - } while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)); - SETMINMAX(pos, sta, sto); - - LOGDEB0(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto)); - - // Translate the position window into a byte offset window - int bs = 0; - map >::iterator i1 = m_gpostobytes.find(sta); - map >::iterator i2 = m_gpostobytes.find(sto); - if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { - LOGDEB1(("myTextSplitCB::matchGroup: pushing %d %d\n", - i1->second.first, i2->second.second)); - tboffs.push_back(pair(i1->second.first, i2->second.second)); - bs = i1->second.first; - } else { - LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", - sta, sto)); + } + LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n", + it->second.c_str(), plists[0]->size())); } - if (firstTerm.empty() || m_firstTermPos > sta) { - // firsTerm is used to try an position the preview window over - // the match. As it's difficult to divine byte/word positions - // in qtextedit, we use a string search. Use the - // shortest plist for this, which hopefully gives a better - // chance for the group to be found (it's hopeless to try and - // match the whole group) - map*, string>::iterator it = - plistToTerm.find(plists.front()); - if (it != plistToTerm.end()) - firstTerm = it->second; - LOGDEB0(("myTextSplitCB:: best group term %s, firstTermOcc %d\n", - firstTerm.c_str(), firstTermOcc)); - m_firstTermPos = sta; - m_firstTermBPos = bs; + // Walk the shortest plist and look for matches + for (vector::iterator it = plists[0]->begin(); + it != plists[0]->end(); it++) { + int pos = *it; + int sta = int(10E9), sto = 0; + LOGDEB0(("MatchGroup: Testing at pos %d\n", pos)); + if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) { + LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n", + sta, sto)); + // Maybe extend the window by 1st term position, this was not + // done by do_prox.. + SETMINMAX(pos, sta, sto); + // Translate the position window into a byte offset window + int bs = 0; + map >::iterator i1 = m_gpostobytes.find(sta); + map >::iterator i2 = m_gpostobytes.find(sto); + if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { + LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n", + i1->second.first, i2->second.second)); + tboffs.push_back(pair(i1->second.first, + i2->second.second)); + bs = i1->second.first; + } else { + LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto)); + } + } } return true; @@ -300,20 +288,6 @@ bool myTextSplitCB::matchGroups() return true; } -// Setting searchable beacons in the text to walk the term list. -static const char *termAnchorNameBase = "TRM"; -string termAnchorName(int i) -{ - char acname[sizeof(termAnchorNameBase) + 20]; - sprintf(acname, "%s%d", termAnchorNameBase, i); - return string(acname); -} - -static string termBeacon(int i) -{ - return string(""; -} - // Fix result text for display inside the gui text window. // @@ -325,9 +299,9 @@ static string termBeacon(int i) // Instead, we mark the search term positions either with html anchor // (qt currently has problems with them), or a special string, and the // caller will use the editor's find() function to position on it -bool plaintorich(const string& in, list& out, - const HiliteData& hdata, - bool noHeader, int *lastAnchor, int chunksize) +bool PlainToRich::plaintorich(const string& in, list& out, + const HiliteData& hdata, + int chunksize) { Chrono chron; const vector& terms(hdata.terms); @@ -342,6 +316,7 @@ bool plaintorich(const string& in, list& out, LOGDEB0(("plaintorich: groups: \n")); for (vector >::const_iterator vit = groups.begin(); vit != groups.end(); vit++) { + sterms += "GROUP: "; sterms += vecStringToString(*vit); sterms += "\n"; } @@ -362,13 +337,10 @@ bool plaintorich(const string& in, list& out, out.clear(); out.push_back(""); - list::iterator sit = out.begin(); + list::iterator olit = out.begin(); // Rich text output - if (noHeader) - *sit = ""; - else - *sit = "

"; + *olit = header(); // Iterator for the list of input term positions. We use it to // output highlight tags and to compute term positions in the @@ -388,10 +360,11 @@ bool plaintorich(const string& in, list& out, // State variable used to limitate the number of consecutive empty lines int ateol = 0; - // Stuff for numbered anchors at each term match + // Value for numbered anchors at each term match int anchoridx = 1; for (string::size_type pos = 0; pos != string::npos; pos = chariter++) { + // Check from time to time if we need to stop if ((pos & 0xfff) == 0) { CancelCheck::instance().checkCancel(); } @@ -401,20 +374,20 @@ bool plaintorich(const string& in, list& out, if (tPosIt != tboffsend) { int ibyteidx = chariter.getBpos(); if (ibyteidx == tPosIt->first) { - if (lastAnchor) - *sit += termBeacon(anchoridx++); - *sit += ""; + *olit += startAnchor(anchoridx++); + *olit += startMatch(); } else if (ibyteidx == tPosIt->second) { // Output end tag, then skip all highlight areas that // would overlap this one - *sit += ""; + *olit += endMatch(); + *olit += endAnchor(); int crend = tPosIt->second; while (tPosIt != cb.tboffs.end() && tPosIt->first < crend) tPosIt++; - // Maybe end chunk - if (sit->size() > (unsigned int)chunksize) { + // Maybe end this chunk, begin next + if (olit->size() > (unsigned int)chunksize) { out.push_back(""); - sit++; + olit++; } } } @@ -422,33 +395,29 @@ bool plaintorich(const string& in, list& out, switch(*chariter) { case '\n': if (ateol < 2) { - *sit += "
\n"; + *olit += "
\n"; ateol++; } break; case '\r': break; - case '\007': // used as anchor char, strip other instances - break; case '<': ateol = 0; - *sit += "<"; + *olit += "<"; break; case '&': ateol = 0; - *sit += "&"; + *olit += "&"; break; default: // We don't change the eol status for whitespace, want a real line if (!(*chariter == ' ' || *chariter == '\t')) { ateol = 0; } - chariter.appendchartostring(*sit); + chariter.appendchartostring(*olit); } } - if (lastAnchor) - *lastAnchor = anchoridx - 1; -#if 0 +#if 1 { FILE *fp = fopen("/tmp/debugplaintorich", "a"); fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n"); diff --git a/src/qtgui/plaintorich.h b/src/qtgui/plaintorich.h index 895aa0ee..29ee4c93 100644 --- a/src/qtgui/plaintorich.h +++ b/src/qtgui/plaintorich.h @@ -16,42 +16,65 @@ */ #ifndef _PLAINTORICH_H_INCLUDED_ #define _PLAINTORICH_H_INCLUDED_ -/* @(#$Id: plaintorich.h,v 1.16 2007-11-15 18:05:32 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: plaintorich.h,v 1.17 2008-07-01 08:27:58 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include using std::list; using std::string; -// A data struct to hold words and groups of words to be highlighted +/// Holder for plaintorich() input data: words and groups of words to +/// be highlighted struct HiliteData { + // Single terms vector terms; + // NEAR and PHRASE elements vector > groups; - vector gslks; // group slacks (number of permitted non-matched words) + // Group slacks (number of permitted non-matched words). + // Parallel vector to the above 'groups' + vector gslks; }; -/** - * Transform plain text into qt rich text for the preview window. - * - * We escape characters like < or &, and add qt rich text tags to - * colorize the query terms. The latter is a quite complicated matter because - * of phrase/near searches. We treat all such searches as "near", not "phrase" - * - * @param in raw text out of internfile. - * @param out rich text output, divided in chunks (to help our caller - * avoid inserting half tags into textedit which doesnt like it) - * @param hdata terms and groups to be highlighted. These are - * lowercase and unaccented. - * @param noHeader if true don't output header (...) - * @param needBeacons Need to navigate highlighted terms, mark them,return last +/** + * A class for highlighting search results. Overridable methods allow + * for different styles */ -extern bool plaintorich(const string &in, list<string> &out, - const HiliteData& hdata, - bool noHeader, - int *needBeacons, - int chunksize = 50000 - ); +class PlainToRich { +public: + static const string snull; + virtual ~PlainToRich() {} + /** + * Transform plain text for highlighting search terms, ie in the + * preview window or result list entries. + * + * The actual tags used for highlighting and anchoring are + * determined by deriving from this class which handles the searching for + * terms and groups, but there is an assumption that the output will be + * html-like: we escape characters like < or & + * + * Finding the search terms is relatively complicated because of + * phrase/near searches, which need group highlights. As a matter + * of simplification, we handle "phrase" as "near", not filtering + * on word order. + * + * @param in raw text out of internfile. + * @param out rich text output, divided in chunks (to help our caller + * avoid inserting half tags into textedit which doesnt like it) + * @param hdata terms and groups to be highlighted. These are + * lowercase and unaccented. + * @param chunksize max size of chunks in output list + */ + virtual bool plaintorich(const string &in, list<string> &out, + const HiliteData& hdata, + int chunksize = 50000 + ); -extern string termAnchorName(int i); + /* Methods to ouput headers, highlighting and marking tags */ + virtual string header() {return snull;} + virtual string startMatch() {return snull;} + virtual string endMatch() {return snull;} + virtual string startAnchor(int) {return snull;} + virtual string endAnchor() {return snull;} +}; #endif /* _PLAINTORICH_H_INCLUDED_ */ diff --git a/src/qtgui/preview_w.cpp b/src/qtgui/preview_w.cpp index 6cb2f4e2..4c6c92cc 100644 --- a/src/qtgui/preview_w.cpp +++ b/src/qtgui/preview_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.34 2008-05-21 07:21:37 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.35 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -345,17 +345,17 @@ void Preview::doSearch(const QString &_text, bool next, bool reverse, return; if (reverse) { if (m_curAnchor == 1) - m_curAnchor = m_lastAnchor; + m_curAnchor = m_plaintorich.lastanchor; else m_curAnchor--; } else { - if (m_curAnchor == m_lastAnchor) + if (m_curAnchor == m_plaintorich.lastanchor) m_curAnchor = 1; else m_curAnchor++; } QString aname = - QString::fromUtf8(termAnchorName(m_curAnchor).c_str()); + QString::fromUtf8(m_plaintorich.termAnchorName(m_curAnchor).c_str()); edit->moveToAnchor(aname); return; } @@ -552,7 +552,7 @@ void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc, } LOGDEB(("Doc.url: [%s]\n", doc.url.c_str())); string url; - printableUrl(doc.url, url); + printableUrl(rclconfig->getDefCharset(), doc.url, url); string tiptxt = url + string("\n"); tiptxt += doc.mimetype + " " + string(datebuf) + "\n"; if (meta_it != doc.meta.end() && !meta_it->second.empty()) @@ -670,7 +670,7 @@ class LoadThread : public QThread { *statusp = -1; return; } - + FileInterner interner(filename, &st, rclconfig, tmpdir, mtype); try { FileInterner::Status ret = interner.internfile(*out, ipath); @@ -699,11 +699,11 @@ class ToRichThread : public QThread { const HiliteData &hdata; list<string> &out; int loglevel; - int *lastanchor; + PlainToRichQtPreview& ptr; public: ToRichThread(string &i, const HiliteData& hd, list<string> &o, - int *lsta) - : in(i), hdata(hd), out(o), lastanchor(lsta) + PlainToRichQtPreview& _ptr) + : in(i), hdata(hd), out(o), ptr(_ptr) { loglevel = DebugLog::getdbl()->getlevel(); } @@ -711,7 +711,7 @@ class ToRichThread : public QThread { { DebugLog::getdbl()->setloglevel(loglevel); try { - plaintorich(in, out, hdata, false, lastanchor); + ptr.plaintorich(in, out, hdata); } catch (CancelExcept) { } } @@ -828,7 +828,7 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, progress.setLabelText(tr("Creating preview text")); qApp->processEvents(); list<string> richlst; - ToRichThread rthr(fdoc.text, m_hData, richlst, &m_lastAnchor); + ToRichThread rthr(fdoc.text, m_hData, richlst, m_plaintorich); rthr.start(); for (;;prog++) { @@ -911,7 +911,7 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, progress.close(); - m_haveAnchors = m_lastAnchor != 0; + m_haveAnchors = m_plaintorich.lastanchor != 0; if (searchTextLine->text().length() != 0) { // If there is a current search string, perform the search m_canBeep = true; @@ -919,7 +919,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, } else { // Position to the first query term if (m_haveAnchors) { - QString aname = QString::fromUtf8(termAnchorName(1).c_str()); + QString aname = + QString::fromUtf8(m_plaintorich.termAnchorName(1).c_str()); LOGDEB2(("Call movetoanchor(%s)\n", (const char *)aname.utf8())); editor->moveToAnchor(aname); m_curAnchor = 1; diff --git a/src/qtgui/preview_w.h b/src/qtgui/preview_w.h index 864aab38..7e4c230b 100644 --- a/src/qtgui/preview_w.h +++ b/src/qtgui/preview_w.h @@ -1,6 +1,6 @@ #ifndef _PREVIEW_W_H_INCLUDED_ #define _PREVIEW_W_H_INCLUDED_ -/* @(#$Id: preview_w.h,v 1.17 2007-11-15 18:34:49 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: preview_w.h,v 1.18 2008-07-01 08:27:58 dockes Exp $ (C) 2006 J.F.Dockes */ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -65,6 +65,33 @@ class TabData { {} }; +// Subclass plainToRich to add <termtag>s and anchors to the preview text +class PlainToRichQtPreview : public PlainToRich { +public: + int lastanchor; + PlainToRichQtPreview() { + lastanchor = 0; + } + virtual ~PlainToRichQtPreview() {} + virtual string header() { + return string("<qt><head><title>

"); + } + virtual string startMatch() {return string("");} + virtual string endMatch() {return string("");} + virtual string termAnchorName(int i) { + static const char *termAnchorNameBase = "TRM"; + char acname[sizeof(termAnchorNameBase) + 20]; + sprintf(acname, "%s%d", termAnchorNameBase, i); + if (i > lastanchor) + lastanchor = i; + return string(acname); + } + + virtual string startAnchor(int i) { + return string(""; + } +}; + class Preview : public QWidget { Q_OBJECT @@ -116,6 +143,7 @@ private: QWidget *m_currentW; HiliteData m_hData; bool m_justCreated; // First tab create is different + PlainToRichQtPreview m_plaintorich; bool m_haveAnchors; // Search terms are marked in text int m_lastAnchor; // Number of last anchor. Then rewind to 1 int m_curAnchor; diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index 07c71d2a..6bd2d822 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: reslist.cpp,v 1.39 2008-05-21 07:21:37 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: reslist.cpp,v 1.40 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -49,6 +49,14 @@ static char rcsid[] = "@(#$Id: reslist.cpp,v 1.39 2008-05-21 07:21:37 dockes Exp #define MIN(A,B) ((A) < (B) ? (A) : (B)) #endif + +class PlainToRichQtReslist : public PlainToRich { +public: + virtual ~PlainToRichQtReslist() {} + virtual string startMatch() {return string("");} + virtual string endMatch() {return string("");} +}; + ResList::ResList(QWidget* parent, const char* name) : QTEXTBROWSER(parent, name) { @@ -430,7 +438,7 @@ void ResList::resultPageNext() // Printable url: either utf-8 if transcoding succeeds, or url-encoded string url; - printableUrl(doc.url, url); + printableUrl(rclconfig->getDefCharset(), doc.url, url); // Make title out of file name if none yet if (doc.meta["title"].empty()) { @@ -480,7 +488,8 @@ void ResList::resultPageNext() } // No need to call escapeHtml(), plaintorich handles it list lr; - plaintorich(abstract, lr, hdata, true, 0, 100000); + PlainToRichQtReslist ptr; + ptr.plaintorich(abstract, lr, hdata); string richabst = lr.front(); // Links;