add option to preview html instead of plain text

2008-10-03 08:09:36 +00:00 · 2008-10-03 08:09:36 +00:00 · 2c27cbb504
commit 2c27cbb504
parent 31b841de7b
9 changed files with 155 additions and 69 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.58 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.59 2008-10-03 08:09:35 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -509,7 +509,7 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
 	for (list<string>::const_iterator it = sl.begin(); 
 	     it != sl.end(); it++) {
 	    string fld = fieldCanon(stringtolower(*it));
-	    LOGDEB(("Inserting [%s] in stored list\n", fld.c_str()));
+	    LOGDEB0(("Inserting [%s] in stored list\n", fld.c_str()));
 	    m_storedFields.insert(fld);
 	}
    }
--- a/src/qtgui/guiutils.cpp
+++ b/src/qtgui/guiutils.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.38 2008-09-28 14:20:50 dockes Exp $ (C) 2005 Jean-Francois Dockes";
+static char rcsid[] = "@(#$Id: guiutils.cpp,v 1.39 2008-10-03 08:09:35 dockes Exp $ (C) 2005 Jean-Francois Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -147,6 +147,8 @@ void rwSettings(bool writing)
 	       "/Recoll/prefs/startWithAdvSearchOpen", Bool, false);
    SETTING_RW(prefs.startWithSortToolOpen, 
 	       "/Recoll/prefs/startWithSortToolOpen", Bool, false);
+    SETTING_RW(prefs.previewHtml, 
+	       "/Recoll/prefs/previewHtml", Bool, true);

    QString advSearchClauses;
    QString ascdflt;
--- a/src/qtgui/guiutils.h
+++ b/src/qtgui/guiutils.h
@ -17,7 +17,7 @@
 #ifndef _GUIUTILS_H_INCLUDED_
 #define _GUIUTILS_H_INCLUDED_
 /* 
- * @(#$Id: guiutils.h,v 1.28 2008-09-28 14:20:50 dockes Exp $  (C) 2005 Jean-Francois Dockes 
+ * @(#$Id: guiutils.h,v 1.29 2008-10-03 08:09:35 dockes Exp $  (C) 2005 Jean-Francois Dockes 
 *                         jean-francois.dockes@wanadoo.fr
 *
 *   This program is free software; you can redistribute it and/or modify
@ -81,6 +81,7 @@ class PrefsPack {
    bool queryReplaceAbstract;
    bool startWithAdvSearchOpen;
    bool startWithSortToolOpen;
+    bool previewHtml;
    // Extra query indexes. This are encoded to base64 before storing
    // to the qt settings file to avoid any bin string/ charset conv issues
    list<string> allExtraDbs;
--- a/src/qtgui/plaintorich.cpp
+++ b/src/qtgui/plaintorich.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.32 2008-07-04 09:29:50 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.33 2008-10-03 08:09:35 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -296,10 +296,12 @@ bool myTextSplitCB::matchGroups()
 // duplicate whitespace etc...). This was tricky business, dependant
 // on qtextedit internals, and we don't do it any more, so we finally
 // don't know the term par/car positions in the editor text.  
-// Instead, we mark the search term positions either with html anchor
-// (qt currently has problems with them), or a special string, and the
-// caller will use the editor's find() function to position on it
-bool PlainToRich::plaintorich(const string& in, list<string>& out, 
+// Instead, we now mark the search term positions with html anchors
+//
+// We output the result in chunks, arranging not to cut in the middle of
+// a tag, which would confuse qtextedit.
+bool PlainToRich::plaintorich(const string& in, 
+			      list<string>& out, // Output chunk list
 			      const HiliteData& hdata,
 			      int chunksize)
 {
@ -323,16 +325,17 @@ bool PlainToRich::plaintorich(const string& in, list<string>& out,
 	LOGDEB0(("  %s", sterms.c_str()));
    }

-    // We first use the text splitter to break the text into words,
-    // and compare the words to the search terms, which yields the
-    // query terms positions inside the text
+    // Compute the positions for the query terms.  We use the text
+    // splitter to break the text into words, and compare the words to
+    // the search terms.
    myTextSplitCB cb(terms, groups, slacks);
    TextSplit splitter(&cb);
-    // Note that splitter returns the term locations in byte, not
-    // character offset
+    // Note: the splitter returns the term locations in byte, not
+    // character, offsets.
    splitter.text_to_words(in);
    LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));

+    // Compute the positions for NEAR and PHRASE groups.
    cb.matchGroups();

    out.clear();
@ -346,7 +349,7 @@ bool PlainToRich::plaintorich(const string& in, list<string>& out,
    // output highlight tags and to compute term positions in the
    // output text
    vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
-    vector<pair<int, int> >::iterator tboffsend = cb.tboffs.end();
+    vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();

 #if 0
    for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
@ -357,12 +360,21 @@ bool PlainToRich::plaintorich(const string& in, list<string>& out,

    // Input character iterator
    Utf8Iter chariter(in);
-    // State variable used to limitate the number of consecutive empty lines 
+    // State variable used to limit the number of consecutive empty lines 
    int ateol = 0;

    // Value for numbered anchors at each term match
    int anchoridx = 1;
-
+    // html state
+    bool intag = false, inparamvalue = false;
+    unsigned int headend = 0;
+    if (m_inputhtml) {
+	headend = in.find("</head>");
+	if (headend == string::npos)
+	    headend = in.find("</HEAD>");
+	if (headend != string::npos)
+	    headend += 7;
+    }
    for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
 	// Check from time to time if we need to stop
 	if ((pos & 0xfff) == 0) {
@ -371,51 +383,77 @@ bool PlainToRich::plaintorich(const string& in, list<string>& out,

 	// If we still have terms positions, check (byte) position. If
 	// we are at or after a term match, mark.
-	if (tPosIt != tboffsend) {
+	if (tPosIt != tPosEnd) {
 	    int ibyteidx = chariter.getBpos();
 	    if (ibyteidx == tPosIt->first) {
-		*olit += startAnchor(anchoridx++);
-		*olit += startMatch();
+		if (!intag && ibyteidx > headend) {
+		    *olit += startAnchor(anchoridx);
+		    *olit += startMatch();
+		}
+		anchoridx++;
 	    } else if (ibyteidx == tPosIt->second) {
-		// Output end tag, then skip all highlight areas that
-		// would overlap this one
-		*olit += endMatch();
-		*olit += endAnchor();
+		// Output end of match region tags
+		if (!intag && ibyteidx > headend) {
+		    *olit += endMatch();
+		    *olit += endAnchor();
+		}
+		// Skip all highlight areas that would overlap this one
 		int crend = tPosIt->second;
 		while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
 		    tPosIt++;
-		// Maybe end this chunk, begin next
-		if (olit->size() > (unsigned int)chunksize) {
+
+		// Maybe end this chunk, begin next. Don't do it on html:
+		// there is just no way to do it right (qtextedit can't grok
+		// chunks cut in the middle of <a></a> for example).
+		if (!m_inputhtml && olit->size() > (unsigned int)chunksize) {
 		    out.push_back("");
 		    olit++;
 		}
 	    }
 	}

-	switch(*chariter) {
-	case '\n':
-	    if (ateol < 2) {
-		*olit += "<br>\n";
-		ateol++;
-	    }
-	    break;
-	case '\r': 
-	    break;
-	case '<':
-	    ateol = 0;
-	    *olit += "&lt;";
-	    break;
-	case '&':
-	    ateol = 0;
-	    *olit += "&amp;";
-	    break;
-	default:
-	    // We don't change the eol status for whitespace, want a real line
-	    if (!(*chariter == ' ' || *chariter == '\t')) {
-		ateol = 0;
+	if (m_inputhtml) {
+	    switch (*chariter) {
+	    case '<':
+		if (!inparamvalue)
+		    intag = true;
+		break;
+	    case '>':
+		if (!inparamvalue)
+		    intag = false;
+		break;
+	    case '"':
+		if (intag) {
+		    inparamvalue = !inparamvalue;
+		}
+		break;
 	    }
 	    chariter.appendchartostring(*olit);
-	}
+	} else switch (*chariter) {
+	    case '\n':
+		if (ateol < 2) {
+		    *olit += "<br>\n";
+		    ateol++;
+		}
+		break;
+	    case '\r': 
+		break;
+	    case '<':
+		ateol = 0;
+		*olit += "&lt;";
+		break;
+	    case '&':
+		ateol = 0;
+		*olit += "&amp;";
+		break;
+	    default:
+		// We don't change the eol status for whitespace, want
+		// a real line
+		if (!(*chariter == ' ' || *chariter == '\t')) {
+		    ateol = 0;
+		}
+		chariter.appendchartostring(*olit);
+	    }
    }
 #if 0
    {
--- a/src/qtgui/plaintorich.h
+++ b/src/qtgui/plaintorich.h
@ -16,7 +16,7 @@
 */
 #ifndef _PLAINTORICH_H_INCLUDED_
 #define _PLAINTORICH_H_INCLUDED_
-/* @(#$Id: plaintorich.h,v 1.17 2008-07-01 08:27:58 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: plaintorich.h,v 1.18 2008-10-03 08:09:35 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #include <list>
@ -37,12 +37,15 @@ struct HiliteData {

 /** 
 * A class for highlighting search results. Overridable methods allow
- * for different styles
+ * for different styles. We can handle plain text or html input. In the latter
+ * case, we may fail to highlight term groups if they are mixed with html tags.
 */
 class PlainToRich {
 public:
-    static const string snull;
+    PlainToRich(bool inputhtml = false) : m_inputhtml(inputhtml) {}
    virtual ~PlainToRich() {}
+    void set_inputhtml(bool v) {m_inputhtml = v;}
+
    /**
     * Transform plain text for highlighting search terms, ie in the
     * preview window or result list entries.
@ -75,6 +78,10 @@ public:
    virtual string endMatch() {return snull;}
    virtual string startAnchor(int) {return snull;}
    virtual string endAnchor() {return snull;}
+
+protected:
+    static const string snull;
+    bool m_inputhtml;
 };

 #endif /* _PLAINTORICH_H_INCLUDED_ */
--- a/src/qtgui/preview_w.cpp
+++ b/src/qtgui/preview_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.36 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.37 2008-10-03 08:09:35 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -672,6 +672,10 @@ class LoadThread : public QThread {
 	}
 	
 	FileInterner interner(filename, &st, rclconfig, tmpdir, mtype);
+	// We don't set the interner's target mtype to html because we
+	// do want the html filter to do its work: we won't use the
+	// text, but we need the conversion to utf-8
+	// interner.setTargetMType("text/html");
 	try {
 	    FileInterner::Status ret = interner.internfile(*out, ipath);
 	    if (ret == FileInterner::FIDone || ret == FileInterner::FIAgain) {
@ -682,6 +686,10 @@ class LoadThread : public QThread {
 		// a mysterious error. Happens when the file name matches a
 		// a search term of course.
 		*statusp = 0;
+		if (prefs.previewHtml && !interner.get_html().empty()) {
+		    out->text = interner.get_html();
+		    out->mimetype = "text/html";
+		}
 	    } else {
 		out->mimetype = interner.getMimetype();
 		interner.getMissingExternal(missing);
@ -820,13 +828,20 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
    // somehow slipped through previous processing.
    bool highlightTerms = fdoc.text.length() < 
 	(unsigned long)prefs.maxhltextmbs * 1024 * 1024;
+
    // Final text is produced in chunks so that we can display the top
    // while still inserting at bottom
    list<QString> qrichlst;
-
+    bool inputishtml = !fdoc.mimetype.compare("text/html");
    if (highlightTerms) {
 	progress.setLabelText(tr("Creating preview text"));
 	qApp->processEvents();
+	if (inputishtml) {
+	    LOGDEB(("Preview: got html %s\n", fdoc.text.c_str()));
+	    m_plaintorich.set_inputhtml(true);
+	} else {
+	    m_plaintorich.set_inputhtml(false);
+	}
 	list<string> richlst;
 	ToRichThread rthr(fdoc.text, m_hData, richlst, m_plaintorich);
 	rthr.start();
@ -855,23 +870,29 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
 		richlst.back() += "<b>Cancelled !</b>";
 	    }
 	}
-	// Convert to QString list
+	// Convert C++ string list to QString list
 	for (list<string>::iterator it = richlst.begin(); 
 	     it != richlst.end(); it++) {
 	    qrichlst.push_back(QString::fromUtf8(it->c_str(), it->length()));
 	}
    } else {
-	// No plaintorich() call.
-	// In this case, the text will no be identified as
-	// richtxt/html (no <html> or <qt> etc. at the beginning), and
-	// there is no need to escape special characters.
-	// Also we need to split in chunks (so that the top is displayed faster),
-	// and we must do it on a QString (to avoid utf8 issues).
+	LOGDEB(("Preview: no hilighting\n"));
+	// No plaintorich() call.  In this case, either the text is
+	// html and the html quoting is hopefully correct, or it's
+	// plain-text and there is no need to escape special
+	// characters. We'd still want to split in chunks (so that the
+	// top is displayed faster), but we must not cut tags, and
+	// it's too difficult on html. For text we do the splitting on
+	// a QString to avoid utf8 issues.
 	QString qr = QString::fromUtf8(fdoc.text.c_str(), fdoc.text.length());
 	int l = 0;
-	for (int pos = 0; pos < (int)qr.length(); pos += l) {
-	    l = MIN(CHUNKL, qr.length() - pos);
-	    qrichlst.push_back(qr.mid(pos, l));
+	if (inputishtml) {
+	    qrichlst.push_back(qr);
+	} else {
+	    for (int pos = 0; pos < (int)qr.length(); pos += l) {
+		l = MIN(CHUNKL, qr.length() - pos);
+		qrichlst.push_back(qr.mid(pos, l));
+	    }
 	}
    }
 	    
@ -895,7 +916,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
 	qApp->processEvents();

 	editor->append(*it);
-
+	LOGDEB(("Preview:: loaded: [%s]\n", 
+		string((const char *)(*it).utf8()).c_str()));
 	// Stay at top
 	if (instep < 5) {
 	    editor->setCursorPosition(0,0);
--- a/src/qtgui/preview_w.h
+++ b/src/qtgui/preview_w.h
@ -1,6 +1,6 @@
 #ifndef _PREVIEW_W_H_INCLUDED_
 #define _PREVIEW_W_H_INCLUDED_
-/* @(#$Id: preview_w.h,v 1.18 2008-07-01 08:27:58 dockes Exp $  (C) 2006 J.F.Dockes */
+/* @(#$Id: preview_w.h,v 1.19 2008-10-03 08:09:35 dockes Exp $  (C) 2006 J.F.Dockes */
 /*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -69,12 +69,16 @@ class TabData {
 class PlainToRichQtPreview : public PlainToRich {
 public:
    int lastanchor;
-    PlainToRichQtPreview() {
+    PlainToRichQtPreview(bool inputhtml = false) : PlainToRich(inputhtml) {
 	lastanchor = 0;
    }    
    virtual ~PlainToRichQtPreview() {}
    virtual string header() {
-	return string("<qt><head><title></title></head><body><p>");
+	if (m_inputhtml) {
+	    return snull;
+	} else {
+	    return string("<qt><head><title></title></head><body><p>");
+	}
    }
    virtual string startMatch() {return string("<termtag>");}
    virtual string endMatch() {return string("</termtag>");}
--- a/src/qtgui/uiprefs.ui
+++ b/src/qtgui/uiprefs.ui
@ -388,6 +388,17 @@
                                            <bool>false</bool>
                                        </property>
                                    </widget>
+                                    <widget class="QCheckBox">
+                                        <property name="name">
+                                            <cstring>previewHtmlCB</cstring>
+                                        </property>
+                                        <property name="text">
+                                            <string>Prefer Html to plain text for preview.</string>
+                                        </property>
+                                        <property name="checked">
+                                            <bool>false</bool>
+                                        </property>
+                                    </widget>
                                </vbox>
                            </widget>
                        </vbox>
--- a/src/qtgui/uiprefs_w.cpp
+++ b/src/qtgui/uiprefs_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.25 2008-07-28 08:42:52 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.26 2008-10-03 08:09:36 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -106,7 +106,7 @@ void UIPrefsDialog::setFromPrefs()
    initStartSortCB->setChecked(prefs.startWithSortToolOpen);
    useDesktopOpenCB->setChecked(prefs.useDesktopOpen);
    keepSortCB->setChecked(prefs.keepSort);
-
+    previewHtmlCB->setChecked(prefs.previewHtml);
    // Query terms color
    qtermColorLE->setText(prefs.qtermcolor);
    
@ -209,6 +209,7 @@ void UIPrefsDialog::accept()
    prefs.startWithSortToolOpen = initStartSortCB->isChecked();
    prefs.useDesktopOpen = useDesktopOpenCB->isChecked();
    prefs.keepSort = keepSortCB->isChecked();
+    prefs.previewHtml = previewHtmlCB->isChecked();

    prefs.syntAbsLen = syntlenSB->value();
    prefs.syntAbsCtx = syntctxSB->value();