simple term highlighting in query preview

2005-02-07 13:17:47 +00:00 · 2005-02-07 13:17:47 +00:00 · 2a020407da
commit 2a020407da
parent 74434a3b02
5 changed files with 160 additions and 73 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT

@ -7,6 +7,7 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Ex
 #include <string>

 #include "textsplit.h"
+#include "debuglog.h"

 using namespace std;

@ -57,9 +58,12 @@ static void setcharclasses()
    init = 1;
 }

-bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
+bool TextSplit::emitterm(string &w, int pos, bool doerase,
+			 int btstart, int btend)
 {
-    if (!termsink)
+    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
+    
+    if (!cb)
 	return false;

    // Maybe trim end of word. These are chars that we would keep inside 
@ -77,7 +81,7 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
    }
 breakloop:
    if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
-	bool ret = termsink(cdata, w, pos);
+	bool ret = cb->takeword(w, pos, btstart, btend);
 	if (doerase)
 	    w.erase();
 	return ret;
@ -92,14 +96,16 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
 */
 bool TextSplit::text_to_words(const string &in)
 {
+    LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb));
    setcharclasses();
    string span;
    string word;
    bool number = false;
    int wordpos = 0;
    int spanpos = 0;
+    unsigned int i;

-    for (unsigned int i = 0; i < in.length(); i++) {
+    for (i = 0; i < in.length(); i++) {
 	int c = in[i];
 	int cc = charclasses[c]; 
 	switch (cc) {
@ -107,10 +113,10 @@ bool TextSplit::text_to_words(const string &in)
 	SPACE:
 	    if (word.length()) {
 		if (span.length() != word.length()) {
-		    if (!emitterm(span, spanpos)) 
+		    if (!emitterm(span, spanpos, true, i-span.length(), i)) 
 			return false;
 		}
-		if (!emitterm(word, wordpos++))
+		if (!emitterm(word, wordpos++, true, i-word.length(), i))
 		    return false;
 		number = false;
 	    }
@ -127,10 +133,10 @@ bool TextSplit::text_to_words(const string &in)
 		}
 	    } else {
 		if (span.length() != word.length()) {
-		    if (!emitterm(span, spanpos, false))
+		    if (!emitterm(span, spanpos, false, i-span.length(), i))
 			return false;
 		}
-		if (!emitterm(word, wordpos++))
+		if (!emitterm(word, wordpos++, true, i-word.length(), i))
 		    return false;
 		number = false;
 		span += c;
@ -140,10 +146,10 @@ bool TextSplit::text_to_words(const string &in)
 	case '@':
 	    if (word.length()) {
 		if (span.length() != word.length()) {
-		    if (!emitterm(span, spanpos, false))
+		    if (!emitterm(span, spanpos, false, i-span.length(), i))
 			return false;
 		}
-		if (!emitterm(word, wordpos++))
+		if (!emitterm(word, wordpos++, true, i-word.length(), i))
 		    return false;
 		number = false;
 	    } else
@ -155,7 +161,7 @@ bool TextSplit::text_to_words(const string &in)
 		word += c;
 	    } else {
 		if (word.length()) {
-		    if (!emitterm(word, wordpos++))
+		    if (!emitterm(word, wordpos++, true, i-word.length(), i))
 			return false;
 		    number = false;
 		} else 
@ -202,9 +208,9 @@ bool TextSplit::text_to_words(const string &in)
    }
    if (word.length()) {
 	if (span.length() != word.length())
-	    if (!emitterm(span, spanpos))
+	    if (!emitterm(span, spanpos, true, i-span.length(), i))
 		return false;
-	return emitterm(word, wordpos);
+	return emitterm(word, wordpos, true, i-word.length(), i);
    }
    return true;
 }
@ -222,12 +228,14 @@ bool TextSplit::text_to_words(const string &in)

 using namespace std;

-bool termsink(void *, const string &term, int pos)
-{
-    cout << pos << " " << term << endl;
-    return true;
-}
-
+// Test callback: print each term with its position and byte offsets
+class mySplitterCB : public TextSplitCB {
+ public:
+    bool takeword(const std::string &term, int pos, int bs, int be) {
+	cout << pos << " " << term << " bs " << bs << " be " << be << endl;
+	return true;
+    }
+};

 static string teststring = 
    "jfd@okyz.com "
@ -241,7 +249,8 @@ static string teststring =

 int main(int argc, char **argv)
 {
-    TextSplit splitter(termsink, 0);
+    mySplitterCB cb;
+    TextSplit splitter(&cb);
    if (argc == 2) {
 	string data;
 	if (!file_to_string(argv[1], data)) 
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -1,9 +1,20 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.3 2005-01-24 13:17:58 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

+// Callback interface: takeword() is called for every detected word
+class TextSplitCB {
+ public:
+    virtual ~TextSplitCB() {}
+    virtual bool takeword(const std::string& term, 
+			  int pos,  // term pos
+			  int bts,      // byte offset of first char in term
+			  int bte      // byte offset of first char after term
+			  ) = 0; 
+};
+
 /** 
 * Split text into words. 
 * See comments at top of .cpp for more explanations.
@ -11,19 +22,14 @@
 * but 'ts much simpler this way...
 */
 class TextSplit {
- public:
-    typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
- private:
-    TermSink termsink;
-    void *cdata;
+    TextSplitCB *cb;
    int maxWordLength;
-    bool emitterm(std::string &term, int pos, bool doerase);
+    bool emitterm(std::string &term, int pos, bool doerase, int, int);
 public:
    /**
     * Constructor: just store callback and client data
     */
-    TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40)
-    {}
+    TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
    /**
     * Split text, emit words and positions.
     */
--- a/src/qtgui/recollmain.ui.h
+++ b/src/qtgui/recollmain.ui.h
@ -15,9 +15,13 @@
 #include <unistd.h>
 #include <fcntl.h>

+#include <utility>
+using std::pair;
+
 #include <qmessagebox.h>
 #include <qcstring.h>

+
 #include "rcldb.h"
 #include "rclconfig.h"
 #include "debuglog.h"
@ -25,10 +29,12 @@
 #include "pathut.h"
 #include "recoll.h"
 #include "internfile.h"
+#include "textsplit.h"
+#include "smallut.h"

 void RecollMain::fileExit()
 {
-    LOGDEB(("RecollMain: fileExit\n"));
+    LOGDEB1(("RecollMain: fileExit\n"));
    exit(0);
 }

@ -52,17 +58,66 @@ void RecollMain::fileStart_IndexingAction_activated()
 	startindexing = 1;
 }

-static string plaintorich(const string &in)
+// Text splitter callback which records the byte offsets of the query terms
+// found inside the text. These are then used to insert highlight tags.
+class myTextSplitCB : public TextSplitCB {
+ public:
+    list<pair<int, int> > tboffs;
+    const list<string> *terms;
+    myTextSplitCB(const list<string>& terms) : terms(&terms) {}
+    virtual bool takeword(const std::string& term, int, int bts,  int bte) {
+	for (list<string>::const_iterator it = terms->begin(); 
+	     it != terms->end(); it++) {
+	    if (!stringlowercmp(*it, term)) {
+		tboffs.push_back(pair<int, int>(bts, bte));
+		break;
+	    }
+	}
+	return true;
+    }
+};
+
+static string plaintorich(const string &in, const list<string>& terms,
+			  list<pair<int, int> >&termoffsets)
 {
+#if 0
+    {string t;
+	for (list<string>::const_iterator it = terms.begin();it != terms.end();it++) 
+	    t += "'" + *it + "' ";
+	LOGDEB(("plaintorich: term: %s\n", t.c_str()));
+    }
+#endif
+    myTextSplitCB cb(terms);
+    TextSplit splitter(&cb);
+    splitter.text_to_words(in);
+    string out1;
+    if (cb.tboffs.empty()) {
+	out1 = in;
+    } else { 
+	list<pair<int, int> >::iterator it = cb.tboffs.begin();
+	for (unsigned int i = 0; i < in.length() ; i++) {
+	    if (it != cb.tboffs.end()) {
+		if (i == (unsigned int)it->first) {
+		    out1 += "<termtag>";
+		} else if (i == (unsigned int)it->second) {
+		    if (it != cb.tboffs.end())
+			it++;
+		    out1 += "</termtag>";
+		}
+	    }
+	    out1 += in[i];
+	}
+    }
    string out = "<qt><head><title></title></head><body><p>";
-    for (unsigned int i = 0; i < in.length() ; i++) {
-	if (in[i] == '\n') {
+    for (string::const_iterator it = out1.begin();it != out1.end(); it++) {
+	if (*it == '\n') {
 	    out += "<br>";
 	    //	    out += '\n';
 	} else {
-	    out += in[i];
+	    out += *it;
 	}
    }
+    termoffsets = cb.tboffs;
    return out;
 }

@ -137,7 +192,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
    int reldocnum = par - 1;
    reslist_current = reldocnum;
    previewTextEdit->clear();
-    LOGDEB(("Cleared preview\n"));
+
    if (!rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) {
 	QMessageBox::warning(0, "Recoll",
 			     QString("Can't retrieve document from database"));
@ -154,26 +209,28 @@ void RecollMain::reslistTE_clicked(int par, int car)
 			     doc.mimetype.c_str());
 	return;
    }
+    list<string> terms;
+    rcldb->getQueryTerms(terms);
+    list<pair<int, int> > termoffsets;
+    string rich = plaintorich(fdoc.text, terms, termoffsets);

-    string rich = plaintorich(fdoc.text);
-
-#if 0
-    //Highlighting; pass a list of (search term, style name) to plaintorich
-    // and create the corresponding styles with different colors here
-    // We need to :
-    //  - Break the query into terms : wait for the query analyzer
-    //  - Break the text into words. This should use a version of 
-    //    textsplit with an option to keep the punctuation (see how to do
-    //    this). We do want the same splitter code to be used here and 
-    //    when indexing.
    QStyleSheetItem *item = 
-	new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" );
-    item->setColor("red");
+	new QStyleSheetItem( previewTextEdit->styleSheet(), "termtag" );
+    item->setColor("blue");
    item->setFontWeight(QFont::Bold);
-#endif

    QString str = QString::fromUtf8(rich.c_str(), rich.length());
    previewTextEdit->setText(str);
+    int para = 0, index = 1;
+    if (!termoffsets.empty()) {
+	index = (termoffsets.begin())->first;
+	LOGDEB1(("Setting cursor position to para %d, index %d\n",para,index));
+	previewTextEdit->setCursorPosition(0, index);
+    }
+    previewTextEdit->ensureCursorVisible();
+    previewTextEdit->getCursorPosition(&para, &index);
+    LOGDEB1(("PREVIEW Paragraphs: %d. Cpos: %d %d\n", 
+	    previewTextEdit->paragraphs(), para, index));
 }


@ -181,7 +238,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
 // first page of results
 void RecollMain::queryText_returnPressed()
 {
-    LOGDEB(("RecollMain::queryText_returnPressed()\n"));
+    LOGDEB1(("RecollMain::queryText_returnPressed()\n"));
    if (!rcldb->isopen()) {
 	string dbdir;
 	if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) {
@ -206,6 +263,7 @@ void RecollMain::queryText_returnPressed()

    if (!rcldb->setQuery(string((const char *)u8)))
 	return;
+    list<string> terms;
    listNextPB_clicked();
 }

@ -234,7 +292,7 @@ void RecollMain::listPrevPB_clicked()
 // Fill up result list window with next screen of hits
 void RecollMain::listNextPB_clicked()
 {
-    LOGDEB(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));
+    LOGDEB1(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));

    if (reslist_winfirst < 0)
 	reslist_winfirst = 0;
@ -284,7 +342,7 @@ void RecollMain::listNextPB_clicked()
 	    struct tm *tm = localtime(&mtime);
 	    strftime(datebuf, 99, "<i>Modified:</i>&nbsp;%F&nbsp;%T", tm);
 	}
-	LOGDEB(("Abstract: %s\n", doc.abstract.c_str()));
+	LOGDEB1(("Abstract: %s\n", doc.abstract.c_str()));
 	string result = "<p>" + 
 	    string(perbuf) + " <b>" + doc.title + "</b><br>" +
 	    doc.mimetype + "&nbsp;" +
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.18 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -171,20 +171,19 @@ bool Rcl::Db::isopen()
 }

 // A small class to hold state while splitting text
-class wsData {
+class mySplitterCB : public TextSplitCB {
 public:
    Xapian::Document &doc;
    Xapian::termpos basepos; // Base for document section
    Xapian::termpos curpos;  // Last position sent to callback
-    wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
+    mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
    {}
+    bool takeword(const std::string &term, int pos, int, int);
 };

 // Callback for the document to word splitting class during indexation
-static bool splitCb(void *cdata, const std::string &term, int pos)
+bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
 {
-    wsData *data = (wsData*)cdata;
-
    // cerr << "splitCb: term " << term << endl;
    //string printable;
    //transcode(term, printable, "UTF-8", "ISO8859-1");
@ -193,8 +192,8 @@ static bool splitCb(void *cdata, const std::string &term, int pos)
    try {
 	// 1 is the value for wdfinc in index_text when called from omindex
 	// TOBEDONE: check what this is used for
-	data->curpos = pos;
-	data->doc.add_posting(term, data->basepos + data->curpos, 1);
+	curpos = pos;
+	doc.add_posting(term, basepos + curpos, 1);
    } catch (...) {
 	LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
 	return false;
@ -281,9 +280,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)

    Xapian::Document newdocument;

-    wsData splitData(newdocument);
+    mySplitterCB splitData(newdocument);

-    TextSplit splitter(splitCb, &splitData);
+    TextSplit splitter(&splitData);

    string noacc;
    if (!unac_cpp(doc.title, noacc)) {
@ -436,18 +435,16 @@ bool Rcl::Db::purge()

 #include <vector>

-class wsQData {
+class wsQData : public TextSplitCB {
 public:
    vector<string> terms;
+
+    bool takeword(const std::string &term, int , int, int) {
+	terms.push_back(term);
+	return true;
+    }
 };

-// Callback for the query-to-words splitting
-static bool splitQCb(void *cdata, const std::string &term, int )
-{
-    wsQData *data = (wsQData*)cdata;
-    data->terms.push_back(term);
-    return true;
-}

 bool Rcl::Db::setQuery(const std::string &querystring)
 {
@ -457,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
 	return false;

    wsQData splitData;
-    TextSplit splitter(splitQCb, &splitData);
+    TextSplit splitter(&splitData);

    string noacc;
    if (!dumb_string(querystring, noacc)) {
@ -475,6 +472,21 @@ bool Rcl::Db::setQuery(const std::string &querystring)
    return true;
 }

+bool Rcl::Db::getQueryTerms(list<string>& terms)
+{
+    Native *ndb = (Native *)pdata;
+    if (!ndb)
+	return false;
+
+    terms.clear();
+    Xapian::TermIterator it;
+    for (it = ndb->query.get_terms_begin(); it != ndb->query.get_terms_end();
+	 it++) {
+	terms.push_back(*it);
+    }
+    return true;
+}
+
 int Rcl::Db::getResCnt()
 {
    Native *ndb = (Native *)pdata;
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -1,8 +1,9 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.8 2005-01-31 14:31:09 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.9 2005-02-07 13:17:47 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
+#include <list>

 // rcldb defines an interface for a 'real' text database. The current 
 // implementation uses xapian only, and xapian-related code is in rcldb.cpp
@ -72,6 +73,7 @@ class Db {

    // Parse query string and initialize query
    bool setQuery(const std::string &q);
+    bool getQueryTerms(std::list<std::string>& terms);

    // Get document at rank i. This is probably vastly inferior to the type
    // of interface in Xapian, but we have to start with something simple