Remember searchData and use it in plaintorich for phrase/group highlighting

2006-11-17 10:09:07 +00:00 · 2006-11-17 10:09:07 +00:00 · a963035b93
commit a963035b93
parent a8e0fe31bd
7 changed files with 264 additions and 54 deletions
--- a/src/qtgui/plaintorich.cpp
+++ b/src/qtgui/plaintorich.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.13 2006-11-13 08:15:57 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.14 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -24,6 +24,9 @@ static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.13 2006-11-13 08:15:57 dockes
 #include <list>
 #include <set>
 #include <vector>
+#include <map>
+#include <algorithm>
+
 #ifndef NO_NAMESPACES
 using std::vector;
 using std::list;
@ -41,42 +44,218 @@ using std::set;
 #include "plaintorich.h"
 #include "cancelcheck.h"

+
+// Build a printable "[t1] [t2] ..." rendition of a term list, for use in
+// log messages.
+static string vecStringToString(const vector<string>& t)
+{
+    string out;
+    vector<string>::const_iterator cur = t.begin();
+    while (cur != t.end()) {
+	out += "[" + *cur + "] ";
+	++cur;
+    }
+    return out;
+}
+
 // Text splitter callback used to take note of the position of query terms 
-// inside the result text. This is then used to post highlight tags. 
+// inside the result text. This is then used to insert highlight tags. 
 class myTextSplitCB : public TextSplitCB {
 public:
-    // in: user query terms
+    // In: user query terms
    set<string>    terms; 
-    // Out: begin and end byte positions of query terms in text
-    vector<pair<int, int> > tboffs;  
+    // In: phrase/near term groups, their slacks, and the set of group terms
+    const vector<vector<string> >& m_groups;
+    const vector<int>& m_slacks;
+    set<string> gterms;
+
    // Out: first term found in text
    string firstTerm;
+    int firstTermPos;

-    myTextSplitCB(const list<string>& its) {
-	for (list<string>::const_iterator it = its.begin(); it != its.end();
-	     it++) {
-	    string s;
-	    Rcl::dumb_string(*it, s);
-	    terms.insert(s);
+    // Out: begin and end byte positions of query terms/groups in text
+    vector<pair<int, int> > tboffs;  
+
+    // group/near terms word positions.
+    map<string, vector<int> > m_plists;
+    map<int, pair<int, int> > m_gpostobytes;
+
+    myTextSplitCB(const vector<string>& its, vector<vector<string> >&groups, 
+		  vector<int>& slacks) : m_groups(groups), m_slacks(slacks)
+    {
+	for (vector<string>::const_iterator it = its.begin(); 
+	     it != its.end(); it++) {
+	    terms.insert(*it);
+	}
+	for (vector<vector<string> >::const_iterator vit = m_groups.begin(); 
+	     vit != m_groups.end(); vit++) {
+	    for (vector<string>::const_iterator it = (*vit).begin(); 
+		 it != (*vit).end(); it++) {
+		gterms.insert(*it);
+	    }
 	}
    }

    // Callback called by the text-to-words breaker for each word
-    virtual bool takeword(const std::string& term, int, int bts, int bte) {
+    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
 	string dumb;
 	Rcl::dumb_string(term, dumb);
 	//LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
 	// pos, bts, bte));
+
+	// Single search term highlighting: if this word is a search term,
+	// Note its byte-offset span. 
 	if (terms.find(dumb) != terms.end()) {
 	    tboffs.push_back(pair<int, int>(bts, bte));
-	    if (firstTerm.empty())
+	    if (firstTerm.empty()) {
 		firstTerm = term;
+		firstTermPos = pos;
+	    }
 	}
+	
+	if (gterms.find(dumb) != gterms.end()) {
+	    // Term group (phrase/near) handling
+	    m_plists[dumb].push_back(pos);
+	    m_gpostobytes[pos] = pair<int,int>(bts, bte);
+	    LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
+	}
+
 	CancelCheck::instance().checkCancel();
 	return true;
    }
+    virtual bool matchGroup(const vector<string>& terms, int dist);
+    virtual bool matchGroups();
 };

+// Code for checking for a NEAR match comes out of xapian phrasepostlist.cc
+/** Comparison functor ordering position lists by increasing length */
+class VecIntCmpShorter {
+public:
+    /** Return true if and only if a is strictly shorter than b.
+     *  Declared const so that the functor can also be called through a
+     *  const object or reference.
+     */
+    bool operator()(const vector<int> *a, const vector<int> *b) const {
+	return a->size() < b->size();
+    }
+};
+
+/** Recursive check for a NEAR/PHRASE match inside a window of positions.
+ *
+ * The lists before index i have already contributed positions spanning
+ * [min, max]. We look in plists[i] for a position which keeps the span
+ * within window positions, then recurse for the following lists.
+ *
+ * @param window maximum width of a match, in word positions
+ * @param plists term position lists, each assumed to be sorted in
+ *        increasing order
+ * @param i index of the list to examine
+ * @param min lowest position used so far
+ * @param max highest position used so far
+ * @param sp set to the lowest position of the match when returning true
+ * @param ep set to the highest position of the match when returning true
+ * @return true if a match was found. *sp and *ep are left untouched
+ *         otherwise.
+ */
+bool do_test(int window, vector<vector<int>* >& plists, 
+	     unsigned int i, int min, int max, int *sp, int *ep)
+{
+    // No list left to examine (ie: single term group): the current span
+    // is the match. This also avoids indexing past the end of plists.
+    if (i >= plists.size()) {
+	*sp = min;
+	*ep = max;
+	return true;
+    }
+
+    // Lowest position which could still share a window with max. Take
+    // care to avoid underflow
+    int winstart = max + 1;
+    if (window <= winstart) 
+	winstart -= window; 
+    else 
+	winstart = 0;
+
+    // Find 1st position not before window start
+    vector<int>::iterator it = plists[i]->begin();
+    while (it != plists[i]->end() && *it < winstart)
+	it++;
+
+    // Try each position inside window in turn for match with other lists
+    for (; it != plists[i]->end(); it++) {
+	int pos = *it;
+	if (pos > min + window - 1) 
+	    return false;
+	// Span extended with this candidate. Kept in locals so that a
+	// failed candidate does not alter the bounds used for the next one.
+	int nmin = pos < min ? pos : min;
+	int nmax = pos > max ? pos : max;
+	if (i + 1 == plists.size()) {
+	    // Last list: report the span including this last position
+	    *sp = nmin;
+	    *ep = nmax;
+	    return true;
+	}
+	if (do_test(window, plists, i + 1, nmin, nmax, sp, ep)) 
+	    return true;
+    }
+    return false;
+}
+
+// Check if there is a NEAR match for the group of terms. 
+// The search stops at the first match found: its byte span is appended
+// to tboffs if the byte offsets for both ends were recorded, and
+// firstTerm may be updated to help position the preview on the match.
+// @param terms the terms in the group
+// @param window maximum width of a match, in word positions
+// @return true if a match was found, false if some term has no position
+//    list, or if no set of positions fits inside the window.
+bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
+{
+    LOGDEB(("myTextSplitCB::matchGroup:d %d: %s\n", window,
+	    vecStringToString(terms).c_str()));
+
+    // An empty group can't match, and would have us access the first
+    // element of an empty array below
+    if (terms.empty())
+	return false;
+
+    vector<vector<int>* > plists;
+    // Check that each of the group terms has a position list
+    for (vector<string>::const_iterator tit = terms.begin(); 
+	 tit != terms.end(); tit++) {
+	map<string, vector<int> >::iterator pl;
+	if ((pl = m_plists.find(*tit)) == m_plists.end()) {
+	    LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
+		    (*tit).c_str()));
+	    return false;
+	}
+	plists.push_back(&(pl->second));
+    }
+
+    // Sort the positions lists so that the shorter is first
+    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
+
+    int sta = 0, sto = 0;
+    if (plists.size() == 1) {
+	// Single term group: the first occurrence is the match. There is
+	// no second list to hand over to do_test()
+	if (plists[0]->empty())
+	    return false;
+	sta = sto = plists[0]->front();
+    } else {
+	// Walk the shortest plist and look for matches
+	int pos;
+	vector<int>::iterator it = plists[0]->begin();
+	do {
+	    if (it == plists[0]->end())
+		return false;
+	    pos = *it++;
+	} while (!do_test(window, plists, 1, pos, pos, &sta, &sto));
+    }
+
+    LOGDEB(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto)); 
+
+    if (firstTerm.empty() || firstTermPos > sta) {
+	// firstTerm is used to try and position the preview window over
+	// the match. As it's difficult to divine byte/word positions,
+	// we use a string search. Try to use the shortest plist for
+	// this, which hopefully gives a better chance for the group
+	// to be found (it's hopeless to try and match the whole
+	// group)
+	// Start from the biggest possible value (the conversion of -1 to
+	// an unsigned type is well defined).
+	vector<int>::size_type minl = static_cast<vector<int>::size_type>(-1);
+	for (vector<string>::const_iterator tit = terms.begin(); 
+	     tit != terms.end(); tit++) {
+	    map<string, vector<int> >::iterator pl = m_plists.find(*tit);
+	    if (pl != m_plists.end() && pl->second.size() < minl) {
+		firstTerm = *tit;
+		LOGDEB(("Firstterm->%s\n", firstTerm.c_str()));
+		minl = pl->second.size();
+	    }
+	}
+	// Keep the position in sync with the term, so that the test
+	// above stays meaningful when the next groups are processed
+	firstTermPos = sta;
+    }
+
+    // Translate the word positions of the match into a byte region
+    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
+    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
+    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
+	LOGDEB(("myTextSplitCB::matchGroup: pushing %d %d\n",
+		i1->second.first, i2->second.second));
+	tboffs.push_back(pair<int, int>(i1->second.first, i2->second.second));
+    } else {
+	LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", 
+		sta, sto));
+    }
+    return true;
+}
+
+/** Comparison functor ordering pairs of ints by their first member */
+class PairIntCmpFirst {
+public:
+    /** Return true if and only if a.first is strictly less than b.first.
+     *  The second members are not compared.
+     */
+    bool operator()(const pair<int,int>& a, const pair<int,int>& b) const {
+	return a.first < b.first;
+    }
+};
+
+// Try to match each term group against the recorded positions, then
+// sort the highlight regions by starting offset. The window used for a
+// group is its slack plus its number of terms. Groups and slacks are
+// walked in parallel, stopping at the end of the shorter list.
+// Always returns true.
+bool myTextSplitCB::matchGroups()
+{
+    vector<int>::const_iterator slackit = m_slacks.begin();
+    for (vector<vector<string> >::const_iterator grpit = m_groups.begin();
+	 grpit != m_groups.end() && slackit != m_slacks.end(); 
+	 grpit++, slackit++) {
+	const vector<string>& group = *grpit;
+	matchGroup(group, *slackit + group.size());
+    }
+
+    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
+    return true;
+}
+
 // Fix result text for display inside the gui text window.
 //
 // To compute the term character positions in the output text, we used
@ -86,22 +265,41 @@ class myTextSplitCB : public TextSplitCB {
 // don't know the term par/car positions in the editor text.  Instead,
 // we return the first term encountered, and the caller will use the
 // editor's find() function to position on it
-bool plaintorich(const string& in, string& out, const list<string>& terms,
+bool plaintorich(const string& in, string& out, 
+		 RefCntr<Rcl::SearchData> sdata,
 		 string *firstTerm, bool noHeader)
 {
    Chrono chron;
-    LOGDEB(("plaintorich: terms: %s\n", 
-	    stringlistdisp(terms).c_str()));
    out.erase();
+    vector<string> terms;
+    vector<vector<string> > groups;
+    vector<int> slacks;
+
+    sdata->getTerms(terms, groups, slacks);
+
+    {
+	LOGDEB(("plaintorich: terms: \n"));
+	string sterms = vecStringToString(terms);
+	LOGDEB(("  %s\n", sterms.c_str()));
+	sterms = "\n";
+	LOGDEB(("plaintorich: groups: \n"));
+	for (vector<vector<string> >::iterator vit = groups.begin(); 
+	     vit != groups.end(); vit++) {
+	    sterms += vecStringToString(*vit);
+	    sterms += "\n";
+	}
+	LOGDEB(("  %s", sterms.c_str()));
+    }

    // We first use the text splitter to break the text into words,
    // and compare the words to the search terms, which yields the
    // query terms positions inside the text
-    myTextSplitCB cb(terms);
+    myTextSplitCB cb(terms, groups, slacks);
    TextSplit splitter(&cb, TextSplit::TXTS_ONLYSPANS);
    // Note that splitter returns the term locations in byte, not
    // character offset
    splitter.text_to_words(in);
+    cb.matchGroups();

    if (firstTerm)
 	*firstTerm = cb.firstTerm;
@ -118,6 +316,10 @@ bool plaintorich(const string& in, string& out, const list<string>& terms,
    // output text
    vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();

+    for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
+	 it != cb.tboffs.end(); it++) {
+	LOGDEB(("plaintorich: region: %d %d\n", it->first, it->second));
+    }
    // Input character iterator
    Utf8Iter chariter(in);
    // State variable used to limitate the number of consecutive empty lines 
--- a/src/qtgui/plaintorich.h
+++ b/src/qtgui/plaintorich.h
@ -16,10 +16,12 @@
 */
 #ifndef _PLAINTORICH_H_INCLUDED_
 #define _PLAINTORICH_H_INCLUDED_
-/* @(#$Id: plaintorich.h,v 1.7 2006-09-13 14:57:56 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: plaintorich.h,v 1.8 2006-11-17 10:09:07 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

+#include "searchdata.h"
+
 /**
 * Transform plain text into qt rich text for the preview window.
 *
@ -33,7 +35,7 @@
 * @param noHeader    if true don't output header (<qt><title>...)
 */
 extern bool plaintorich(const string &in, string &out,
-			const list<string>& terms,
+			RefCntr<Rcl::SearchData> sdata,
 			string* firstTerm, bool noHeader = false);

 #endif /* _PLAINTORICH_H_INCLUDED_ */
--- a/src/qtgui/preview_w.cpp
+++ b/src/qtgui/preview_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.5 2006-11-09 19:04:28 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -171,7 +171,8 @@ QTextEdit *Preview::getCurrentEditor()
 // current search, trying to advance and possibly wrapping around. If next is
 // false, the search string has been modified, we search for the new string, 
 // starting from the current position
-void Preview::doSearch(const QString &text, bool next, bool reverse)
+void Preview::doSearch(const QString &text, bool next, bool reverse, 
+		       bool wo)
 {
    LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
    QTextEdit *edit = getCurrentEditor();
@ -203,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse)
 	}
    }

-    bool found = edit->find(text, matchCase, false, 
+    bool found = edit->find(text, matchCase, wo, 
 			      !reverse, &mspara, &msindex);
    LOGDEB(("Found at para: %d index %d\n", mspara, msindex));

@ -448,14 +449,14 @@ class LoadThread : public QThread {
 /* A thread to convert to rich text (mark search terms) */
 class ToRichThread : public QThread {
    string &in;
-    list<string> &terms;
+    RefCntr<Rcl::SearchData> m_searchData;
    string& firstTerm;
    QString &out;
    int loglevel;
 public:
-    ToRichThread(string &i, list<string> &trms, 
+    ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
 		 string& ft, QString &o) 
-	: in(i), terms(trms), firstTerm(ft), out(o)
+	: in(i), m_searchData(searchData), firstTerm(ft), out(o)
    {
 	    loglevel = DebugLog::getdbl()->getlevel();
    }
@ -464,7 +465,7 @@ class ToRichThread : public QThread {
 	DebugLog::getdbl()->setloglevel(loglevel);
 	string rich;
 	try {
-	    plaintorich(in, rich, terms, &firstTerm);
+	    plaintorich(in, rich, m_searchData, &firstTerm);
 	} catch (CancelExcept) {
 	}
 	out = QString::fromUtf8(rich.c_str(), rich.length());
@ -546,11 +547,9 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
    QString richTxt;
    bool highlightTerms = fdoc.text.length() < 1000 *1024;
    string firstTerm;
-    list<string> terms;
-    rcldb->getMatchTerms(idoc, terms);
    if (highlightTerms) {
 	progress.setLabelText(tr("Creating preview text"));
-	ToRichThread rthr(fdoc.text, terms, firstTerm, richTxt);
+	ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt);
 	rthr.start();

 	for (;;prog++) {
@ -630,11 +629,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
 	if (!firstTerm.empty()) {
 	    bool wasC = matchCheck->isChecked();
 	    matchCheck->setChecked(false);
-	    doSearch(QString::fromUtf8(terms.begin()->c_str()), true, false);
+	    doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true);
 	    matchCheck->setChecked(wasC);
 	}
    }
    emit(previewExposed(m_searchId, docnum));
    return true;
 }
-
--- a/src/qtgui/preview_w.h
+++ b/src/qtgui/preview_w.h
@ -1,6 +1,6 @@
 #ifndef _PREVIEW_W_H_INCLUDED_
 #define _PREVIEW_W_H_INCLUDED_
-/* @(#$Id: preview_w.h,v 1.3 2006-09-21 12:56:57 dockes Exp $  (C) 2006 J.F.Dockes */
+/* @(#$Id: preview_w.h,v 1.4 2006-11-17 10:09:07 dockes Exp $  (C) 2006 J.F.Dockes */
 /*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -22,6 +22,8 @@
 #include <qwidget.h>
 #include "rcldb.h"
 #include "preview.h"
+#include "refcntr.h"
+#include "searchdata.h"

 // We keep a list of data associated to each tab
 class TabData {
@ -45,7 +47,11 @@ public:
 	
    ~Preview(){}

-    virtual void setSId(int sid) {m_searchId = sid;}
+    // Record the identifier of the search this preview belongs to, and
+    // keep a reference to the associated search data.
+    virtual void setSId(int sid, RefCntr<Rcl::SearchData> sdata) 
+    {
+	m_searchData = sdata;
+	m_searchId = sid;
+    }
    virtual void closeEvent( QCloseEvent *e );
    virtual bool eventFilter( QObject *target, QEvent *event );
    virtual bool makeDocCurrent( const string & fn, const Rcl::Doc & doc );
@ -56,7 +62,8 @@ public:

 public slots:
    virtual void searchTextLine_textChanged( const QString & text );
-    virtual void doSearch( const QString &str, bool next, bool reverse );
+    virtual void doSearch(const QString &str, bool next, bool reverse,
+			   bool wo = false);
    virtual void nextPressed();
    virtual void prevPressed();
    virtual void currentChanged( QWidget * tw );
@ -72,7 +79,7 @@ signals:
    void showPrev(int sid, int docnum);
    void previewExposed(int sid, int docnum);

-protected:
+private:
    int m_searchId; // Identifier of search in main window. This is so that
                  // we make sense when requesting the next document when 
                  // browsing successive search results in a tab.
@ -82,8 +89,7 @@ protected:
    bool canBeep;
    list<TabData> tabData;
    QWidget *currentW;
-
-private:
+    RefCntr<Rcl::SearchData> m_searchData;
    void init();
    virtual void destroy();
    TabData *tabDataForCurrent(); // Return auxiliary data pointer for cur tab
--- a/src/qtgui/rclmain_w.cpp
+++ b/src/qtgui/rclmain_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.7 2006-11-14 13:55:43 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.8 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -472,7 +472,7 @@ void RclMain::startPreview(int docnum)
 				 QMessageBox::NoButton);
 	    return;
 	}
-	curPreview->setSId(m_searchId);
+	curPreview->setSId(m_searchId, resList->getSearchData());
 	curPreview->setCaption(resList->getDescription());
 	connect(curPreview, SIGNAL(previewClosed(QWidget *)), 
 		this, SLOT(previewClosed(QWidget *)));
@ -712,14 +712,17 @@ void RclMain::showDocHistory()
    if (sortspecs.sortwidth > 0) {
 	DocSequenceHistory myseq(rcldb, g_dynconf, 
 				 string(tr("Document history").utf8()));
-	docsource = new DocSeqSorted(myseq, sortspecs,
-				     string(tr("Document history (sorted)").utf8()));
+	docsource = new 
+	    DocSeqSorted(myseq, sortspecs,
+			 string(tr("Document history (sorted)").utf8()));
    } else {
-	docsource = new DocSequenceHistory(rcldb, g_dynconf, 
-					   string(tr("Document history").utf8()));
+	docsource = new 
+	    DocSequenceHistory(rcldb, g_dynconf, 
+			       string(tr("Document history").utf8()));
    }
+    // Construct a bogus SearchData
    RefCntr<Rcl::SearchData> sdata(new Rcl::SearchData(Rcl::SCLT_AND));
-    sdata->m_description = tr("History data").utf8();
+    sdata->setDescription((const char *)tr("History data").utf8());
    m_searchId++;
    resList->setDocSource(docsource, sdata);
 }
--- a/src/qtgui/reslist.cpp
+++ b/src/qtgui/reslist.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: reslist.cpp,v 1.9 2006-11-13 08:58:47 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: reslist.cpp,v 1.10 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif

 #include <time.h>
@ -76,7 +76,7 @@ void ResList::setDocSource(DocSequence *docsource,
 	delete m_docsource;
    m_winfirst = -1;
    m_docsource = docsource;
-    m_queryData = sdt;
+    m_searchData = sdt;
    m_curPvDoc = -1;

    resultPageNext();
@ -264,9 +264,7 @@ void ResList::resultPageNext()
    QStyleSheetItem *item = 
 	new QStyleSheetItem(styleSheet(), "termtag" );
    item->setColor("blue");
-    //    item->setFontWeight(QFont::Bold);
-    list<string> qTerms;
-    m_docsource->getTerms(qTerms);
+    // item->setFontWeight(QFont::Bold);

    // Result paragraph format
    string sformat = string(prefs.reslistformat.utf8());
@ -383,7 +381,7 @@ void ResList::resultPageNext()

 	// Abstract
 	string abst;
-	plaintorich(doc.abstract, abst, qTerms, 0, true);
+	plaintorich(doc.abstract, abst, m_searchData, 0, true);

 	// Links;
 	string linksbuf;
@ -609,7 +607,7 @@ void ResList::menuExpand()

 QString ResList::getDescription()
 {
-    return QString::fromUtf8(m_queryData->m_description.c_str());
+    return QString::fromUtf8(m_searchData->getDescription().c_str());
 }

 /** Show detailed expansion of a query */
@ -619,7 +617,7 @@ void ResList::showQueryDetails()
    // Also limit the total number of lines. 
    const unsigned int ll = 100;
    const unsigned int maxlines = 50;
-    string query = m_queryData->m_description;
+    string query = m_searchData->getDescription();
    string oq;
    unsigned int nlines = 0;
    while (query.length() > 0) {
--- a/src/qtgui/reslist.h
+++ b/src/qtgui/reslist.h
@ -1,6 +1,6 @@
 #ifndef _RESLIST_H_INCLUDED_
 #define _RESLIST_H_INCLUDED_
-/* @(#$Id: reslist.h,v 1.2 2006-11-13 08:58:47 dockes Exp $  (C) 2005 J.F.Dockes */
+/* @(#$Id: reslist.h,v 1.3 2006-11-17 10:09:07 dockes Exp $  (C) 2005 J.F.Dockes */

 #include <list>

@ -35,6 +35,7 @@ class ResList : public QTextBrowser
    virtual QPopupMenu *createPopupMenu(const QPoint& pos);
    virtual QString getDescription(); // Printable actual query performed on db
    virtual int getResCnt(); // Return total result list size
+    virtual RefCntr<Rcl::SearchData> getSearchData() {return m_searchData;}

 public slots:
    virtual void resetSearch() {m_winfirst = -1;clear();}
@ -71,7 +72,7 @@ class ResList : public QTextBrowser

 private:
    std::map<int,int>  m_pageParaToReldocnums;
-    RefCntr<Rcl::SearchData> m_queryData;
+    RefCntr<Rcl::SearchData> m_searchData;
    DocSequence       *m_docsource;
    std::vector<Rcl::Doc> m_curDocs;
    int                m_winfirst;