improve positioning on term groups by storing/passing an occurrence index

2006-11-18 12:31:16 +00:00 · 2006-11-18 12:31:16 +00:00 · db8d89f986
commit db8d89f986
parent 1e55b88443
4 changed files with 68 additions and 36 deletions
--- a/src/qtgui/plaintorich.cpp
+++ b/src/qtgui/plaintorich.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.15 2006-11-17 12:32:40 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.16 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -61,6 +61,7 @@ class myTextSplitCB : public TextSplitCB {

    // Out: first query term found in text
    string firstTerm;
+    int    firstTermOcc;

    // Out: begin and end byte positions of query terms/groups in text
    vector<pair<int, int> > tboffs;  
@ -190,30 +191,49 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
 {
    LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
 	    vecStringToString(terms).c_str()));
+
+    // The position lists we are going to work with. We extract them from the 
+    // (string->plist) map
    vector<vector<int>* > plists;
-    // Check that each of the group terms has a position list
-    for (vector<string>::const_iterator it = terms.begin(); it != terms.end();
-	 it++) {
-	map<string, vector<int> >::iterator pl;
-	if ((pl = m_plists.find(*it)) == m_plists.end()) {
-	    LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
+    // A reverse plist->term map, so that we can find which term each plist
+    // belongs to after sorting the plists by length.
+    map<vector<int>*, string> plistToTerm;
+    // For traces
+    vector<string> realgroup;
+
+    // Find the position list for each term in the group. Not all
+    // necessarily exist (esp for NEAR where terms have been
+    // stem-expanded: we don't know which matched)
+    for (vector<string>::const_iterator it = terms.begin(); 
+	 it != terms.end(); it++) {
+	map<string, vector<int> >::iterator pl = m_plists.find(*it);
+	if (pl == m_plists.end()) {
+	    LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
 		    (*it).c_str()));
-	    return false;
+	    continue;
 	}
 	plists.push_back(&(pl->second));
+	plistToTerm[&(pl->second)] = *it;
+	realgroup.push_back(*it);
    }
-
+    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
+	     vecStringToString(realgroup).c_str()));
+    if (plists.size() < 2)
+	return false;
    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());

    // Walk the shortest plist and look for matches
    int sta = int(10E9), sto = 0;
    int pos;
+    // Occurrences are from 1->N
+    firstTermOcc = 0;
    vector<int>::iterator it = plists[0]->begin();
    do {
 	if (it == plists[0]->end())
 	    return false;
 	pos = *it++;
+	firstTermOcc++;
    } while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
    SETMINMAX(pos, sta, sto);

@ -221,22 +241,20 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)

    if (firstTerm.empty() || m_firstTermPos > sta) {
 	// firsTerm is used to try an position the preview window over
-	// the match. As it's difficult to divine byte/word positions,
-	// we use a string search. Try to use the shortest plist for
-	// this, which hopefully gives a better chance for the group
-	// to be found (it's hopeless to try and match the whole
-	// group)
-	unsigned int minl = (unsigned int)10E9;
-	for (vector<string>::const_iterator it = terms.begin(); 
-	     it != terms.end(); it++) {
-	    map<string, vector<int> >::iterator pl = m_plists.find(*it);
-	    if (pl != m_plists.end() && pl->second.size() < minl) {
-		firstTerm = *it;
-		minl = pl->second.size();
-	    }
-	}
+	// the match. As it's difficult to divine byte/word positions
+	// in qtextedit, we use a string search. Use the
+	// shortest plist for this, which hopefully gives a better
+	// chance for the group to be found (it's hopeless to try and
+	// match the whole group)
+	map<vector<int>*, string>::iterator it = 
+	    plistToTerm.find(plists.front());
+	if (it != plistToTerm.end())
+	    firstTerm = it->second;
+	LOGDEB(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
+		firstTerm.c_str(), firstTermOcc));
    }

+    // Translate the position window into a byte offset window
    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
@ -247,6 +265,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
 	LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", 
 		sta, sto));
    }
+
    return true;
 }

@ -281,7 +300,9 @@ bool myTextSplitCB::matchGroups()
 // editor's find() function to position on it
 bool plaintorich(const string& in, string& out, 
 		 RefCntr<Rcl::SearchData> sdata,
-		 string *firstTerm, bool noHeader)
+		 string *firstTerm, 
+		 int *firstTermOcc,
+		 bool noHeader)
 {
    Chrono chron;
    out.erase();
@ -319,6 +340,8 @@ bool plaintorich(const string& in, string& out,

    if (firstTerm)
 	*firstTerm = cb.firstTerm;
+    if (firstTermOcc)
+	*firstTermOcc = cb.firstTermOcc;

    // Rich text output
    if (noHeader)
--- a/src/qtgui/plaintorich.h
+++ b/src/qtgui/plaintorich.h
@ -16,7 +16,7 @@
 */
 #ifndef _PLAINTORICH_H_INCLUDED_
 #define _PLAINTORICH_H_INCLUDED_
-/* @(#$Id: plaintorich.h,v 1.9 2006-11-17 12:31:50 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: plaintorich.h,v 1.10 2006-11-18 12:31:16 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

@ -33,10 +33,12 @@
 * @param out         rich text output
 * @param terms       list of query terms. These are out of Rcl::Db and dumb
 * @param firstTerm   out: value of the first search term in text.
+ * @param frsttocc    out: which occurrence of firstTerm (counted from 1) to look for
 * @param noHeader    if true don't output header (<qt><title>...)
 */
 extern bool plaintorich(const string &in, string &out,
 			RefCntr<Rcl::SearchData> sdata,
-			string* firstTerm, bool noHeader = false);
+			string* firstTerm, int *frsttocc, 
+			bool noHeader = false);

 #endif /* _PLAINTORICH_H_INCLUDED_ */
--- a/src/qtgui/preview_w.cpp
+++ b/src/qtgui/preview_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.7 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -172,7 +172,7 @@ QTextEdit *Preview::getCurrentEditor()
 // false, the search string has been modified, we search for the new string, 
 // starting from the current position
 void Preview::doSearch(const QString &text, bool next, bool reverse, 
-		       bool wo)
+		       bool wordOnly)
 {
    LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
    QTextEdit *edit = getCurrentEditor();
@ -204,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse,
 	}
    }

-    bool found = edit->find(text, matchCase, wo, 
+    bool found = edit->find(text, matchCase, wordOnly, 
 			      !reverse, &mspara, &msindex);
    LOGDEB(("Found at para: %d index %d\n", mspara, msindex));

@ -451,12 +451,14 @@ class ToRichThread : public QThread {
    string &in;
    RefCntr<Rcl::SearchData> m_searchData;
    string& firstTerm;
+    int& firstTermOcc;
    QString &out;
    int loglevel;
 public:
    ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
-		 string& ft, QString &o) 
-	: in(i), m_searchData(searchData), firstTerm(ft), out(o)
+		 string& ft, int& fto, QString &o) 
+	: in(i), m_searchData(searchData), firstTerm(ft), firstTermOcc(fto),
+	  out(o)
    {
 	    loglevel = DebugLog::getdbl()->getlevel();
    }
@ -465,7 +467,7 @@ class ToRichThread : public QThread {
 	DebugLog::getdbl()->setloglevel(loglevel);
 	string rich;
 	try {
-	    plaintorich(in, rich, m_searchData, &firstTerm);
+	    plaintorich(in, rich, m_searchData, &firstTerm, &firstTermOcc);
 	} catch (CancelExcept) {
 	}
 	out = QString::fromUtf8(rich.c_str(), rich.length());
@ -547,9 +549,11 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
    QString richTxt;
    bool highlightTerms = fdoc.text.length() < 1000 *1024;
    string firstTerm;
+    int firstTermOcc;
    if (highlightTerms) {
 	progress.setLabelText(tr("Creating preview text"));
-	ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt);
+	ToRichThread rthr(fdoc.text, m_searchData, firstTerm, firstTermOcc,
+			  richTxt);
 	rthr.start();

 	for (;;prog++) {
@ -629,7 +633,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
 	if (!firstTerm.empty()) {
 	    bool wasC = matchCheck->isChecked();
 	    matchCheck->setChecked(false);
-	    doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true);
+	    for (int i = 0; i < firstTermOcc; i++) {
+		doSearch(QString::fromUtf8(firstTerm.c_str()), i, 
+			 false, true);
+	    }
 	    matchCheck->setChecked(wasC);
 	}
    }
--- a/src/qtgui/reslist.cpp
+++ b/src/qtgui/reslist.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: reslist.cpp,v 1.11 2006-11-17 12:55:59 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: reslist.cpp,v 1.12 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif

 #include <time.h>
@ -381,7 +381,7 @@ void ResList::resultPageNext()

 	// Abstract
 	string abst;
-	plaintorich(doc.abstract, abst, m_searchData, 0, true);
+	plaintorich(doc.abstract, abst, m_searchData, 0, 0, true);

 	// Links;
 	string linksbuf;