improve positioning on term groups by storing/passing an occurrence index

This commit is contained in:
dockes 2006-11-18 12:31:16 +00:00
parent 1e55b88443
commit db8d89f986
4 changed files with 68 additions and 36 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.15 2006-11-17 12:32:40 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.16 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -61,6 +61,7 @@ class myTextSplitCB : public TextSplitCB {
// Out: first query term found in text // Out: first query term found in text
string firstTerm; string firstTerm;
int firstTermOcc;
// Out: begin and end byte positions of query terms/groups in text // Out: begin and end byte positions of query terms/groups in text
vector<pair<int, int> > tboffs; vector<pair<int, int> > tboffs;
@ -190,30 +191,49 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
{ {
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window, LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
vecStringToString(terms).c_str())); vecStringToString(terms).c_str()));
// The position lists we are going to work with. We extract them from the
// (string->plist) map
vector<vector<int>* > plists; vector<vector<int>* > plists;
// Check that each of the group terms has a position list // A revert plist->term map. This is so that we can find who is who after
for (vector<string>::const_iterator it = terms.begin(); it != terms.end(); // sorting the plists by length.
it++) { map<vector<int>*, string> plistToTerm;
map<string, vector<int> >::iterator pl; // For traces
if ((pl = m_plists.find(*it)) == m_plists.end()) { vector<string> realgroup;
LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
// Find the position list for each term in the group. Not all
// necessarily exist (esp for NEAR where terms have been
// stem-expanded: we don't know which matched)
for (vector<string>::const_iterator it = terms.begin();
it != terms.end(); it++) {
map<string, vector<int> >::iterator pl = m_plists.find(*it);
if (pl == m_plists.end()) {
LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
(*it).c_str())); (*it).c_str()));
return false; continue;
} }
plists.push_back(&(pl->second)); plists.push_back(&(pl->second));
plistToTerm[&(pl->second)] = *it;
realgroup.push_back(*it);
} }
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
vecStringToString(realgroup).c_str()));
if (plists.size() < 2)
return false;
// Sort the positions lists so that the shorter is first // Sort the positions lists so that the shorter is first
std::sort(plists.begin(), plists.end(), VecIntCmpShorter()); std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
// Walk the shortest plist and look for matches // Walk the shortest plist and look for matches
int sta = int(10E9), sto = 0; int sta = int(10E9), sto = 0;
int pos; int pos;
// Occurrences are from 1->N
firstTermOcc = 0;
vector<int>::iterator it = plists[0]->begin(); vector<int>::iterator it = plists[0]->begin();
do { do {
if (it == plists[0]->end()) if (it == plists[0]->end())
return false; return false;
pos = *it++; pos = *it++;
firstTermOcc++;
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)); } while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
SETMINMAX(pos, sta, sto); SETMINMAX(pos, sta, sto);
@ -221,22 +241,20 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
if (firstTerm.empty() || m_firstTermPos > sta) { if (firstTerm.empty() || m_firstTermPos > sta) {
// firsTerm is used to try an position the preview window over // firsTerm is used to try an position the preview window over
// the match. As it's difficult to divine byte/word positions, // the match. As it's difficult to divine byte/word positions
// we use a string search. Try to use the shortest plist for // in qtextedit, we use a string search. Use the
// this, which hopefully gives a better chance for the group // shortest plist for this, which hopefully gives a better
// to be found (it's hopeless to try and match the whole // chance for the group to be found (it's hopeless to try and
// group) // match the whole group)
unsigned int minl = (unsigned int)10E9; map<vector<int>*, string>::iterator it =
for (vector<string>::const_iterator it = terms.begin(); plistToTerm.find(plists.front());
it != terms.end(); it++) { if (it != plistToTerm.end())
map<string, vector<int> >::iterator pl = m_plists.find(*it); firstTerm = it->second;
if (pl != m_plists.end() && pl->second.size() < minl) { LOGDEB(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
firstTerm = *it; firstTerm.c_str(), firstTermOcc));
minl = pl->second.size();
}
}
} }
// Translate the position window into a byte offset window
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta); map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto); map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
@ -247,6 +265,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
sta, sto)); sta, sto));
} }
return true; return true;
} }
@ -281,7 +300,9 @@ bool myTextSplitCB::matchGroups()
// editor's find() function to position on it // editor's find() function to position on it
bool plaintorich(const string& in, string& out, bool plaintorich(const string& in, string& out,
RefCntr<Rcl::SearchData> sdata, RefCntr<Rcl::SearchData> sdata,
string *firstTerm, bool noHeader) string *firstTerm,
int *firstTermOcc,
bool noHeader)
{ {
Chrono chron; Chrono chron;
out.erase(); out.erase();
@ -319,6 +340,8 @@ bool plaintorich(const string& in, string& out,
if (firstTerm) if (firstTerm)
*firstTerm = cb.firstTerm; *firstTerm = cb.firstTerm;
if (firstTermOcc)
*firstTermOcc = cb.firstTermOcc;
// Rich text output // Rich text output
if (noHeader) if (noHeader)

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _PLAINTORICH_H_INCLUDED_ #ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_ #define _PLAINTORICH_H_INCLUDED_
/* @(#$Id: plaintorich.h,v 1.9 2006-11-17 12:31:50 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: plaintorich.h,v 1.10 2006-11-18 12:31:16 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
@ -33,10 +33,12 @@
* @param out rich text output * @param out rich text output
* @param terms list of query terms. These are out of Rcl::Db and dumb * @param terms list of query terms. These are out of Rcl::Db and dumb
* @param firstTerm out: value of the first search term in text. * @param firstTerm out: value of the first search term in text.
* @param frsttocc out: occurrence of 1st term to look for
* @param noHeader if true don't output header (<qt><title>...) * @param noHeader if true don't output header (<qt><title>...)
*/ */
extern bool plaintorich(const string &in, string &out, extern bool plaintorich(const string &in, string &out,
RefCntr<Rcl::SearchData> sdata, RefCntr<Rcl::SearchData> sdata,
string* firstTerm, bool noHeader = false); string* firstTerm, int *frsttocc,
bool noHeader = false);
#endif /* _PLAINTORICH_H_INCLUDED_ */ #endif /* _PLAINTORICH_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.7 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -172,7 +172,7 @@ QTextEdit *Preview::getCurrentEditor()
// false, the search string has been modified, we search for the new string, // false, the search string has been modified, we search for the new string,
// starting from the current position // starting from the current position
void Preview::doSearch(const QString &text, bool next, bool reverse, void Preview::doSearch(const QString &text, bool next, bool reverse,
bool wo) bool wordOnly)
{ {
LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse))); LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
QTextEdit *edit = getCurrentEditor(); QTextEdit *edit = getCurrentEditor();
@ -204,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse,
} }
} }
bool found = edit->find(text, matchCase, wo, bool found = edit->find(text, matchCase, wordOnly,
!reverse, &mspara, &msindex); !reverse, &mspara, &msindex);
LOGDEB(("Found at para: %d index %d\n", mspara, msindex)); LOGDEB(("Found at para: %d index %d\n", mspara, msindex));
@ -451,12 +451,14 @@ class ToRichThread : public QThread {
string &in; string &in;
RefCntr<Rcl::SearchData> m_searchData; RefCntr<Rcl::SearchData> m_searchData;
string& firstTerm; string& firstTerm;
int& firstTermOcc;
QString &out; QString &out;
int loglevel; int loglevel;
public: public:
ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData, ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
string& ft, QString &o) string& ft, int& fto, QString &o)
: in(i), m_searchData(searchData), firstTerm(ft), out(o) : in(i), m_searchData(searchData), firstTerm(ft), firstTermOcc(fto),
out(o)
{ {
loglevel = DebugLog::getdbl()->getlevel(); loglevel = DebugLog::getdbl()->getlevel();
} }
@ -465,7 +467,7 @@ class ToRichThread : public QThread {
DebugLog::getdbl()->setloglevel(loglevel); DebugLog::getdbl()->setloglevel(loglevel);
string rich; string rich;
try { try {
plaintorich(in, rich, m_searchData, &firstTerm); plaintorich(in, rich, m_searchData, &firstTerm, &firstTermOcc);
} catch (CancelExcept) { } catch (CancelExcept) {
} }
out = QString::fromUtf8(rich.c_str(), rich.length()); out = QString::fromUtf8(rich.c_str(), rich.length());
@ -547,9 +549,11 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
QString richTxt; QString richTxt;
bool highlightTerms = fdoc.text.length() < 1000 *1024; bool highlightTerms = fdoc.text.length() < 1000 *1024;
string firstTerm; string firstTerm;
int firstTermOcc;
if (highlightTerms) { if (highlightTerms) {
progress.setLabelText(tr("Creating preview text")); progress.setLabelText(tr("Creating preview text"));
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt); ToRichThread rthr(fdoc.text, m_searchData, firstTerm, firstTermOcc,
richTxt);
rthr.start(); rthr.start();
for (;;prog++) { for (;;prog++) {
@ -629,7 +633,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
if (!firstTerm.empty()) { if (!firstTerm.empty()) {
bool wasC = matchCheck->isChecked(); bool wasC = matchCheck->isChecked();
matchCheck->setChecked(false); matchCheck->setChecked(false);
doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true); for (int i = 0; i < firstTermOcc; i++) {
doSearch(QString::fromUtf8(firstTerm.c_str()), i,
false, true);
}
matchCheck->setChecked(wasC); matchCheck->setChecked(wasC);
} }
} }

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.11 2006-11-17 12:55:59 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: reslist.cpp,v 1.12 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
#include <time.h> #include <time.h>
@ -381,7 +381,7 @@ void ResList::resultPageNext()
// Abstract // Abstract
string abst; string abst;
plaintorich(doc.abstract, abst, m_searchData, 0, true); plaintorich(doc.abstract, abst, m_searchData, 0, 0, true);
// Links; // Links;
string linksbuf; string linksbuf;