Improve positioning on term groups by storing/passing an occurrence index

This commit is contained in:
dockes 2006-11-18 12:31:16 +00:00
parent 1e55b88443
commit db8d89f986
4 changed files with 68 additions and 36 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.15 2006-11-17 12:32:40 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.16 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -61,6 +61,7 @@ class myTextSplitCB : public TextSplitCB {
// Out: first query term found in text
string firstTerm;
int firstTermOcc;
// Out: begin and end byte positions of query terms/groups in text
vector<pair<int, int> > tboffs;
@ -190,30 +191,49 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
{
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
vecStringToString(terms).c_str()));
// The position lists we are going to work with. We extract them from the
// (string->plist) map
vector<vector<int>* > plists;
// Check that each of the group terms has a position list
for (vector<string>::const_iterator it = terms.begin(); it != terms.end();
it++) {
map<string, vector<int> >::iterator pl;
if ((pl = m_plists.find(*it)) == m_plists.end()) {
LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
// A reverse plist->term map. This is so that we can find who is who after
// sorting the plists by length.
map<vector<int>*, string> plistToTerm;
// For traces
vector<string> realgroup;
// Find the position list for each term in the group. Not all
// necessarily exist (esp for NEAR where terms have been
// stem-expanded: we don't know which matched)
for (vector<string>::const_iterator it = terms.begin();
it != terms.end(); it++) {
map<string, vector<int> >::iterator pl = m_plists.find(*it);
if (pl == m_plists.end()) {
LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
(*it).c_str()));
return false;
continue;
}
plists.push_back(&(pl->second));
plistToTerm[&(pl->second)] = *it;
realgroup.push_back(*it);
}
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
vecStringToString(realgroup).c_str()));
if (plists.size() < 2)
return false;
// Sort the positions lists so that the shorter is first
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
// Walk the shortest plist and look for matches
int sta = int(10E9), sto = 0;
int pos;
// Occurrences are from 1->N
firstTermOcc = 0;
vector<int>::iterator it = plists[0]->begin();
do {
if (it == plists[0]->end())
return false;
pos = *it++;
firstTermOcc++;
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
SETMINMAX(pos, sta, sto);
@ -221,22 +241,20 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
if (firstTerm.empty() || m_firstTermPos > sta) {
// firstTerm is used to try and position the preview window over
// the match. As it's difficult to divine byte/word positions,
// we use a string search. Try to use the shortest plist for
// this, which hopefully gives a better chance for the group
// to be found (it's hopeless to try and match the whole
// group)
unsigned int minl = (unsigned int)10E9;
for (vector<string>::const_iterator it = terms.begin();
it != terms.end(); it++) {
map<string, vector<int> >::iterator pl = m_plists.find(*it);
if (pl != m_plists.end() && pl->second.size() < minl) {
firstTerm = *it;
minl = pl->second.size();
}
}
// the match. As it's difficult to divine byte/word positions
// in qtextedit, we use a string search. Use the
// shortest plist for this, which hopefully gives a better
// chance for the group to be found (it's hopeless to try and
// match the whole group)
map<vector<int>*, string>::iterator it =
plistToTerm.find(plists.front());
if (it != plistToTerm.end())
firstTerm = it->second;
LOGDEB(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
firstTerm.c_str(), firstTermOcc));
}
// Translate the position window into a byte offset window
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
@ -247,6 +265,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
sta, sto));
}
return true;
}
@ -281,7 +300,9 @@ bool myTextSplitCB::matchGroups()
// editor's find() function to position on it
bool plaintorich(const string& in, string& out,
RefCntr<Rcl::SearchData> sdata,
string *firstTerm, bool noHeader)
string *firstTerm,
int *firstTermOcc,
bool noHeader)
{
Chrono chron;
out.erase();
@ -319,6 +340,8 @@ bool plaintorich(const string& in, string& out,
if (firstTerm)
*firstTerm = cb.firstTerm;
if (firstTermOcc)
*firstTermOcc = cb.firstTermOcc;
// Rich text output
if (noHeader)

View File

@ -16,7 +16,7 @@
*/
#ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_
/* @(#$Id: plaintorich.h,v 1.9 2006-11-17 12:31:50 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: plaintorich.h,v 1.10 2006-11-18 12:31:16 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -33,10 +33,12 @@
* @param out rich text output
* @param terms list of query terms. These are out of Rcl::Db and dumb
* @param firstTerm out: value of the first search term in text.
* @param frsttocc out: occurrence of 1st term to look for
* @param noHeader if true don't output header (<qt><title>...)
*/
extern bool plaintorich(const string &in, string &out,
RefCntr<Rcl::SearchData> sdata,
string* firstTerm, bool noHeader = false);
string* firstTerm, int *frsttocc,
bool noHeader = false);
#endif /* _PLAINTORICH_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.7 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -172,7 +172,7 @@ QTextEdit *Preview::getCurrentEditor()
// false, the search string has been modified, we search for the new string,
// starting from the current position
void Preview::doSearch(const QString &text, bool next, bool reverse,
bool wo)
bool wordOnly)
{
LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
QTextEdit *edit = getCurrentEditor();
@ -204,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse,
}
}
bool found = edit->find(text, matchCase, wo,
bool found = edit->find(text, matchCase, wordOnly,
!reverse, &mspara, &msindex);
LOGDEB(("Found at para: %d index %d\n", mspara, msindex));
@ -451,12 +451,14 @@ class ToRichThread : public QThread {
string &in;
RefCntr<Rcl::SearchData> m_searchData;
string& firstTerm;
int& firstTermOcc;
QString &out;
int loglevel;
public:
ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
string& ft, QString &o)
: in(i), m_searchData(searchData), firstTerm(ft), out(o)
string& ft, int& fto, QString &o)
: in(i), m_searchData(searchData), firstTerm(ft), firstTermOcc(fto),
out(o)
{
loglevel = DebugLog::getdbl()->getlevel();
}
@ -465,7 +467,7 @@ class ToRichThread : public QThread {
DebugLog::getdbl()->setloglevel(loglevel);
string rich;
try {
plaintorich(in, rich, m_searchData, &firstTerm);
plaintorich(in, rich, m_searchData, &firstTerm, &firstTermOcc);
} catch (CancelExcept) {
}
out = QString::fromUtf8(rich.c_str(), rich.length());
@ -547,9 +549,11 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
QString richTxt;
bool highlightTerms = fdoc.text.length() < 1000 *1024;
string firstTerm;
int firstTermOcc;
if (highlightTerms) {
progress.setLabelText(tr("Creating preview text"));
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt);
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, firstTermOcc,
richTxt);
rthr.start();
for (;;prog++) {
@ -629,7 +633,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
if (!firstTerm.empty()) {
bool wasC = matchCheck->isChecked();
matchCheck->setChecked(false);
doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true);
for (int i = 0; i < firstTermOcc; i++) {
doSearch(QString::fromUtf8(firstTerm.c_str()), i,
false, true);
}
matchCheck->setChecked(wasC);
}
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.11 2006-11-17 12:55:59 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.12 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <time.h>
@ -381,7 +381,7 @@ void ResList::resultPageNext()
// Abstract
string abst;
plaintorich(doc.abstract, abst, m_searchData, 0, true);
plaintorich(doc.abstract, abst, m_searchData, 0, 0, true);
// Links;
string linksbuf;