improve positioning on term groups by storing/passing an occurrence index
This commit is contained in:
parent
1e55b88443
commit
db8d89f986
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.15 2006-11-17 12:32:40 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.16 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -61,6 +61,7 @@ class myTextSplitCB : public TextSplitCB {
|
||||
|
||||
// Out: first query term found in text
|
||||
string firstTerm;
|
||||
int firstTermOcc;
|
||||
|
||||
// Out: begin and end byte positions of query terms/groups in text
|
||||
vector<pair<int, int> > tboffs;
|
||||
@ -190,30 +191,49 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
{
|
||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
||||
vecStringToString(terms).c_str()));
|
||||
|
||||
// The position lists we are going to work with. We extract them from the
|
||||
// (string->plist) map
|
||||
vector<vector<int>* > plists;
|
||||
// Check that each of the group terms has a position list
|
||||
for (vector<string>::const_iterator it = terms.begin(); it != terms.end();
|
||||
it++) {
|
||||
map<string, vector<int> >::iterator pl;
|
||||
if ((pl = m_plists.find(*it)) == m_plists.end()) {
|
||||
LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
||||
// A reverse plist->term map. This is so that we can find who is who after
|
||||
// sorting the plists by length.
|
||||
map<vector<int>*, string> plistToTerm;
|
||||
// For traces
|
||||
vector<string> realgroup;
|
||||
|
||||
// Find the position list for each term in the group. Not all
|
||||
// necessarily exist (esp for NEAR where terms have been
|
||||
// stem-expanded: we don't know which matched)
|
||||
for (vector<string>::const_iterator it = terms.begin();
|
||||
it != terms.end(); it++) {
|
||||
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
||||
if (pl == m_plists.end()) {
|
||||
LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
||||
(*it).c_str()));
|
||||
return false;
|
||||
continue;
|
||||
}
|
||||
plists.push_back(&(pl->second));
|
||||
plistToTerm[&(pl->second)] = *it;
|
||||
realgroup.push_back(*it);
|
||||
}
|
||||
|
||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
|
||||
vecStringToString(realgroup).c_str()));
|
||||
if (plists.size() < 2)
|
||||
return false;
|
||||
// Sort the positions lists so that the shorter is first
|
||||
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
||||
|
||||
// Walk the shortest plist and look for matches
|
||||
int sta = int(10E9), sto = 0;
|
||||
int pos;
|
||||
// Occurrences are from 1->N
|
||||
firstTermOcc = 0;
|
||||
vector<int>::iterator it = plists[0]->begin();
|
||||
do {
|
||||
if (it == plists[0]->end())
|
||||
return false;
|
||||
pos = *it++;
|
||||
firstTermOcc++;
|
||||
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
|
||||
SETMINMAX(pos, sta, sto);
|
||||
|
||||
@ -221,22 +241,20 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
|
||||
if (firstTerm.empty() || m_firstTermPos > sta) {
|
||||
// firstTerm is used to try and position the preview window over
|
||||
// the match. As it's difficult to divine byte/word positions,
|
||||
// we use a string search. Try to use the shortest plist for
|
||||
// this, which hopefully gives a better chance for the group
|
||||
// to be found (it's hopeless to try and match the whole
|
||||
// group)
|
||||
unsigned int minl = (unsigned int)10E9;
|
||||
for (vector<string>::const_iterator it = terms.begin();
|
||||
it != terms.end(); it++) {
|
||||
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
||||
if (pl != m_plists.end() && pl->second.size() < minl) {
|
||||
firstTerm = *it;
|
||||
minl = pl->second.size();
|
||||
}
|
||||
}
|
||||
// the match. As it's difficult to divine byte/word positions
|
||||
// in qtextedit, we use a string search. Use the
|
||||
// shortest plist for this, which hopefully gives a better
|
||||
// chance for the group to be found (it's hopeless to try and
|
||||
// match the whole group)
|
||||
map<vector<int>*, string>::iterator it =
|
||||
plistToTerm.find(plists.front());
|
||||
if (it != plistToTerm.end())
|
||||
firstTerm = it->second;
|
||||
LOGDEB(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
|
||||
firstTerm.c_str(), firstTermOcc));
|
||||
}
|
||||
|
||||
// Translate the position window into a byte offset window
|
||||
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
||||
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
||||
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
||||
@ -247,6 +265,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
||||
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
|
||||
sta, sto));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -281,7 +300,9 @@ bool myTextSplitCB::matchGroups()
|
||||
// editor's find() function to position on it
|
||||
bool plaintorich(const string& in, string& out,
|
||||
RefCntr<Rcl::SearchData> sdata,
|
||||
string *firstTerm, bool noHeader)
|
||||
string *firstTerm,
|
||||
int *firstTermOcc,
|
||||
bool noHeader)
|
||||
{
|
||||
Chrono chron;
|
||||
out.erase();
|
||||
@ -319,6 +340,8 @@ bool plaintorich(const string& in, string& out,
|
||||
|
||||
if (firstTerm)
|
||||
*firstTerm = cb.firstTerm;
|
||||
if (firstTermOcc)
|
||||
*firstTermOcc = cb.firstTermOcc;
|
||||
|
||||
// Rich text output
|
||||
if (noHeader)
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _PLAINTORICH_H_INCLUDED_
|
||||
#define _PLAINTORICH_H_INCLUDED_
|
||||
/* @(#$Id: plaintorich.h,v 1.9 2006-11-17 12:31:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: plaintorich.h,v 1.10 2006-11-18 12:31:16 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -33,10 +33,12 @@
|
||||
* @param out rich text output
|
||||
* @param terms list of query terms. These are out of Rcl::Db and dumb
|
||||
* @param firstTerm out: value of the first search term in text.
|
||||
* @param frsttocc out: occurrence of 1st term to look for
|
||||
* @param noHeader if true don't output header (<qt><title>...)
|
||||
*/
|
||||
extern bool plaintorich(const string &in, string &out,
|
||||
RefCntr<Rcl::SearchData> sdata,
|
||||
string* firstTerm, bool noHeader = false);
|
||||
string* firstTerm, int *frsttocc,
|
||||
bool noHeader = false);
|
||||
|
||||
#endif /* _PLAINTORICH_H_INCLUDED_ */
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.7 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -172,7 +172,7 @@ QTextEdit *Preview::getCurrentEditor()
|
||||
// false, the search string has been modified, we search for the new string,
|
||||
// starting from the current position
|
||||
void Preview::doSearch(const QString &text, bool next, bool reverse,
|
||||
bool wo)
|
||||
bool wordOnly)
|
||||
{
|
||||
LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
|
||||
QTextEdit *edit = getCurrentEditor();
|
||||
@ -204,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse,
|
||||
}
|
||||
}
|
||||
|
||||
bool found = edit->find(text, matchCase, wo,
|
||||
bool found = edit->find(text, matchCase, wordOnly,
|
||||
!reverse, &mspara, &msindex);
|
||||
LOGDEB(("Found at para: %d index %d\n", mspara, msindex));
|
||||
|
||||
@ -451,12 +451,14 @@ class ToRichThread : public QThread {
|
||||
string &in;
|
||||
RefCntr<Rcl::SearchData> m_searchData;
|
||||
string& firstTerm;
|
||||
int& firstTermOcc;
|
||||
QString &out;
|
||||
int loglevel;
|
||||
public:
|
||||
ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
|
||||
string& ft, QString &o)
|
||||
: in(i), m_searchData(searchData), firstTerm(ft), out(o)
|
||||
string& ft, int& fto, QString &o)
|
||||
: in(i), m_searchData(searchData), firstTerm(ft), firstTermOcc(fto),
|
||||
out(o)
|
||||
{
|
||||
loglevel = DebugLog::getdbl()->getlevel();
|
||||
}
|
||||
@ -465,7 +467,7 @@ class ToRichThread : public QThread {
|
||||
DebugLog::getdbl()->setloglevel(loglevel);
|
||||
string rich;
|
||||
try {
|
||||
plaintorich(in, rich, m_searchData, &firstTerm);
|
||||
plaintorich(in, rich, m_searchData, &firstTerm, &firstTermOcc);
|
||||
} catch (CancelExcept) {
|
||||
}
|
||||
out = QString::fromUtf8(rich.c_str(), rich.length());
|
||||
@ -547,9 +549,11 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
|
||||
QString richTxt;
|
||||
bool highlightTerms = fdoc.text.length() < 1000 *1024;
|
||||
string firstTerm;
|
||||
int firstTermOcc;
|
||||
if (highlightTerms) {
|
||||
progress.setLabelText(tr("Creating preview text"));
|
||||
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt);
|
||||
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, firstTermOcc,
|
||||
richTxt);
|
||||
rthr.start();
|
||||
|
||||
for (;;prog++) {
|
||||
@ -629,7 +633,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
|
||||
if (!firstTerm.empty()) {
|
||||
bool wasC = matchCheck->isChecked();
|
||||
matchCheck->setChecked(false);
|
||||
doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true);
|
||||
for (int i = 0; i < firstTermOcc; i++) {
|
||||
doSearch(QString::fromUtf8(firstTerm.c_str()), i,
|
||||
false, true);
|
||||
}
|
||||
matchCheck->setChecked(wasC);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.11 2006-11-17 12:55:59 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.12 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <time.h>
|
||||
@ -381,7 +381,7 @@ void ResList::resultPageNext()
|
||||
|
||||
// Abstract
|
||||
string abst;
|
||||
plaintorich(doc.abstract, abst, m_searchData, 0, true);
|
||||
plaintorich(doc.abstract, abst, m_searchData, 0, 0, true);
|
||||
|
||||
// Links;
|
||||
string linksbuf;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user