Improve positioning on term groups by storing/passing an occurrence index
This commit is contained in:
parent
1e55b88443
commit
db8d89f986
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.15 2006-11-17 12:32:40 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.16 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -61,6 +61,7 @@ class myTextSplitCB : public TextSplitCB {
|
|||||||
|
|
||||||
// Out: first query term found in text
|
// Out: first query term found in text
|
||||||
string firstTerm;
|
string firstTerm;
|
||||||
|
int firstTermOcc;
|
||||||
|
|
||||||
// Out: begin and end byte positions of query terms/groups in text
|
// Out: begin and end byte positions of query terms/groups in text
|
||||||
vector<pair<int, int> > tboffs;
|
vector<pair<int, int> > tboffs;
|
||||||
@ -190,30 +191,49 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
{
|
{
|
||||||
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
||||||
vecStringToString(terms).c_str()));
|
vecStringToString(terms).c_str()));
|
||||||
|
|
||||||
|
// The position lists we are going to work with. We extract them from the
|
||||||
|
// (string->plist) map
|
||||||
vector<vector<int>* > plists;
|
vector<vector<int>* > plists;
|
||||||
// Check that each of the group terms has a position list
|
// A reverse plist->term map. This is so that we can find who is who after
|
||||||
for (vector<string>::const_iterator it = terms.begin(); it != terms.end();
|
// sorting the plists by length.
|
||||||
it++) {
|
map<vector<int>*, string> plistToTerm;
|
||||||
map<string, vector<int> >::iterator pl;
|
// For traces
|
||||||
if ((pl = m_plists.find(*it)) == m_plists.end()) {
|
vector<string> realgroup;
|
||||||
LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
|
||||||
|
// Find the position list for each term in the group. Not all
|
||||||
|
// necessarily exist (esp for NEAR where terms have been
|
||||||
|
// stem-expanded: we don't know which matched)
|
||||||
|
for (vector<string>::const_iterator it = terms.begin();
|
||||||
|
it != terms.end(); it++) {
|
||||||
|
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
||||||
|
if (pl == m_plists.end()) {
|
||||||
|
LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
||||||
(*it).c_str()));
|
(*it).c_str()));
|
||||||
return false;
|
continue;
|
||||||
}
|
}
|
||||||
plists.push_back(&(pl->second));
|
plists.push_back(&(pl->second));
|
||||||
|
plistToTerm[&(pl->second)] = *it;
|
||||||
|
realgroup.push_back(*it);
|
||||||
}
|
}
|
||||||
|
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
|
||||||
|
vecStringToString(realgroup).c_str()));
|
||||||
|
if (plists.size() < 2)
|
||||||
|
return false;
|
||||||
// Sort the positions lists so that the shorter is first
|
// Sort the positions lists so that the shorter is first
|
||||||
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
||||||
|
|
||||||
// Walk the shortest plist and look for matches
|
// Walk the shortest plist and look for matches
|
||||||
int sta = int(10E9), sto = 0;
|
int sta = int(10E9), sto = 0;
|
||||||
int pos;
|
int pos;
|
||||||
|
// Occurrences are from 1->N
|
||||||
|
firstTermOcc = 0;
|
||||||
vector<int>::iterator it = plists[0]->begin();
|
vector<int>::iterator it = plists[0]->begin();
|
||||||
do {
|
do {
|
||||||
if (it == plists[0]->end())
|
if (it == plists[0]->end())
|
||||||
return false;
|
return false;
|
||||||
pos = *it++;
|
pos = *it++;
|
||||||
|
firstTermOcc++;
|
||||||
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
|
} while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
|
||||||
SETMINMAX(pos, sta, sto);
|
SETMINMAX(pos, sta, sto);
|
||||||
|
|
||||||
@ -221,22 +241,20 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
|
|
||||||
if (firstTerm.empty() || m_firstTermPos > sta) {
|
if (firstTerm.empty() || m_firstTermPos > sta) {
|
||||||
// firstTerm is used to try and position the preview window over
|
// firstTerm is used to try and position the preview window over
|
||||||
// the match. As it's difficult to divine byte/word positions,
|
// the match. As it's difficult to divine byte/word positions
|
||||||
// we use a string search. Try to use the shortest plist for
|
// in qtextedit, we use a string search. Use the
|
||||||
// this, which hopefully gives a better chance for the group
|
// shortest plist for this, which hopefully gives a better
|
||||||
// to be found (it's hopeless to try and match the whole
|
// chance for the group to be found (it's hopeless to try and
|
||||||
// group)
|
// match the whole group)
|
||||||
unsigned int minl = (unsigned int)10E9;
|
map<vector<int>*, string>::iterator it =
|
||||||
for (vector<string>::const_iterator it = terms.begin();
|
plistToTerm.find(plists.front());
|
||||||
it != terms.end(); it++) {
|
if (it != plistToTerm.end())
|
||||||
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
firstTerm = it->second;
|
||||||
if (pl != m_plists.end() && pl->second.size() < minl) {
|
LOGDEB(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
|
||||||
firstTerm = *it;
|
firstTerm.c_str(), firstTermOcc));
|
||||||
minl = pl->second.size();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Translate the position window into a byte offset window
|
||||||
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
||||||
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
||||||
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
||||||
@ -247,6 +265,7 @@ bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
|||||||
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
|
LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
|
||||||
sta, sto));
|
sta, sto));
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,7 +300,9 @@ bool myTextSplitCB::matchGroups()
|
|||||||
// editor's find() function to position on it
|
// editor's find() function to position on it
|
||||||
bool plaintorich(const string& in, string& out,
|
bool plaintorich(const string& in, string& out,
|
||||||
RefCntr<Rcl::SearchData> sdata,
|
RefCntr<Rcl::SearchData> sdata,
|
||||||
string *firstTerm, bool noHeader)
|
string *firstTerm,
|
||||||
|
int *firstTermOcc,
|
||||||
|
bool noHeader)
|
||||||
{
|
{
|
||||||
Chrono chron;
|
Chrono chron;
|
||||||
out.erase();
|
out.erase();
|
||||||
@ -319,6 +340,8 @@ bool plaintorich(const string& in, string& out,
|
|||||||
|
|
||||||
if (firstTerm)
|
if (firstTerm)
|
||||||
*firstTerm = cb.firstTerm;
|
*firstTerm = cb.firstTerm;
|
||||||
|
if (firstTermOcc)
|
||||||
|
*firstTermOcc = cb.firstTermOcc;
|
||||||
|
|
||||||
// Rich text output
|
// Rich text output
|
||||||
if (noHeader)
|
if (noHeader)
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _PLAINTORICH_H_INCLUDED_
|
#ifndef _PLAINTORICH_H_INCLUDED_
|
||||||
#define _PLAINTORICH_H_INCLUDED_
|
#define _PLAINTORICH_H_INCLUDED_
|
||||||
/* @(#$Id: plaintorich.h,v 1.9 2006-11-17 12:31:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: plaintorich.h,v 1.10 2006-11-18 12:31:16 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -33,10 +33,12 @@
|
|||||||
* @param out rich text output
|
* @param out rich text output
|
||||||
* @param terms list of query terms. These are out of Rcl::Db and dumb
|
* @param terms list of query terms. These are out of Rcl::Db and dumb
|
||||||
* @param firstTerm out: value of the first search term in text.
|
* @param firstTerm out: value of the first search term in text.
|
||||||
|
* @param frsttocc out: occurrence of 1st term to look for
|
||||||
* @param noHeader if true don't output header (<qt><title>...)
|
* @param noHeader if true don't output header (<qt><title>...)
|
||||||
*/
|
*/
|
||||||
extern bool plaintorich(const string &in, string &out,
|
extern bool plaintorich(const string &in, string &out,
|
||||||
RefCntr<Rcl::SearchData> sdata,
|
RefCntr<Rcl::SearchData> sdata,
|
||||||
string* firstTerm, bool noHeader = false);
|
string* firstTerm, int *frsttocc,
|
||||||
|
bool noHeader = false);
|
||||||
|
|
||||||
#endif /* _PLAINTORICH_H_INCLUDED_ */
|
#endif /* _PLAINTORICH_H_INCLUDED_ */
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.7 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -172,7 +172,7 @@ QTextEdit *Preview::getCurrentEditor()
|
|||||||
// false, the search string has been modified, we search for the new string,
|
// false, the search string has been modified, we search for the new string,
|
||||||
// starting from the current position
|
// starting from the current position
|
||||||
void Preview::doSearch(const QString &text, bool next, bool reverse,
|
void Preview::doSearch(const QString &text, bool next, bool reverse,
|
||||||
bool wo)
|
bool wordOnly)
|
||||||
{
|
{
|
||||||
LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
|
LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
|
||||||
QTextEdit *edit = getCurrentEditor();
|
QTextEdit *edit = getCurrentEditor();
|
||||||
@ -204,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool found = edit->find(text, matchCase, wo,
|
bool found = edit->find(text, matchCase, wordOnly,
|
||||||
!reverse, &mspara, &msindex);
|
!reverse, &mspara, &msindex);
|
||||||
LOGDEB(("Found at para: %d index %d\n", mspara, msindex));
|
LOGDEB(("Found at para: %d index %d\n", mspara, msindex));
|
||||||
|
|
||||||
@ -451,12 +451,14 @@ class ToRichThread : public QThread {
|
|||||||
string &in;
|
string &in;
|
||||||
RefCntr<Rcl::SearchData> m_searchData;
|
RefCntr<Rcl::SearchData> m_searchData;
|
||||||
string& firstTerm;
|
string& firstTerm;
|
||||||
|
int& firstTermOcc;
|
||||||
QString &out;
|
QString &out;
|
||||||
int loglevel;
|
int loglevel;
|
||||||
public:
|
public:
|
||||||
ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
|
ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
|
||||||
string& ft, QString &o)
|
string& ft, int& fto, QString &o)
|
||||||
: in(i), m_searchData(searchData), firstTerm(ft), out(o)
|
: in(i), m_searchData(searchData), firstTerm(ft), firstTermOcc(fto),
|
||||||
|
out(o)
|
||||||
{
|
{
|
||||||
loglevel = DebugLog::getdbl()->getlevel();
|
loglevel = DebugLog::getdbl()->getlevel();
|
||||||
}
|
}
|
||||||
@ -465,7 +467,7 @@ class ToRichThread : public QThread {
|
|||||||
DebugLog::getdbl()->setloglevel(loglevel);
|
DebugLog::getdbl()->setloglevel(loglevel);
|
||||||
string rich;
|
string rich;
|
||||||
try {
|
try {
|
||||||
plaintorich(in, rich, m_searchData, &firstTerm);
|
plaintorich(in, rich, m_searchData, &firstTerm, &firstTermOcc);
|
||||||
} catch (CancelExcept) {
|
} catch (CancelExcept) {
|
||||||
}
|
}
|
||||||
out = QString::fromUtf8(rich.c_str(), rich.length());
|
out = QString::fromUtf8(rich.c_str(), rich.length());
|
||||||
@ -547,9 +549,11 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
|
|||||||
QString richTxt;
|
QString richTxt;
|
||||||
bool highlightTerms = fdoc.text.length() < 1000 *1024;
|
bool highlightTerms = fdoc.text.length() < 1000 *1024;
|
||||||
string firstTerm;
|
string firstTerm;
|
||||||
|
int firstTermOcc;
|
||||||
if (highlightTerms) {
|
if (highlightTerms) {
|
||||||
progress.setLabelText(tr("Creating preview text"));
|
progress.setLabelText(tr("Creating preview text"));
|
||||||
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt);
|
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, firstTermOcc,
|
||||||
|
richTxt);
|
||||||
rthr.start();
|
rthr.start();
|
||||||
|
|
||||||
for (;;prog++) {
|
for (;;prog++) {
|
||||||
@ -629,7 +633,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
|
|||||||
if (!firstTerm.empty()) {
|
if (!firstTerm.empty()) {
|
||||||
bool wasC = matchCheck->isChecked();
|
bool wasC = matchCheck->isChecked();
|
||||||
matchCheck->setChecked(false);
|
matchCheck->setChecked(false);
|
||||||
doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true);
|
for (int i = 0; i < firstTermOcc; i++) {
|
||||||
|
doSearch(QString::fromUtf8(firstTerm.c_str()), i,
|
||||||
|
false, true);
|
||||||
|
}
|
||||||
matchCheck->setChecked(wasC);
|
matchCheck->setChecked(wasC);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.11 2006-11-17 12:55:59 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.12 2006-11-18 12:31:16 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
@ -381,7 +381,7 @@ void ResList::resultPageNext()
|
|||||||
|
|
||||||
// Abstract
|
// Abstract
|
||||||
string abst;
|
string abst;
|
||||||
plaintorich(doc.abstract, abst, m_searchData, 0, true);
|
plaintorich(doc.abstract, abst, m_searchData, 0, 0, true);
|
||||||
|
|
||||||
// Links;
|
// Links;
|
||||||
string linksbuf;
|
string linksbuf;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user