Remember searchData and use it in plaintorich for phrase/group highlighting

This commit is contained in:
dockes 2006-11-17 10:09:07 +00:00
parent a8e0fe31bd
commit a963035b93
7 changed files with 264 additions and 54 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.13 2006-11-13 08:15:57 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.14 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -24,6 +24,9 @@ static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.13 2006-11-13 08:15:57 dockes
#include <list>
#include <set>
#include <vector>
#include <map>
#include <algorithm>
#ifndef NO_NAMESPACES
using std::vector;
using std::list;
@ -41,42 +44,218 @@ using std::set;
#include "plaintorich.h"
#include "cancelcheck.h"
// Format a list of strings for debug messages: every element is wrapped
// in square brackets and followed by a single space, e.g. "[a] [b] ".
// An empty input yields an empty string.
static string vecStringToString(const vector<string>& t)
{
    string out;
    vector<string>::const_iterator cur = t.begin();
    const vector<string>::const_iterator last = t.end();
    while (cur != last) {
        out.append("[");
        out.append(*cur);
        out.append("] ");
        ++cur;
    }
    return out;
}
// Text splitter callback used to take note of the position of query terms
// inside the result text. This is then used to post highlight tags.
// inside the result text. This is then used to insert highlight tags.
class myTextSplitCB : public TextSplitCB {
public:
// in: user query terms
// In: user query terms
set<string> terms;
// Out: begin and end byte positions of query terms in text
vector<pair<int, int> > tboffs;
// In: term groups (phrase/near) and, for each group, its slack.
// matchGroups() uses slack + group size as the match window. Both are
// held by reference: the vectors passed to the constructor must outlive
// this object. gterms is the set of all terms found in any group.
const vector<vector<string> >& m_groups;
const vector<int>& m_slacks;
set<string> gterms;
// Out: first term found in text
string firstTerm;
// Word position recorded together with firstTerm by takeword().
// NOTE(review): not initialised by the constructor, only meaningful once
// firstTerm is non-empty.
int firstTermPos;
myTextSplitCB(const list<string>& its) {
for (list<string>::const_iterator it = its.begin(); it != its.end();
it++) {
string s;
Rcl::dumb_string(*it, s);
terms.insert(s);
// Out: begin and end byte positions of query terms/groups in text
vector<pair<int, int> > tboffs;
// group/near terms word positions.
map<string, vector<int> > m_plists;
// Word position -> (begin, end) byte offsets, recorded for group terms
// only. Used by matchGroup() to turn a word span into a byte span.
map<int, pair<int, int> > m_gpostobytes;
// Copy the single terms into 'terms', bind the group and slack vectors,
// and collect every group term into 'gterms'.
myTextSplitCB(const vector<string>& its, vector<vector<string> >&groups,
vector<int>& slacks) : m_groups(groups), m_slacks(slacks)
{
for (vector<string>::const_iterator it = its.begin();
it != its.end(); it++) {
terms.insert(*it);
}
for (vector<vector<string> >::const_iterator vit = m_groups.begin();
vit != m_groups.end(); vit++) {
for (vector<string>::const_iterator it = (*vit).begin();
it != (*vit).end(); it++) {
gterms.insert(*it);
}
}
}
// Callback called by the text-to-words breaker for each word
// term: the word as found in the text, pos: its word position,
// bts/bte: its begin and end byte offsets. Always returns true.
virtual bool takeword(const std::string& term, int, int bts, int bte) {
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
string dumb;
// The word is passed through Rcl::dumb_string() before being compared
// to the query terms.
Rcl::dumb_string(term, dumb);
//LOGDEB(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
// pos, bts, bte));
// Single search term highlighting: if this word is a search term,
// Note its byte-offset span.
if (terms.find(dumb) != terms.end()) {
tboffs.push_back(pair<int, int>(bts, bte));
if (firstTerm.empty())
if (firstTerm.empty()) {
firstTerm = term;
firstTermPos = pos;
}
}
if (gterms.find(dumb) != gterms.end()) {
// Term group (phrase/near) handling
// Remember where the word is, both as a word position (for the
// proximity test) and as a byte span (for highlighting).
m_plists[dumb].push_back(pos);
m_gpostobytes[pos] = pair<int,int>(bts, bte);
LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
}
CancelCheck::instance().checkCancel();
return true;
}
// Look for a match of one term group inside a window of 'dist' word
// positions, using the positions recorded by takeword().
virtual bool matchGroup(const vector<string>& terms, int dist);
// Run matchGroup() on all groups, then sort tboffs by start offset.
virtual bool matchGroups();
};
// Code for checking for a NEAR match comes out of xapian phrasepostlist.cc
/** Ordering predicate for position lists: the shorter list sorts first.
 *
 * Meant to be used with std::sort() on a vector of pointers to position
 * lists, so that the list with the fewest entries ends up first.
 */
class VecIntCmpShorter {
public:
    /** Return true if and only if a is strictly shorter than b.
     *
     * The call operator is const so that the predicate can also be used
     * through a const object or reference.
     */
    bool operator()(const vector<int> *a, const vector<int> *b) const {
        return a->size() < b->size();
    }
};
// Recursive proximity test: try to pick one position from each of the
// lists plists[i..] so that, together with the positions already chosen
// (which span [min, max]), everything fits in 'window' word positions.
//
// @param window maximum width of the match, in word positions
// @param plists position lists, one per term. Each list is expected to
//               hold increasing positions: the scan of a list stops at
//               the first position which lies beyond the window.
// @param i      index of the list to examine at this level
// @param min    smallest position chosen so far
// @param max    biggest position chosen so far
// @param sp     out: smallest position of the complete match
// @param ep     out: biggest position of the complete match
// @return true if a match was found (*sp and *ep are then set), else
//         false (*sp and *ep are left untouched).
bool do_test(int window, vector<vector<int>* >& plists,
             unsigned int i, int min, int max, int *sp, int *ep)
{
    // Lowest position which can still share a window with max.
    // Take care to avoid underflow.
    int lowest = max + 1;
    if (window <= lowest)
        lowest -= window;
    else
        lowest = 0;

    // Find 1st position not before the window start
    vector<int>::iterator it = plists[i]->begin();
    while (it != plists[i]->end() && *it < lowest)
        ++it;

    // Try each position inside window in turn for match with other lists
    for (; it != plists[i]->end(); ++it) {
        const int pos = *it;
        if (pos > min + window - 1)
            return false;

        // Span including this candidate. It is computed afresh for each
        // candidate, so that a rejected one does not leave stale bounds
        // behind and narrow the window for the following ones.
        const int lo = pos < min ? pos : min;
        const int hi = pos > max ? pos : max;

        if (i + 1 == plists.size()) {
            // Last list: the match is complete. The reported span must
            // include the position just matched.
            *sp = lo;
            *ep = hi;
            return true;
        }
        if (do_test(window, plists, i + 1, lo, hi, sp, ep))
            return true;
    }
    return false;
}
// Check if there is a NEAR match for the group of terms.
//
// Looks for one position per group term such that all the positions fit
// in a span of 'window' word positions. Only the first match found while
// walking the shortest position list is recorded.
//
// @param terms  the terms of the group. Each of them must have an entry
//               in m_plists for a match to be possible.
// @param window maximum width of the match, in word positions.
// @return true if a match was found, else false. On a match, the byte
//         span going from the first to the last matched word is appended
//         to tboffs if both byte positions are known, and
//         firstTerm/firstTermPos are updated if no first term was set yet
//         or if the match begins before it.
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
{
    LOGDEB(("myTextSplitCB::matchGroup:d %d: %s\n", window,
            vecStringToString(terms).c_str()));

    // An empty group can't match, and plists[0] must exist further down
    if (terms.empty())
        return false;

    vector<vector<int>* > plists;
    // Check that each of the group terms has a position list
    for (vector<string>::const_iterator tit = terms.begin();
         tit != terms.end(); ++tit) {
        map<string, vector<int> >::iterator pl = m_plists.find(*tit);
        if (pl == m_plists.end()) {
            LOGDEB(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
                    (*tit).c_str()));
            return false;
        }
        plists.push_back(&(pl->second));
    }

    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());

    // Walk the shortest plist and look for matches
    int sta = 0, sto = 0;
    bool found = false;
    for (vector<int>::iterator pit = plists[0]->begin();
         pit != plists[0]->end() && !found; ++pit) {
        if (plists.size() == 1) {
            // Single term group: there is no other list to test against
            // (do_test() would index past the end of plists).
            sta = sto = *pit;
            found = true;
        } else {
            found = do_test(window, plists, 1, *pit, *pit, &sta, &sto);
        }
    }
    if (!found)
        return false;

    LOGDEB(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto));

    if (firstTerm.empty() || firstTermPos > sta) {
        // firstTerm is used to try and position the preview window over
        // the match. As it's difficult to divine byte/word positions,
        // we use a string search. Try to use the shortest plist for
        // this, which hopefully gives a better chance for the group
        // to be found (it's hopeless to try and match the whole
        // group)
        vector<int>::size_type minl = vector<int>::size_type(-1);
        for (vector<string>::const_iterator tit = terms.begin();
             tit != terms.end(); ++tit) {
            map<string, vector<int> >::iterator pl = m_plists.find(*tit);
            if (pl != m_plists.end() && pl->second.size() < minl) {
                firstTerm = *tit;
                LOGDEB(("Firstterm->%s\n", firstTerm.c_str()));
                minl = pl->second.size();
            }
        }
        // Keep the position in step with firstTerm: the next group
        // compares its own start with it, and it would otherwise be
        // stale, or not set at all if no single term was found.
        firstTermPos = sta;
    }

    // Translate the word positions of the match into a byte span
    map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
    map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
        LOGDEB(("myTextSplitCB::matchGroup: pushing %d %d\n",
                i1->second.first, i2->second.second));
        tboffs.push_back(pair<int, int>(i1->second.first, i2->second.second));
    } else {
        LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n",
                sta, sto));
    }
    return true;
}
/** Ordering predicate for pairs of ints: only the first members count.
 *
 * Used to sort the (begin, end) byte regions by their begin offset.
 */
class PairIntCmpFirst {
public:
    /** Return true if and only if a.first is strictly less than b.first.
     * The second members are not looked at.
     */
    bool operator()(const pair<int, int>& a, const pair<int, int>& b) const {
        return a.first < b.first;
    }
};
bool myTextSplitCB::matchGroups()
{
vector<vector<string> >::const_iterator vit = m_groups.begin();
vector<int>::const_iterator sit = m_slacks.begin();
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
matchGroup(*vit, *sit + (*vit).size());
}
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
return true;
}
// Fix result text for display inside the gui text window.
//
// To compute the term character positions in the output text, we used
@ -86,22 +265,41 @@ class myTextSplitCB : public TextSplitCB {
// don't know the term par/car positions in the editor text. Instead,
// we return the first term encountered, and the caller will use the
// editor's find() function to position on it
bool plaintorich(const string& in, string& out, const list<string>& terms,
bool plaintorich(const string& in, string& out,
RefCntr<Rcl::SearchData> sdata,
string *firstTerm, bool noHeader)
{
Chrono chron;
LOGDEB(("plaintorich: terms: %s\n",
stringlistdisp(terms).c_str()));
out.erase();
vector<string> terms;
vector<vector<string> > groups;
vector<int> slacks;
sdata->getTerms(terms, groups, slacks);
{
LOGDEB(("plaintorich: terms: \n"));
string sterms = vecStringToString(terms);
LOGDEB((" %s\n", sterms.c_str()));
sterms = "\n";
LOGDEB(("plaintorich: groups: \n"));
for (vector<vector<string> >::iterator vit = groups.begin();
vit != groups.end(); vit++) {
sterms += vecStringToString(*vit);
sterms += "\n";
}
LOGDEB((" %s", sterms.c_str()));
}
// We first use the text splitter to break the text into words,
// and compare the words to the search terms, which yields the
// query terms positions inside the text
myTextSplitCB cb(terms);
myTextSplitCB cb(terms, groups, slacks);
TextSplit splitter(&cb, TextSplit::TXTS_ONLYSPANS);
// Note that splitter returns the term locations in byte, not
// character offset
splitter.text_to_words(in);
cb.matchGroups();
if (firstTerm)
*firstTerm = cb.firstTerm;
@ -118,6 +316,10 @@ bool plaintorich(const string& in, string& out, const list<string>& terms,
// output text
vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
it != cb.tboffs.end(); it++) {
LOGDEB(("plaintorich: region: %d %d\n", it->first, it->second));
}
// Input character iterator
Utf8Iter chariter(in);
// State variable used to limit the number of consecutive empty lines

View File

@ -16,10 +16,12 @@
*/
#ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_
/* @(#$Id: plaintorich.h,v 1.7 2006-09-13 14:57:56 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: plaintorich.h,v 1.8 2006-11-17 10:09:07 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include "searchdata.h"
/**
* Transform plain text into qt rich text for the preview window.
*
@ -33,7 +35,7 @@
* @param noHeader if true don't output header (<qt><title>...)
*/
extern bool plaintorich(const string &in, string &out,
const list<string>& terms,
RefCntr<Rcl::SearchData> sdata,
string* firstTerm, bool noHeader = false);
#endif /* _PLAINTORICH_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.5 2006-11-09 19:04:28 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.6 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -171,7 +171,8 @@ QTextEdit *Preview::getCurrentEditor()
// current search, trying to advance and possibly wrapping around. If next is
// false, the search string has been modified, we search for the new string,
// starting from the current position
void Preview::doSearch(const QString &text, bool next, bool reverse)
void Preview::doSearch(const QString &text, bool next, bool reverse,
bool wo)
{
LOGDEB1(("Preview::doSearch: next %d rev %d\n", int(next), int(reverse)));
QTextEdit *edit = getCurrentEditor();
@ -203,7 +204,7 @@ void Preview::doSearch(const QString &text, bool next, bool reverse)
}
}
bool found = edit->find(text, matchCase, false,
bool found = edit->find(text, matchCase, wo,
!reverse, &mspara, &msindex);
LOGDEB(("Found at para: %d index %d\n", mspara, msindex));
@ -448,14 +449,14 @@ class LoadThread : public QThread {
/* A thread to convert to rich text (mark search terms) */
class ToRichThread : public QThread {
string &in;
list<string> &terms;
RefCntr<Rcl::SearchData> m_searchData;
string& firstTerm;
QString &out;
int loglevel;
public:
ToRichThread(string &i, list<string> &trms,
ToRichThread(string &i, RefCntr<Rcl::SearchData> searchData,
string& ft, QString &o)
: in(i), terms(trms), firstTerm(ft), out(o)
: in(i), m_searchData(searchData), firstTerm(ft), out(o)
{
loglevel = DebugLog::getdbl()->getlevel();
}
@ -464,7 +465,7 @@ class ToRichThread : public QThread {
DebugLog::getdbl()->setloglevel(loglevel);
string rich;
try {
plaintorich(in, rich, terms, &firstTerm);
plaintorich(in, rich, m_searchData, &firstTerm);
} catch (CancelExcept) {
}
out = QString::fromUtf8(rich.c_str(), rich.length());
@ -546,11 +547,9 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
QString richTxt;
bool highlightTerms = fdoc.text.length() < 1000 *1024;
string firstTerm;
list<string> terms;
rcldb->getMatchTerms(idoc, terms);
if (highlightTerms) {
progress.setLabelText(tr("Creating preview text"));
ToRichThread rthr(fdoc.text, terms, firstTerm, richTxt);
ToRichThread rthr(fdoc.text, m_searchData, firstTerm, richTxt);
rthr.start();
for (;;prog++) {
@ -630,11 +629,10 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
if (!firstTerm.empty()) {
bool wasC = matchCheck->isChecked();
matchCheck->setChecked(false);
doSearch(QString::fromUtf8(terms.begin()->c_str()), true, false);
doSearch(QString::fromUtf8(firstTerm.c_str()), true, false, true);
matchCheck->setChecked(wasC);
}
}
emit(previewExposed(m_searchId, docnum));
return true;
}

View File

@ -1,6 +1,6 @@
#ifndef _PREVIEW_W_H_INCLUDED_
#define _PREVIEW_W_H_INCLUDED_
/* @(#$Id: preview_w.h,v 1.3 2006-09-21 12:56:57 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: preview_w.h,v 1.4 2006-11-17 10:09:07 dockes Exp $ (C) 2006 J.F.Dockes */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -22,6 +22,8 @@
#include <qwidget.h>
#include "rcldb.h"
#include "preview.h"
#include "refcntr.h"
#include "searchdata.h"
// We keep a list of data associated to each tab
class TabData {
@ -45,7 +47,11 @@ public:
~Preview(){}
virtual void setSId(int sid) {m_searchId = sid;}
// Remember the identifier of the search which this preview window
// belongs to, along with the corresponding search data.
virtual void setSId(int sid, RefCntr<Rcl::SearchData> sdata)
{
    m_searchData = sdata;
    m_searchId = sid;
}
virtual void closeEvent( QCloseEvent *e );
virtual bool eventFilter( QObject *target, QEvent *event );
virtual bool makeDocCurrent( const string & fn, const Rcl::Doc & doc );
@ -56,7 +62,8 @@ public:
public slots:
virtual void searchTextLine_textChanged( const QString & text );
virtual void doSearch( const QString &str, bool next, bool reverse );
virtual void doSearch(const QString &str, bool next, bool reverse,
bool wo = false);
virtual void nextPressed();
virtual void prevPressed();
virtual void currentChanged( QWidget * tw );
@ -72,7 +79,7 @@ signals:
void showPrev(int sid, int docnum);
void previewExposed(int sid, int docnum);
protected:
private:
int m_searchId; // Identifier of search in main window. This is so that
// we make sense when requesting the next document when
// browsing successive search results in a tab.
@ -82,8 +89,7 @@ protected:
bool canBeep;
list<TabData> tabData;
QWidget *currentW;
private:
RefCntr<Rcl::SearchData> m_searchData;
void init();
virtual void destroy();
TabData *tabDataForCurrent(); // Return auxiliary data pointer for cur tab

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.7 2006-11-14 13:55:43 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclmain_w.cpp,v 1.8 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -472,7 +472,7 @@ void RclMain::startPreview(int docnum)
QMessageBox::NoButton);
return;
}
curPreview->setSId(m_searchId);
curPreview->setSId(m_searchId, resList->getSearchData());
curPreview->setCaption(resList->getDescription());
connect(curPreview, SIGNAL(previewClosed(QWidget *)),
this, SLOT(previewClosed(QWidget *)));
@ -712,14 +712,17 @@ void RclMain::showDocHistory()
if (sortspecs.sortwidth > 0) {
DocSequenceHistory myseq(rcldb, g_dynconf,
string(tr("Document history").utf8()));
docsource = new DocSeqSorted(myseq, sortspecs,
string(tr("Document history (sorted)").utf8()));
docsource = new
DocSeqSorted(myseq, sortspecs,
string(tr("Document history (sorted)").utf8()));
} else {
docsource = new DocSequenceHistory(rcldb, g_dynconf,
string(tr("Document history").utf8()));
docsource = new
DocSequenceHistory(rcldb, g_dynconf,
string(tr("Document history").utf8()));
}
// Construct a bogus SearchData
RefCntr<Rcl::SearchData> sdata(new Rcl::SearchData(Rcl::SCLT_AND));
sdata->m_description = tr("History data").utf8();
sdata->setDescription((const char *)tr("History data").utf8());
m_searchId++;
resList->setDocSource(docsource, sdata);
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.9 2006-11-13 08:58:47 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: reslist.cpp,v 1.10 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <time.h>
@ -76,7 +76,7 @@ void ResList::setDocSource(DocSequence *docsource,
delete m_docsource;
m_winfirst = -1;
m_docsource = docsource;
m_queryData = sdt;
m_searchData = sdt;
m_curPvDoc = -1;
resultPageNext();
@ -264,9 +264,7 @@ void ResList::resultPageNext()
QStyleSheetItem *item =
new QStyleSheetItem(styleSheet(), "termtag" );
item->setColor("blue");
// item->setFontWeight(QFont::Bold);
list<string> qTerms;
m_docsource->getTerms(qTerms);
// item->setFontWeight(QFont::Bold);
// Result paragraph format
string sformat = string(prefs.reslistformat.utf8());
@ -383,7 +381,7 @@ void ResList::resultPageNext()
// Abstract
string abst;
plaintorich(doc.abstract, abst, qTerms, 0, true);
plaintorich(doc.abstract, abst, m_searchData, 0, true);
// Links;
string linksbuf;
@ -609,7 +607,7 @@ void ResList::menuExpand()
QString ResList::getDescription()
{
return QString::fromUtf8(m_queryData->m_description.c_str());
return QString::fromUtf8(m_searchData->getDescription().c_str());
}
/** Show detailed expansion of a query */
@ -619,7 +617,7 @@ void ResList::showQueryDetails()
// Also limit the total number of lines.
const unsigned int ll = 100;
const unsigned int maxlines = 50;
string query = m_queryData->m_description;
string query = m_searchData->getDescription();
string oq;
unsigned int nlines = 0;
while (query.length() > 0) {

View File

@ -1,6 +1,6 @@
#ifndef _RESLIST_H_INCLUDED_
#define _RESLIST_H_INCLUDED_
/* @(#$Id: reslist.h,v 1.2 2006-11-13 08:58:47 dockes Exp $ (C) 2005 J.F.Dockes */
/* @(#$Id: reslist.h,v 1.3 2006-11-17 10:09:07 dockes Exp $ (C) 2005 J.F.Dockes */
#include <list>
@ -35,6 +35,7 @@ class ResList : public QTextBrowser
virtual QPopupMenu *createPopupMenu(const QPoint& pos);
virtual QString getDescription(); // Printable actual query performed on db
virtual int getResCnt(); // Return total result list size
virtual RefCntr<Rcl::SearchData> getSearchData() {return m_searchData;}
public slots:
virtual void resetSearch() {m_winfirst = -1;clear();}
@ -71,7 +72,7 @@ class ResList : public QTextBrowser
private:
std::map<int,int> m_pageParaToReldocnums;
RefCntr<Rcl::SearchData> m_queryData;
RefCntr<Rcl::SearchData> m_searchData;
DocSequence *m_docsource;
std::vector<Rcl::Doc> m_curDocs;
int m_winfirst;