simple term highlighting in query preview

This commit is contained in:
dockes 2005-02-07 13:17:47 +00:00
parent 74434a3b02
commit 2a020407da
5 changed files with 160 additions and 73 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TEXTSPLIT
@ -7,6 +7,7 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Ex
#include <string>
#include "textsplit.h"
#include "debuglog.h"
using namespace std;
@ -57,9 +58,12 @@ static void setcharclasses()
init = 1;
}
bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
bool TextSplit::emitterm(string &w, int pos, bool doerase,
int btstart, int btend)
{
if (!termsink)
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
if (!cb)
return false;
// Maybe trim end of word. These are chars that we would keep inside
@ -77,7 +81,7 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
}
breakloop:
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
bool ret = termsink(cdata, w, pos);
bool ret = cb->takeword(w, pos, btstart, btend);
if (doerase)
w.erase();
return ret;
@ -92,14 +96,16 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
*/
bool TextSplit::text_to_words(const string &in)
{
LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb));
setcharclasses();
string span;
string word;
bool number = false;
int wordpos = 0;
int spanpos = 0;
unsigned int i;
for (unsigned int i = 0; i < in.length(); i++) {
for (i = 0; i < in.length(); i++) {
int c = in[i];
int cc = charclasses[c];
switch (cc) {
@ -107,10 +113,10 @@ bool TextSplit::text_to_words(const string &in)
SPACE:
if (word.length()) {
if (span.length() != word.length()) {
if (!emitterm(span, spanpos))
if (!emitterm(span, spanpos, true, i-span.length(), i))
return false;
}
if (!emitterm(word, wordpos++))
if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false;
number = false;
}
@ -127,10 +133,10 @@ bool TextSplit::text_to_words(const string &in)
}
} else {
if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false))
if (!emitterm(span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(word, wordpos++))
if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false;
number = false;
span += c;
@ -140,10 +146,10 @@ bool TextSplit::text_to_words(const string &in)
case '@':
if (word.length()) {
if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false))
if (!emitterm(span, spanpos, false, i-span.length(), i))
return false;
}
if (!emitterm(word, wordpos++))
if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false;
number = false;
} else
@ -155,7 +161,7 @@ bool TextSplit::text_to_words(const string &in)
word += c;
} else {
if (word.length()) {
if (!emitterm(word, wordpos++))
if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false;
number = false;
} else
@ -202,9 +208,9 @@ bool TextSplit::text_to_words(const string &in)
}
if (word.length()) {
if (span.length() != word.length())
if (!emitterm(span, spanpos))
if (!emitterm(span, spanpos, true, i-span.length(), i))
return false;
return emitterm(word, wordpos);
return emitterm(word, wordpos, true, i-word.length(), i);
}
return true;
}
@ -222,12 +228,14 @@ bool TextSplit::text_to_words(const string &in)
using namespace std;
bool termsink(void *, const string &term, int pos)
{
cout << pos << " " << term << endl;
return true;
}
// Test driver callback: prints every term handed over by the splitter,
// along with its position and its start/end byte offsets.
class mySplitterCB : public TextSplitCB {
 public:
    bool takeword(const std::string &term, int pos, int bs, int be)
    {
        std::cout << pos << " " << term
                  << " bs " << bs << " be " << be << std::endl;
        // Never ask the splitter to stop
        return true;
    }
};
static string teststring =
"jfd@okyz.com "
@ -241,7 +249,8 @@ static string teststring =
int main(int argc, char **argv)
{
TextSplit splitter(termsink, 0);
mySplitterCB cb;
TextSplit splitter(&cb);
if (argc == 2) {
string data;
if (!file_to_string(argv[1], data))

View File

@ -1,9 +1,20 @@
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.3 2005-01-24 13:17:58 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
// Function class which is called for every detected word.
// Users of TextSplit derive from this class and pass a pointer to an
// instance to the TextSplit constructor; the splitter then calls
// takeword() once for each term it emits.
class TextSplitCB {
public:
virtual ~TextSplitCB() {}
// Receive one term. The splitter propagates the return value: returning
// false makes text_to_words() stop and return false.
virtual bool takeword(const std::string& term,
int pos, // term position
int bts, // byte offset of first char in term
int bte // byte offset of first char after term
) = 0;
};
/**
* Split text into words.
* See comments at top of .cpp for more explanations.
@ -11,19 +22,14 @@
* but it's much simpler this way...
*/
class TextSplit {
public:
typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
private:
TermSink termsink;
void *cdata;
TextSplitCB *cb;
int maxWordLength;
bool emitterm(std::string &term, int pos, bool doerase);
bool emitterm(std::string &term, int pos, bool doerase, int, int);
public:
/**
* Constructor: just store callback and client data
*/
TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40)
{}
TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
/**
* Split text, emit words and positions.
*/

View File

@ -15,9 +15,13 @@
#include <unistd.h>
#include <fcntl.h>
#include <utility>
using std::pair;
#include <qmessagebox.h>
#include <qcstring.h>
#include "rcldb.h"
#include "rclconfig.h"
#include "debuglog.h"
@ -25,10 +29,12 @@
#include "pathut.h"
#include "recoll.h"
#include "internfile.h"
#include "textsplit.h"
#include "smallut.h"
void RecollMain::fileExit()
{
LOGDEB(("RecollMain: fileExit\n"));
LOGDEB1(("RecollMain: fileExit\n"));
exit(0);
}
@ -52,17 +58,66 @@ void RecollMain::fileStart_IndexingAction_activated()
startindexing = 1;
}
static string plaintorich(const string &in)
// Text splitter callback used to take note of the query terms byte offsets
// inside the text. This is then used to post highlight tags.
class myTextSplitCB : public TextSplitCB {
public:
list<pair<int, int> > tboffs;
const list<string> *terms;
myTextSplitCB(const list<string>& terms) : terms(&terms) {}
virtual bool takeword(const std::string& term, int, int bts, int bte) {
for (list<string>::const_iterator it = terms->begin();
it != terms->end(); it++) {
if (!stringlowercmp(*it, term)) {
tboffs.push_back(pair<int, int>(bts, bte));
break;
}
}
return true;
}
};
static string plaintorich(const string &in, const list<string>& terms,
list<pair<int, int> >&termoffsets)
{
#if 0
{string t;
for (list<string>::const_iterator it = terms.begin();it != terms.end();it++)
t += "'" + *it + "' ";
LOGDEB(("plaintorich: term: %s\n", t.c_str()));
}
#endif
myTextSplitCB cb(terms);
TextSplit splitter(&cb);
splitter.text_to_words(in);
string out1;
if (cb.tboffs.empty()) {
out1 = in;
} else {
list<pair<int, int> >::iterator it = cb.tboffs.begin();
for (unsigned int i = 0; i < in.length() ; i++) {
if (it != cb.tboffs.end()) {
if (i == (unsigned int)it->first) {
out1 += "<termtag>";
} else if (i == (unsigned int)it->second) {
if (it != cb.tboffs.end())
it++;
out1 += "</termtag>";
}
}
out1 += in[i];
}
}
string out = "<qt><head><title></title></head><body><p>";
for (unsigned int i = 0; i < in.length() ; i++) {
if (in[i] == '\n') {
for (string::const_iterator it = out1.begin();it != out1.end(); it++) {
if (*it == '\n') {
out += "<br>";
// out += '\n';
} else {
out += in[i];
out += *it;
}
}
termoffsets = cb.tboffs;
return out;
}
@ -137,7 +192,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
int reldocnum = par - 1;
reslist_current = reldocnum;
previewTextEdit->clear();
LOGDEB(("Cleared preview\n"));
if (!rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) {
QMessageBox::warning(0, "Recoll",
QString("Can't retrieve document from database"));
@ -154,26 +209,28 @@ void RecollMain::reslistTE_clicked(int par, int car)
doc.mimetype.c_str());
return;
}
list<string> terms;
rcldb->getQueryTerms(terms);
list<pair<int, int> > termoffsets;
string rich = plaintorich(fdoc.text, terms, termoffsets);
string rich = plaintorich(fdoc.text);
#if 0
//Highlighting; pass a list of (search term, style name) to plaintorich
// and create the corresponding styles with different colors here
// We need to :
// - Break the query into terms : wait for the query analyzer
// - Break the text into words. This should use a version of
// textsplit with an option to keep the punctuation (see how to do
// this). We do want the same splitter code to be used here and
// when indexing.
QStyleSheetItem *item =
new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" );
item->setColor("red");
new QStyleSheetItem( previewTextEdit->styleSheet(), "termtag" );
item->setColor("blue");
item->setFontWeight(QFont::Bold);
#endif
QString str = QString::fromUtf8(rich.c_str(), rich.length());
previewTextEdit->setText(str);
int para = 0, index = 1;
if (!termoffsets.empty()) {
index = (termoffsets.begin())->first;
LOGDEB1(("Setting cursor position to para %d, index %d\n",para,index));
previewTextEdit->setCursorPosition(0, index);
}
previewTextEdit->ensureCursorVisible();
previewTextEdit->getCursorPosition(&para, &index);
LOGDEB1(("PREVIEW Paragraphs: %d. Cpos: %d %d\n",
previewTextEdit->paragraphs(), para, index));
}
@ -181,7 +238,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
// first page of results
void RecollMain::queryText_returnPressed()
{
LOGDEB(("RecollMain::queryText_returnPressed()\n"));
LOGDEB1(("RecollMain::queryText_returnPressed()\n"));
if (!rcldb->isopen()) {
string dbdir;
if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) {
@ -206,6 +263,7 @@ void RecollMain::queryText_returnPressed()
if (!rcldb->setQuery(string((const char *)u8)))
return;
list<string> terms;
listNextPB_clicked();
}
@ -234,7 +292,7 @@ void RecollMain::listPrevPB_clicked()
// Fill up result list window with next screen of hits
void RecollMain::listNextPB_clicked()
{
LOGDEB(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));
LOGDEB1(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));
if (reslist_winfirst < 0)
reslist_winfirst = 0;
@ -284,7 +342,7 @@ void RecollMain::listNextPB_clicked()
struct tm *tm = localtime(&mtime);
strftime(datebuf, 99, "<i>Modified:</i>&nbsp;%F&nbsp;%T", tm);
}
LOGDEB(("Abstract: %s\n", doc.abstract.c_str()));
LOGDEB1(("Abstract: %s\n", doc.abstract.c_str()));
string result = "<p>" +
string(perbuf) + " <b>" + doc.title + "</b><br>" +
doc.mimetype + "&nbsp;" +

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.18 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -171,20 +171,19 @@ bool Rcl::Db::isopen()
}
// A small class to hold state while splitting text
class wsData {
class mySplitterCB : public TextSplitCB {
public:
Xapian::Document &doc;
Xapian::termpos basepos; // Base for document section
Xapian::termpos curpos; // Last position sent to callback
wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
{}
bool takeword(const std::string &term, int pos, int, int);
};
// Callback for the document to word splitting class during indexation
static bool splitCb(void *cdata, const std::string &term, int pos)
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
{
wsData *data = (wsData*)cdata;
// cerr << "splitCb: term " << term << endl;
//string printable;
//transcode(term, printable, "UTF-8", "ISO8859-1");
@ -193,8 +192,8 @@ static bool splitCb(void *cdata, const std::string &term, int pos)
try {
// 1 is the value for wdfinc in index_text when called from omindex
// TOBEDONE: check what this is used for
data->curpos = pos;
data->doc.add_posting(term, data->basepos + data->curpos, 1);
curpos = pos;
doc.add_posting(term, basepos + curpos, 1);
} catch (...) {
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
return false;
@ -281,9 +280,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
Xapian::Document newdocument;
wsData splitData(newdocument);
mySplitterCB splitData(newdocument);
TextSplit splitter(splitCb, &splitData);
TextSplit splitter(&splitData);
string noacc;
if (!unac_cpp(doc.title, noacc)) {
@ -436,18 +435,16 @@ bool Rcl::Db::purge()
#include <vector>
class wsQData {
class wsQData : public TextSplitCB {
public:
vector<string> terms;
bool takeword(const std::string &term, int , int, int) {
terms.push_back(term);
return true;
}
};
// Callback for the query-to-words splitting
static bool splitQCb(void *cdata, const std::string &term, int )
{
wsQData *data = (wsQData*)cdata;
data->terms.push_back(term);
return true;
}
bool Rcl::Db::setQuery(const std::string &querystring)
{
@ -457,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
return false;
wsQData splitData;
TextSplit splitter(splitQCb, &splitData);
TextSplit splitter(&splitData);
string noacc;
if (!dumb_string(querystring, noacc)) {
@ -475,6 +472,21 @@ bool Rcl::Db::setQuery(const std::string &querystring)
return true;
}
// Copy the terms of the current query into 'terms'.
// The output list is emptied first, then receives one entry per term
// yielded by the query's term iterator.
// Returns false (leaving 'terms' untouched) if there is no native database
// object, true otherwise.
bool Rcl::Db::getQueryTerms(list<string>& terms)
{
    Native *ndb = (Native *)pdata;
    if (!ndb) {
        return false;
    }

    terms.clear();
    Xapian::TermIterator cur = ndb->query.get_terms_begin();
    Xapian::TermIterator last = ndb->query.get_terms_end();
    while (cur != last) {
        terms.push_back(*cur);
        ++cur;
    }
    return true;
}
int Rcl::Db::getResCnt()
{
Native *ndb = (Native *)pdata;

View File

@ -1,8 +1,9 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.8 2005-01-31 14:31:09 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.9 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
// rcldb defines an interface for a 'real' text database. The current
// implementation uses xapian only, and xapian-related code is in rcldb.cpp
@ -72,6 +73,7 @@ class Db {
// Parse query string and initialize query
bool setQuery(const std::string &q);
bool getQueryTerms(std::list<std::string>& terms);
// Get document at rank i. This is probably vastly inferior to the type
// of interface in Xapian, but we have to start with something simple