simple term highlighting in query preview

This commit is contained in:
dockes 2005-02-07 13:17:47 +00:00
parent 74434a3b02
commit 2a020407da
5 changed files with 160 additions and 73 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_TEXTSPLIT #ifndef TEST_TEXTSPLIT
@ -7,6 +7,7 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Ex
#include <string> #include <string>
#include "textsplit.h" #include "textsplit.h"
#include "debuglog.h"
using namespace std; using namespace std;
@ -57,9 +58,12 @@ static void setcharclasses()
init = 1; init = 1;
} }
bool TextSplit::emitterm(string &w, int pos, bool doerase = true) bool TextSplit::emitterm(string &w, int pos, bool doerase,
int btstart, int btend)
{ {
if (!termsink) LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
if (!cb)
return false; return false;
// Maybe trim end of word. These are chars that we would keep inside // Maybe trim end of word. These are chars that we would keep inside
@ -77,7 +81,7 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
} }
breakloop: breakloop:
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
bool ret = termsink(cdata, w, pos); bool ret = cb->takeword(w, pos, btstart, btend);
if (doerase) if (doerase)
w.erase(); w.erase();
return ret; return ret;
@ -92,14 +96,16 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
*/ */
bool TextSplit::text_to_words(const string &in) bool TextSplit::text_to_words(const string &in)
{ {
LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb));
setcharclasses(); setcharclasses();
string span; string span;
string word; string word;
bool number = false; bool number = false;
int wordpos = 0; int wordpos = 0;
int spanpos = 0; int spanpos = 0;
unsigned int i;
for (unsigned int i = 0; i < in.length(); i++) { for (i = 0; i < in.length(); i++) {
int c = in[i]; int c = in[i];
int cc = charclasses[c]; int cc = charclasses[c];
switch (cc) { switch (cc) {
@ -107,10 +113,10 @@ bool TextSplit::text_to_words(const string &in)
SPACE: SPACE:
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (span.length() != word.length()) {
if (!emitterm(span, spanpos)) if (!emitterm(span, spanpos, true, i-span.length(), i))
return false; return false;
} }
if (!emitterm(word, wordpos++)) if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} }
@ -127,10 +133,10 @@ bool TextSplit::text_to_words(const string &in)
} }
} else { } else {
if (span.length() != word.length()) { if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false)) if (!emitterm(span, spanpos, false, i-span.length(), i))
return false; return false;
} }
if (!emitterm(word, wordpos++)) if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
span += c; span += c;
@ -140,10 +146,10 @@ bool TextSplit::text_to_words(const string &in)
case '@': case '@':
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) { if (span.length() != word.length()) {
if (!emitterm(span, spanpos, false)) if (!emitterm(span, spanpos, false, i-span.length(), i))
return false; return false;
} }
if (!emitterm(word, wordpos++)) if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} else } else
@ -155,7 +161,7 @@ bool TextSplit::text_to_words(const string &in)
word += c; word += c;
} else { } else {
if (word.length()) { if (word.length()) {
if (!emitterm(word, wordpos++)) if (!emitterm(word, wordpos++, true, i-word.length(), i))
return false; return false;
number = false; number = false;
} else } else
@ -202,9 +208,9 @@ bool TextSplit::text_to_words(const string &in)
} }
if (word.length()) { if (word.length()) {
if (span.length() != word.length()) if (span.length() != word.length())
if (!emitterm(span, spanpos)) if (!emitterm(span, spanpos, true, i-span.length(), i))
return false; return false;
return emitterm(word, wordpos); return emitterm(word, wordpos, true, i-word.length(), i);
} }
return true; return true;
} }
@ -222,12 +228,14 @@ bool TextSplit::text_to_words(const string &in)
using namespace std; using namespace std;
bool termsink(void *, const string &term, int pos) // A small class to hold state while splitting text
{ class mySplitterCB : public TextSplitCB {
cout << pos << " " << term << endl; public:
return true; bool takeword(const std::string &term, int pos, int bs, int be) {
} cout << pos << " " << term << " bs " << bs << " be " << be << endl;
return true;
}
};
static string teststring = static string teststring =
"jfd@okyz.com " "jfd@okyz.com "
@ -241,7 +249,8 @@ static string teststring =
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
TextSplit splitter(termsink, 0); mySplitterCB cb;
TextSplit splitter(&cb);
if (argc == 2) { if (argc == 2) {
string data; string data;
if (!file_to_string(argv[1], data)) if (!file_to_string(argv[1], data))

View File

@ -1,9 +1,20 @@
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.3 2005-01-24 13:17:58 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
/**
 * Callback interface for the text splitter: takeword() is the hook that
 * is called for every detected word.
 */
class TextSplitCB {
 public:
    /**
     * Receive one detected word.
     *
     * @param term the word text
     * @param pos  term position
     * @param bts  byte offset of the first char in the term
     * @param bte  byte offset of the first char after the term
     * @return value handed back to the splitter; TextSplit::emitterm()
     *         returns it to text_to_words(), which stops splitting when
     *         it is false.
     */
    virtual bool takeword(const std::string& term, int pos, int bts,
                          int bte) = 0;

    /** Virtual so that implementations can be destroyed through a base pointer. */
    virtual ~TextSplitCB() {}
};
/** /**
* Split text into words. * Split text into words.
* See comments at top of .cpp for more explanations. * See comments at top of .cpp for more explanations.
@ -11,19 +22,14 @@
* but 'ts much simpler this way... * but 'ts much simpler this way...
*/ */
class TextSplit { class TextSplit {
public: TextSplitCB *cb;
typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
private:
TermSink termsink;
void *cdata;
int maxWordLength; int maxWordLength;
bool emitterm(std::string &term, int pos, bool doerase); bool emitterm(std::string &term, int pos, bool doerase, int, int);
public: public:
/** /**
* Constructor: just store callback and client data * Constructor: just store callback and client data
*/ */
TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40) TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
{}
/** /**
* Split text, emit words and positions. * Split text, emit words and positions.
*/ */

View File

@ -15,9 +15,13 @@
#include <unistd.h> #include <unistd.h>
#include <fcntl.h> #include <fcntl.h>
#include <utility>
using std::pair;
#include <qmessagebox.h> #include <qmessagebox.h>
#include <qcstring.h> #include <qcstring.h>
#include "rcldb.h" #include "rcldb.h"
#include "rclconfig.h" #include "rclconfig.h"
#include "debuglog.h" #include "debuglog.h"
@ -25,10 +29,12 @@
#include "pathut.h" #include "pathut.h"
#include "recoll.h" #include "recoll.h"
#include "internfile.h" #include "internfile.h"
#include "textsplit.h"
#include "smallut.h"
void RecollMain::fileExit() void RecollMain::fileExit()
{ {
LOGDEB(("RecollMain: fileExit\n")); LOGDEB1(("RecollMain: fileExit\n"));
exit(0); exit(0);
} }
@ -52,17 +58,66 @@ void RecollMain::fileStart_IndexingAction_activated()
startindexing = 1; startindexing = 1;
} }
static string plaintorich(const string &in) // Text splitter callback used to take note of the query terms byte offsets
// inside the text. This is then used to post highlight tags.
class myTextSplitCB : public TextSplitCB {
public:
list<pair<int, int> > tboffs;
const list<string> *terms;
myTextSplitCB(const list<string>& terms) : terms(&terms) {}
virtual bool takeword(const std::string& term, int, int bts, int bte) {
for (list<string>::const_iterator it = terms->begin();
it != terms->end(); it++) {
if (!stringlowercmp(*it, term)) {
tboffs.push_back(pair<int, int>(bts, bte));
break;
}
}
return true;
}
};
// Turn plain document text into the Qt rich text shown in the preview.
//
// The text is split with TextSplit; every word matching one of the query
// terms is wrapped in <termtag>...</termtag>, newlines become <br>, and
// the markup characters '<', '>' and '&' coming from the document are
// escaped so that they display literally instead of being parsed as tags.
//
// @param in          plain text of the document
// @param terms       query terms to highlight
// @param termoffsets out: (start, end) byte offsets of the matched words.
//                    These are offsets into 'in', not into the returned
//                    string.
// @return the rich text, starting with the <qt>...<body><p> prologue.
static string plaintorich(const string &in, const list<string>& terms,
                          list<pair<int, int> >&termoffsets)
{
#if 0
    {string t;
    for (list<string>::const_iterator it = terms.begin();it != terms.end();it++)
        t += "'" + *it + "' ";
    LOGDEB(("plaintorich: term: %s\n", t.c_str()));
    }
#endif
    myTextSplitCB cb(terms);
    TextSplit splitter(&cb);
    splitter.text_to_words(in);

    string out = "<qt><head><title></title></head><body><p>";

    list<pair<int, int> >::const_iterator it = cb.tboffs.begin();
    const list<pair<int, int> >::const_iterator itend = cb.tboffs.end();
    bool tagopen = false;
    for (string::size_type i = 0; i < in.length(); i++) {
        const int pos = static_cast<int>(i);

        // Close the current highlight once its end offset is reached.
        if (tagopen && pos >= it->second) {
            out += "</termtag>";
            tagopen = false;
            ++it;
        }
        if (!tagopen) {
            // The splitter reports both spans and the words inside them,
            // so an entry may start before the current position (it
            // overlaps one already handled) or be empty. Skip those:
            // waiting on them would stall highlighting for the rest of
            // the text.
            while (it != itend &&
                   (it->first < pos || it->second <= it->first))
                ++it;
            if (it != itend && it->first == pos) {
                out += "<termtag>";
                tagopen = true;
            }
        }

        switch (in[i]) {
        case '\n': out += "<br>"; break;
        case '<': out += "&lt;"; break;
        case '>': out += "&gt;"; break;
        case '&': out += "&amp;"; break;
        default: out += in[i]; break;
        }
    }
    // A match ending exactly at the end of the text is still open here.
    if (tagopen)
        out += "</termtag>";

    termoffsets = cb.tboffs;
    return out;
}
@ -137,7 +192,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
int reldocnum = par - 1; int reldocnum = par - 1;
reslist_current = reldocnum; reslist_current = reldocnum;
previewTextEdit->clear(); previewTextEdit->clear();
LOGDEB(("Cleared preview\n"));
if (!rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) { if (!rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) {
QMessageBox::warning(0, "Recoll", QMessageBox::warning(0, "Recoll",
QString("Can't retrieve document from database")); QString("Can't retrieve document from database"));
@ -154,26 +209,28 @@ void RecollMain::reslistTE_clicked(int par, int car)
doc.mimetype.c_str()); doc.mimetype.c_str());
return; return;
} }
list<string> terms;
rcldb->getQueryTerms(terms);
list<pair<int, int> > termoffsets;
string rich = plaintorich(fdoc.text, terms, termoffsets);
string rich = plaintorich(fdoc.text);
#if 0
//Highlighting; pass a list of (search term, style name) to plaintorich
// and create the corresponding styles with different colors here
// We need to :
// - Break the query into terms : wait for the query analyzer
// - Break the text into words. This should use a version of
// textsplit with an option to keep the punctuation (see how to do
// this). We do want the same splitter code to be used here and
// when indexing.
QStyleSheetItem *item = QStyleSheetItem *item =
new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" ); new QStyleSheetItem( previewTextEdit->styleSheet(), "termtag" );
item->setColor("red"); item->setColor("blue");
item->setFontWeight(QFont::Bold); item->setFontWeight(QFont::Bold);
#endif
QString str = QString::fromUtf8(rich.c_str(), rich.length()); QString str = QString::fromUtf8(rich.c_str(), rich.length());
previewTextEdit->setText(str); previewTextEdit->setText(str);
int para = 0, index = 1;
if (!termoffsets.empty()) {
index = (termoffsets.begin())->first;
LOGDEB1(("Setting cursor position to para %d, index %d\n",para,index));
previewTextEdit->setCursorPosition(0, index);
}
previewTextEdit->ensureCursorVisible();
previewTextEdit->getCursorPosition(&para, &index);
LOGDEB1(("PREVIEW Paragraphs: %d. Cpos: %d %d\n",
previewTextEdit->paragraphs(), para, index));
} }
@ -181,7 +238,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
// first page of results // first page of results
void RecollMain::queryText_returnPressed() void RecollMain::queryText_returnPressed()
{ {
LOGDEB(("RecollMain::queryText_returnPressed()\n")); LOGDEB1(("RecollMain::queryText_returnPressed()\n"));
if (!rcldb->isopen()) { if (!rcldb->isopen()) {
string dbdir; string dbdir;
if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) { if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) {
@ -206,6 +263,7 @@ void RecollMain::queryText_returnPressed()
if (!rcldb->setQuery(string((const char *)u8))) if (!rcldb->setQuery(string((const char *)u8)))
return; return;
list<string> terms;
listNextPB_clicked(); listNextPB_clicked();
} }
@ -234,7 +292,7 @@ void RecollMain::listPrevPB_clicked()
// Fill up result list window with next screen of hits // Fill up result list window with next screen of hits
void RecollMain::listNextPB_clicked() void RecollMain::listNextPB_clicked()
{ {
LOGDEB(("listNextPB_clicked: winfirst %d\n", reslist_winfirst)); LOGDEB1(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));
if (reslist_winfirst < 0) if (reslist_winfirst < 0)
reslist_winfirst = 0; reslist_winfirst = 0;
@ -284,7 +342,7 @@ void RecollMain::listNextPB_clicked()
struct tm *tm = localtime(&mtime); struct tm *tm = localtime(&mtime);
strftime(datebuf, 99, "<i>Modified:</i>&nbsp;%F&nbsp;%T", tm); strftime(datebuf, 99, "<i>Modified:</i>&nbsp;%F&nbsp;%T", tm);
} }
LOGDEB(("Abstract: %s\n", doc.abstract.c_str())); LOGDEB1(("Abstract: %s\n", doc.abstract.c_str()));
string result = "<p>" + string result = "<p>" +
string(perbuf) + " <b>" + doc.title + "</b><br>" + string(perbuf) + " <b>" + doc.title + "</b><br>" +
doc.mimetype + "&nbsp;" + doc.mimetype + "&nbsp;" +

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.18 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -171,20 +171,19 @@ bool Rcl::Db::isopen()
} }
// A small class to hold state while splitting text // A small class to hold state while splitting text
class wsData { class mySplitterCB : public TextSplitCB {
public: public:
Xapian::Document &doc; Xapian::Document &doc;
Xapian::termpos basepos; // Base for document section Xapian::termpos basepos; // Base for document section
Xapian::termpos curpos; // Last position sent to callback Xapian::termpos curpos; // Last position sent to callback
wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0) mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
{} {}
bool takeword(const std::string &term, int pos, int, int);
}; };
// Callback for the document to word splitting class during indexation // Callback for the document to word splitting class during indexation
static bool splitCb(void *cdata, const std::string &term, int pos) bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
{ {
wsData *data = (wsData*)cdata;
// cerr << "splitCb: term " << term << endl; // cerr << "splitCb: term " << term << endl;
//string printable; //string printable;
//transcode(term, printable, "UTF-8", "ISO8859-1"); //transcode(term, printable, "UTF-8", "ISO8859-1");
@ -193,8 +192,8 @@ static bool splitCb(void *cdata, const std::string &term, int pos)
try { try {
// 1 is the value for wdfinc in index_text when called from omindex // 1 is the value for wdfinc in index_text when called from omindex
// TOBEDONE: check what this is used for // TOBEDONE: check what this is used for
data->curpos = pos; curpos = pos;
data->doc.add_posting(term, data->basepos + data->curpos, 1); doc.add_posting(term, basepos + curpos, 1);
} catch (...) { } catch (...) {
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n")); LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
return false; return false;
@ -281,9 +280,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
Xapian::Document newdocument; Xapian::Document newdocument;
wsData splitData(newdocument); mySplitterCB splitData(newdocument);
TextSplit splitter(splitCb, &splitData); TextSplit splitter(&splitData);
string noacc; string noacc;
if (!unac_cpp(doc.title, noacc)) { if (!unac_cpp(doc.title, noacc)) {
@ -436,18 +435,16 @@ bool Rcl::Db::purge()
#include <vector> #include <vector>
class wsQData { class wsQData : public TextSplitCB {
public: public:
vector<string> terms; vector<string> terms;
bool takeword(const std::string &term, int , int, int) {
terms.push_back(term);
return true;
}
}; };
// Callback for the query-to-words splitting
static bool splitQCb(void *cdata, const std::string &term, int )
{
wsQData *data = (wsQData*)cdata;
data->terms.push_back(term);
return true;
}
bool Rcl::Db::setQuery(const std::string &querystring) bool Rcl::Db::setQuery(const std::string &querystring)
{ {
@ -457,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
return false; return false;
wsQData splitData; wsQData splitData;
TextSplit splitter(splitQCb, &splitData); TextSplit splitter(&splitData);
string noacc; string noacc;
if (!dumb_string(querystring, noacc)) { if (!dumb_string(querystring, noacc)) {
@ -475,6 +472,21 @@ bool Rcl::Db::setQuery(const std::string &querystring)
return true; return true;
} }
// Replace the contents of 'terms' with the terms of the current query.
//
// @param terms out: receives one entry per term yielded by the query's
//              term iterator; cleared first.
// @return false, leaving 'terms' untouched, when there is no native
//         database object; true otherwise.
bool Rcl::Db::getQueryTerms(list<string>& terms)
{
    Native *ndb = (Native *)pdata;
    if (!ndb) {
        return false;
    }

    terms.clear();
    Xapian::TermIterator cur = ndb->query.get_terms_begin();
    const Xapian::TermIterator last = ndb->query.get_terms_end();
    for (; cur != last; ++cur) {
        terms.push_back(*cur);
    }
    return true;
}
int Rcl::Db::getResCnt() int Rcl::Db::getResCnt()
{ {
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;

View File

@ -1,8 +1,9 @@
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.8 2005-01-31 14:31:09 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.9 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list>
// rcldb defines an interface for a 'real' text database. The current // rcldb defines an interface for a 'real' text database. The current
// implementation uses xapian only, and xapian-related code is in rcldb.cpp // implementation uses xapian only, and xapian-related code is in rcldb.cpp
@ -72,6 +73,7 @@ class Db {
// Parse query string and initialize query // Parse query string and initialize query
bool setQuery(const std::string &q); bool setQuery(const std::string &q);
bool getQueryTerms(std::list<std::string>& terms);
// Get document at rank i. This is probably vastly inferior to the type // Get document at rank i. This is probably vastly inferior to the type
// of interface in Xapian, but we have to start with something simple // of interface in Xapian, but we have to start with something simple