simple term highlighting in query preview
This commit is contained in:
parent
74434a3b02
commit
2a020407da
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_TEXTSPLIT
|
||||
|
||||
@ -7,6 +7,7 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Ex
|
||||
#include <string>
|
||||
|
||||
#include "textsplit.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -57,9 +58,12 @@ static void setcharclasses()
|
||||
init = 1;
|
||||
}
|
||||
|
||||
bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
||||
bool TextSplit::emitterm(string &w, int pos, bool doerase,
|
||||
int btstart, int btend)
|
||||
{
|
||||
if (!termsink)
|
||||
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
||||
|
||||
if (!cb)
|
||||
return false;
|
||||
|
||||
// Maybe trim end of word. These are chars that we would keep inside
|
||||
@ -77,7 +81,7 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
||||
}
|
||||
breakloop:
|
||||
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
|
||||
bool ret = termsink(cdata, w, pos);
|
||||
bool ret = cb->takeword(w, pos, btstart, btend);
|
||||
if (doerase)
|
||||
w.erase();
|
||||
return ret;
|
||||
@ -92,14 +96,16 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
|
||||
*/
|
||||
bool TextSplit::text_to_words(const string &in)
|
||||
{
|
||||
LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb));
|
||||
setcharclasses();
|
||||
string span;
|
||||
string word;
|
||||
bool number = false;
|
||||
int wordpos = 0;
|
||||
int spanpos = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (unsigned int i = 0; i < in.length(); i++) {
|
||||
for (i = 0; i < in.length(); i++) {
|
||||
int c = in[i];
|
||||
int cc = charclasses[c];
|
||||
switch (cc) {
|
||||
@ -107,10 +113,10 @@ bool TextSplit::text_to_words(const string &in)
|
||||
SPACE:
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(span, spanpos))
|
||||
if (!emitterm(span, spanpos, true, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(word, wordpos++))
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
}
|
||||
@ -127,10 +133,10 @@ bool TextSplit::text_to_words(const string &in)
|
||||
}
|
||||
} else {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(span, spanpos, false))
|
||||
if (!emitterm(span, spanpos, false, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(word, wordpos++))
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
span += c;
|
||||
@ -140,10 +146,10 @@ bool TextSplit::text_to_words(const string &in)
|
||||
case '@':
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length()) {
|
||||
if (!emitterm(span, spanpos, false))
|
||||
if (!emitterm(span, spanpos, false, i-span.length(), i))
|
||||
return false;
|
||||
}
|
||||
if (!emitterm(word, wordpos++))
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
} else
|
||||
@ -155,7 +161,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
word += c;
|
||||
} else {
|
||||
if (word.length()) {
|
||||
if (!emitterm(word, wordpos++))
|
||||
if (!emitterm(word, wordpos++, true, i-word.length(), i))
|
||||
return false;
|
||||
number = false;
|
||||
} else
|
||||
@ -202,9 +208,9 @@ bool TextSplit::text_to_words(const string &in)
|
||||
}
|
||||
if (word.length()) {
|
||||
if (span.length() != word.length())
|
||||
if (!emitterm(span, spanpos))
|
||||
if (!emitterm(span, spanpos, true, i-span.length(), i))
|
||||
return false;
|
||||
return emitterm(word, wordpos);
|
||||
return emitterm(word, wordpos, true, i-word.length(), i);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -222,12 +228,14 @@ bool TextSplit::text_to_words(const string &in)
|
||||
|
||||
using namespace std;
|
||||
|
||||
bool termsink(void *, const string &term, int pos)
|
||||
{
|
||||
cout << pos << " " << term << endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
// A small callback class that prints each word while splitting text
|
||||
// Callback used by the test driver below: main() hands an instance of this
// class to TextSplit, and every term received is written to stdout.
class mySplitterCB : public TextSplitCB {
|
||||
public:
|
||||
// Print the term position, the term itself, and its start (bs) and
// end (be) byte offsets, one term per line.
bool takeword(const std::string &term, int pos, int bs, int be) {
|
||||
cout << pos << " " << term << " bs " << bs << " be " << be << endl;
|
||||
// Always report success
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static string teststring =
|
||||
"jfd@okyz.com "
|
||||
@ -241,7 +249,8 @@ static string teststring =
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
TextSplit splitter(termsink, 0);
|
||||
mySplitterCB cb;
|
||||
TextSplit splitter(&cb);
|
||||
if (argc == 2) {
|
||||
string data;
|
||||
if (!file_to_string(argv[1], data))
|
||||
|
||||
@ -1,9 +1,20 @@
|
||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||
#define _TEXTSPLIT_H_INCLUDED_
|
||||
/* @(#$Id: textsplit.h,v 1.3 2005-01-24 13:17:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
// Function class which is called for every detected word
|
||||
// Abstract interface: TextSplit is constructed with a pointer to an object
// of a derived class and calls its takeword() method for the terms it emits.
class TextSplitCB {
|
||||
public:
|
||||
// Virtual destructor, as this class is meant to be derived from
virtual ~TextSplitCB() {}
|
||||
// The value returned here is handed back by TextSplit::emitterm();
// TextSplit::text_to_words() returns false as soon as an emitterm() call
// returns false.
virtual bool takeword(const std::string& term,
|
||||
int pos, // term pos
|
||||
int bts, // byte offset of first char in term
|
||||
int bte // byte offset of first char after term
|
||||
) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* Split text into words.
|
||||
* See comments at top of .cpp for more explanations.
|
||||
@ -11,19 +22,14 @@
|
||||
* but it's much simpler this way...
|
||||
*/
|
||||
class TextSplit {
|
||||
public:
|
||||
typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
|
||||
private:
|
||||
TermSink termsink;
|
||||
void *cdata;
|
||||
TextSplitCB *cb;
|
||||
int maxWordLength;
|
||||
bool emitterm(std::string &term, int pos, bool doerase);
|
||||
bool emitterm(std::string &term, int pos, bool doerase, int, int);
|
||||
public:
|
||||
/**
|
||||
* Constructor: just store callback and client data
|
||||
*/
|
||||
TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40)
|
||||
{}
|
||||
TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
|
||||
/**
|
||||
* Split text, emit words and positions.
|
||||
*/
|
||||
|
||||
@ -15,9 +15,13 @@
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include <utility>
|
||||
using std::pair;
|
||||
|
||||
#include <qmessagebox.h>
|
||||
#include <qcstring.h>
|
||||
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "rclconfig.h"
|
||||
#include "debuglog.h"
|
||||
@ -25,10 +29,12 @@
|
||||
#include "pathut.h"
|
||||
#include "recoll.h"
|
||||
#include "internfile.h"
|
||||
#include "textsplit.h"
|
||||
#include "smallut.h"
|
||||
|
||||
void RecollMain::fileExit()
|
||||
{
|
||||
LOGDEB(("RecollMain: fileExit\n"));
|
||||
LOGDEB1(("RecollMain: fileExit\n"));
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@ -52,17 +58,66 @@ void RecollMain::fileStart_IndexingAction_activated()
|
||||
startindexing = 1;
|
||||
}
|
||||
|
||||
static string plaintorich(const string &in)
|
||||
// Text splitter callback used to take note of the query terms byte offsets
|
||||
// inside the text. These offsets are then used to insert the highlight tags.
|
||||
class myTextSplitCB : public TextSplitCB {
|
||||
public:
|
||||
// (start, end) byte offsets of every word which matched a query term,
// in the order in which takeword() was called.
list<pair<int, int> > tboffs;
|
||||
// Query terms to look for. Only the address of the caller's list is
// stored, so that list has to outlive this object.
const list<string> *terms;
|
||||
myTextSplitCB(const list<string>& terms) : terms(&terms) {}
|
||||
// Record the byte offsets of 'term' if it compares equal to one of the
// query terms. The term position (2nd parameter) is not used.
virtual bool takeword(const std::string& term, int, int bts, int bte) {
|
||||
for (list<string>::const_iterator it = terms->begin();
|
||||
it != terms->end(); it++) {
|
||||
// NOTE(review): stringlowercmp() is presumably a case-insensitive
// compare returning 0 on equality -- confirm against smallut.h
if (!stringlowercmp(*it, term)) {
|
||||
tboffs.push_back(pair<int, int>(bts, bte));
|
||||
// One entry per word is enough: stop at the first matching term
break;
|
||||
}
|
||||
}
|
||||
// Always return true, whether or not the word matched
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static string plaintorich(const string &in, const list<string>& terms,
|
||||
list<pair<int, int> >&termoffsets)
|
||||
{
|
||||
#if 0
|
||||
{string t;
|
||||
for (list<string>::const_iterator it = terms.begin();it != terms.end();it++)
|
||||
t += "'" + *it + "' ";
|
||||
LOGDEB(("plaintorich: term: %s\n", t.c_str()));
|
||||
}
|
||||
#endif
|
||||
myTextSplitCB cb(terms);
|
||||
TextSplit splitter(&cb);
|
||||
splitter.text_to_words(in);
|
||||
string out1;
|
||||
if (cb.tboffs.empty()) {
|
||||
out1 = in;
|
||||
} else {
|
||||
list<pair<int, int> >::iterator it = cb.tboffs.begin();
|
||||
for (unsigned int i = 0; i < in.length() ; i++) {
|
||||
if (it != cb.tboffs.end()) {
|
||||
if (i == (unsigned int)it->first) {
|
||||
out1 += "<termtag>";
|
||||
} else if (i == (unsigned int)it->second) {
|
||||
if (it != cb.tboffs.end())
|
||||
it++;
|
||||
out1 += "</termtag>";
|
||||
}
|
||||
}
|
||||
out1 += in[i];
|
||||
}
|
||||
}
|
||||
string out = "<qt><head><title></title></head><body><p>";
|
||||
for (unsigned int i = 0; i < in.length() ; i++) {
|
||||
if (in[i] == '\n') {
|
||||
for (string::const_iterator it = out1.begin();it != out1.end(); it++) {
|
||||
if (*it == '\n') {
|
||||
out += "<br>";
|
||||
// out += '\n';
|
||||
} else {
|
||||
out += in[i];
|
||||
out += *it;
|
||||
}
|
||||
}
|
||||
termoffsets = cb.tboffs;
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -137,7 +192,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
int reldocnum = par - 1;
|
||||
reslist_current = reldocnum;
|
||||
previewTextEdit->clear();
|
||||
LOGDEB(("Cleared preview\n"));
|
||||
|
||||
if (!rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QString("Can't retrieve document from database"));
|
||||
@ -154,26 +209,28 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
doc.mimetype.c_str());
|
||||
return;
|
||||
}
|
||||
list<string> terms;
|
||||
rcldb->getQueryTerms(terms);
|
||||
list<pair<int, int> > termoffsets;
|
||||
string rich = plaintorich(fdoc.text, terms, termoffsets);
|
||||
|
||||
string rich = plaintorich(fdoc.text);
|
||||
|
||||
#if 0
|
||||
//Highlighting; pass a list of (search term, style name) to plaintorich
|
||||
// and create the corresponding styles with different colors here
|
||||
// We need to :
|
||||
// - Break the query into terms : wait for the query analyzer
|
||||
// - Break the text into words. This should use a version of
|
||||
// textsplit with an option to keep the punctuation (see how to do
|
||||
// this). We do want the same splitter code to be used here and
|
||||
// when indexing.
|
||||
QStyleSheetItem *item =
|
||||
new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" );
|
||||
item->setColor("red");
|
||||
new QStyleSheetItem( previewTextEdit->styleSheet(), "termtag" );
|
||||
item->setColor("blue");
|
||||
item->setFontWeight(QFont::Bold);
|
||||
#endif
|
||||
|
||||
QString str = QString::fromUtf8(rich.c_str(), rich.length());
|
||||
previewTextEdit->setText(str);
|
||||
int para = 0, index = 1;
|
||||
if (!termoffsets.empty()) {
|
||||
index = (termoffsets.begin())->first;
|
||||
LOGDEB1(("Setting cursor position to para %d, index %d\n",para,index));
|
||||
previewTextEdit->setCursorPosition(0, index);
|
||||
}
|
||||
previewTextEdit->ensureCursorVisible();
|
||||
previewTextEdit->getCursorPosition(¶, &index);
|
||||
LOGDEB1(("PREVIEW Paragraphs: %d. Cpos: %d %d\n",
|
||||
previewTextEdit->paragraphs(), para, index));
|
||||
}
|
||||
|
||||
|
||||
@ -181,7 +238,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
// first page of results
|
||||
void RecollMain::queryText_returnPressed()
|
||||
{
|
||||
LOGDEB(("RecollMain::queryText_returnPressed()\n"));
|
||||
LOGDEB1(("RecollMain::queryText_returnPressed()\n"));
|
||||
if (!rcldb->isopen()) {
|
||||
string dbdir;
|
||||
if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) {
|
||||
@ -206,6 +263,7 @@ void RecollMain::queryText_returnPressed()
|
||||
|
||||
if (!rcldb->setQuery(string((const char *)u8)))
|
||||
return;
|
||||
list<string> terms;
|
||||
listNextPB_clicked();
|
||||
}
|
||||
|
||||
@ -234,7 +292,7 @@ void RecollMain::listPrevPB_clicked()
|
||||
// Fill up result list window with next screen of hits
|
||||
void RecollMain::listNextPB_clicked()
|
||||
{
|
||||
LOGDEB(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));
|
||||
LOGDEB1(("listNextPB_clicked: winfirst %d\n", reslist_winfirst));
|
||||
|
||||
if (reslist_winfirst < 0)
|
||||
reslist_winfirst = 0;
|
||||
@ -284,7 +342,7 @@ void RecollMain::listNextPB_clicked()
|
||||
struct tm *tm = localtime(&mtime);
|
||||
strftime(datebuf, 99, "<i>Modified:</i> %F %T", tm);
|
||||
}
|
||||
LOGDEB(("Abstract: %s\n", doc.abstract.c_str()));
|
||||
LOGDEB1(("Abstract: %s\n", doc.abstract.c_str()));
|
||||
string result = "<p>" +
|
||||
string(perbuf) + " <b>" + doc.title + "</b><br>" +
|
||||
doc.mimetype + " " +
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.18 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -171,20 +171,19 @@ bool Rcl::Db::isopen()
|
||||
}
|
||||
|
||||
// A small class to hold state while splitting text
|
||||
class wsData {
|
||||
class mySplitterCB : public TextSplitCB {
|
||||
public:
|
||||
Xapian::Document &doc;
|
||||
Xapian::termpos basepos; // Base for document section
|
||||
Xapian::termpos curpos; // Last position sent to callback
|
||||
wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
||||
mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
||||
{}
|
||||
bool takeword(const std::string &term, int pos, int, int);
|
||||
};
|
||||
|
||||
// Callback for the document to word splitting class during indexation
|
||||
static bool splitCb(void *cdata, const std::string &term, int pos)
|
||||
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
||||
{
|
||||
wsData *data = (wsData*)cdata;
|
||||
|
||||
// cerr << "splitCb: term " << term << endl;
|
||||
//string printable;
|
||||
//transcode(term, printable, "UTF-8", "ISO8859-1");
|
||||
@ -193,8 +192,8 @@ static bool splitCb(void *cdata, const std::string &term, int pos)
|
||||
try {
|
||||
// 1 is the value for wdfinc in index_text when called from omindex
|
||||
// TOBEDONE: check what this is used for
|
||||
data->curpos = pos;
|
||||
data->doc.add_posting(term, data->basepos + data->curpos, 1);
|
||||
curpos = pos;
|
||||
doc.add_posting(term, basepos + curpos, 1);
|
||||
} catch (...) {
|
||||
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
|
||||
return false;
|
||||
@ -281,9 +280,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
|
||||
Xapian::Document newdocument;
|
||||
|
||||
wsData splitData(newdocument);
|
||||
mySplitterCB splitData(newdocument);
|
||||
|
||||
TextSplit splitter(splitCb, &splitData);
|
||||
TextSplit splitter(&splitData);
|
||||
|
||||
string noacc;
|
||||
if (!unac_cpp(doc.title, noacc)) {
|
||||
@ -436,18 +435,16 @@ bool Rcl::Db::purge()
|
||||
|
||||
#include <vector>
|
||||
|
||||
class wsQData {
|
||||
// Splitter callback which just collects the terms it receives.
class wsQData : public TextSplitCB {
|
||||
public:
|
||||
// Terms received so far, in the order of the takeword() calls
vector<string> terms;
|
||||
|
||||
// Store the term; the position and byte offsets are ignored here.
bool takeword(const std::string &term, int , int, int) {
|
||||
terms.push_back(term);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// Callback for the query-to-words splitting
|
||||
static bool splitQCb(void *cdata, const std::string &term, int )
|
||||
{
|
||||
wsQData *data = (wsQData*)cdata;
|
||||
data->terms.push_back(term);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Rcl::Db::setQuery(const std::string &querystring)
|
||||
{
|
||||
@ -457,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
|
||||
return false;
|
||||
|
||||
wsQData splitData;
|
||||
TextSplit splitter(splitQCb, &splitData);
|
||||
TextSplit splitter(&splitData);
|
||||
|
||||
string noacc;
|
||||
if (!dumb_string(querystring, noacc)) {
|
||||
@ -475,6 +472,21 @@ bool Rcl::Db::setQuery(const std::string &querystring)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Copy the terms of the stored Xapian query (ndb->query) into 'terms'.
// Returns false, leaving 'terms' untouched, if there is no native db data.
// Otherwise any previous content of 'terms' is discarded and true is returned.
bool Rcl::Db::getQueryTerms(list<string>& terms)
|
||||
{
|
||||
Native *ndb = (Native *)pdata;
|
||||
if (!ndb)
|
||||
return false;
|
||||
|
||||
// Start from an empty list so that results from earlier calls don't pile up
terms.clear();
|
||||
Xapian::TermIterator it;
|
||||
// Walk the terms of the query object and append each one
for (it = ndb->query.get_terms_begin(); it != ndb->query.get_terms_end();
|
||||
it++) {
|
||||
terms.push_back(*it);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int Rcl::Db::getResCnt()
|
||||
{
|
||||
Native *ndb = (Native *)pdata;
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.8 2005-01-31 14:31:09 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.9 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
|
||||
// rcldb defines an interface for a 'real' text database. The current
|
||||
// implementation uses xapian only, and xapian-related code is in rcldb.cpp
|
||||
@ -72,6 +73,7 @@ class Db {
|
||||
|
||||
// Parse query string and initialize query
|
||||
bool setQuery(const std::string &q);
|
||||
bool getQueryTerms(std::list<std::string>& terms);
|
||||
|
||||
// Get document at rank i. This is probably vastly inferior to the type
|
||||
// of interface in Xapian, but we have to start with something simple
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user