diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index efa8bc37..c682017e 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -7,6 +7,7 @@ static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Ex #include #include "textsplit.h" +#include "debuglog.h" using namespace std; @@ -57,9 +58,12 @@ static void setcharclasses() init = 1; } -bool TextSplit::emitterm(string &w, int pos, bool doerase = true) +bool TextSplit::emitterm(string &w, int pos, bool doerase, + int btstart, int btend) { - if (!termsink) + LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); + + if (!cb) return false; // Maybe trim end of word. These are chars that we would keep inside @@ -77,7 +81,7 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true) } breakloop: if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { - bool ret = termsink(cdata, w, pos); + bool ret = cb->takeword(w, pos, btstart, btend); if (doerase) w.erase(); return ret; @@ -92,14 +96,16 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase = true) */ bool TextSplit::text_to_words(const string &in) { + LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb)); setcharclasses(); string span; string word; bool number = false; int wordpos = 0; int spanpos = 0; + unsigned int i; - for (unsigned int i = 0; i < in.length(); i++) { + for (i = 0; i < in.length(); i++) { int c = in[i]; int cc = charclasses[c]; switch (cc) { @@ -107,10 +113,10 @@ bool TextSplit::text_to_words(const string &in) SPACE: if (word.length()) { if (span.length() != word.length()) { - if (!emitterm(span, spanpos)) + if (!emitterm(span, spanpos, true, i-span.length(), i)) return false; } - if (!emitterm(word, wordpos++)) + if (!emitterm(word, wordpos++, true, i-word.length(), i)) return false; number = false; } @@ -127,10 +133,10 @@ bool TextSplit::text_to_words(const string &in) } } else { if (span.length() != word.length()) { - if (!emitterm(span, spanpos, false)) + if (!emitterm(span, spanpos, false, i-span.length(), i)) return false; } - if (!emitterm(word, wordpos++)) + if (!emitterm(word, wordpos++, true, i-word.length(), i)) return false; number = false; span += c; @@ -140,10 +146,10 @@ bool TextSplit::text_to_words(const string &in) case '@': if (word.length()) { if (span.length() != word.length()) { - if (!emitterm(span, spanpos, false)) + if (!emitterm(span, spanpos, false, i-span.length(), i)) return false; } - if (!emitterm(word, wordpos++)) + if (!emitterm(word, wordpos++, true, i-word.length(), i)) return false; number = false; } else @@ -155,7 +161,7 @@ bool TextSplit::text_to_words(const string &in) word += c; } else { if (word.length()) { - if (!emitterm(word, wordpos++)) + if (!emitterm(word, wordpos++, true, i-word.length(), i)) return false; number = false; } else @@ -202,9 +208,9 @@ bool TextSplit::text_to_words(const string &in) } if (word.length()) { if (span.length() != word.length()) - if (!emitterm(span, spanpos)) + if (!emitterm(span, spanpos, true, i-span.length(), i)) return false; - return emitterm(word, wordpos); + return emitterm(word, wordpos, true, i-word.length(), i); } return true; } @@ -222,12 +228,14 @@ bool TextSplit::text_to_words(const string &in) using namespace std; -bool termsink(void *, const string &term, int pos) -{ - cout << pos << " " << term << endl; - return true; -} - +// A small class to hold state while splitting text +class mySplitterCB : public TextSplitCB { + public: + bool takeword(const std::string &term, int pos, int bs, int be) { + cout << pos << " " << term << " bs " << bs << " be " << be << endl; + return true; + } +}; static string teststring = "jfd@okyz.com " @@ -241,7 +249,8 @@ static string teststring = int main(int argc, char **argv) { - TextSplit splitter(termsink, 0); + mySplitterCB cb; + TextSplit splitter(&cb); if (argc == 2) { string data; if (!file_to_string(argv[1], data)) diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 36a6cf76..0f1cb1af 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -1,9 +1,20 @@ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.3 2005-01-24 13:17:58 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */ #include +// Function class whose called for every detected word +class TextSplitCB { + public: + virtual ~TextSplitCB() {} + virtual bool takeword(const std::string& term, + int pos, // term pos + int bts, // byte offset of first char in term + int bte // byte offset of first char after term + ) = 0; +}; + /** * Split text into words. * See comments at top of .cpp for more explanations. @@ -11,19 +22,14 @@ * but 'ts much simpler this way... */ class TextSplit { - public: - typedef bool (*TermSink)(void *cdata, const std::string & term, int pos); - private: - TermSink termsink; - void *cdata; + TextSplitCB *cb; int maxWordLength; - bool emitterm(std::string &term, int pos, bool doerase); + bool emitterm(std::string &term, int pos, bool doerase, int, int); public: /** * Constructor: just store callback and client data */ - TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40) - {} + TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {} /** * Split text, emit words and positions. */ diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index 2b5bcb12..2fb686eb 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -15,9 +15,13 @@ #include #include +#include +using std::pair; + #include #include + #include "rcldb.h" #include "rclconfig.h" #include "debuglog.h" @@ -25,10 +29,12 @@ #include "pathut.h" #include "recoll.h" #include "internfile.h" +#include "textsplit.h" +#include "smallut.h" void RecollMain::fileExit() { - LOGDEB(("RecollMain: fileExit\n")); + LOGDEB1(("RecollMain: fileExit\n")); exit(0); } @@ -52,17 +58,66 @@ void RecollMain::fileStart_IndexingAction_activated() startindexing = 1; } -static string plaintorich(const string &in) +// Text splitter callback used to take note of the query terms byte offsets +// inside the text. This is then used to post highlight tags. +class myTextSplitCB : public TextSplitCB { + public: + list > tboffs; + const list *terms; + myTextSplitCB(const list& terms) : terms(&terms) {} + virtual bool takeword(const std::string& term, int, int bts, int bte) { + for (list::const_iterator it = terms->begin(); + it != terms->end(); it++) { + if (!stringlowercmp(*it, term)) { + tboffs.push_back(pair(bts, bte)); + break; + } + } + return true; + } +}; + +static string plaintorich(const string &in, const list& terms, + list >&termoffsets) { +#if 0 + {string t; + for (list::const_iterator it = terms.begin();it != terms.end();it++) + t += "'" + *it + "' "; + LOGDEB(("plaintorich: term: %s\n", t.c_str())); + } +#endif + myTextSplitCB cb(terms); + TextSplit splitter(&cb); + splitter.text_to_words(in); + string out1; + if (cb.tboffs.empty()) { + out1 = in; + } else { + list >::iterator it = cb.tboffs.begin(); + for (unsigned int i = 0; i < in.length() ; i++) { + if (it != cb.tboffs.end()) { + if (i == (unsigned int)it->first) { + out1 += ""; + } else if (i == (unsigned int)it->second) { + if (it != cb.tboffs.end()) + it++; + out1 += ""; + } + } + out1 += in[i]; + } + } string out = "

"; - for (unsigned int i = 0; i < in.length() ; i++) { - if (in[i] == '\n') { + for (string::const_iterator it = out1.begin();it != out1.end(); it++) { + if (*it == '\n') { out += "
"; // out += '\n'; } else { - out += in[i]; + out += *it; } } + termoffsets = cb.tboffs; return out; } @@ -137,7 +192,7 @@ void RecollMain::reslistTE_clicked(int par, int car) int reldocnum = par - 1; reslist_current = reldocnum; previewTextEdit->clear(); - LOGDEB(("Cleared preview\n")); + if (!rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) { QMessageBox::warning(0, "Recoll", QString("Can't retrieve document from database")); @@ -154,26 +209,28 @@ void RecollMain::reslistTE_clicked(int par, int car) doc.mimetype.c_str()); return; } + list terms; + rcldb->getQueryTerms(terms); + list > termoffsets; + string rich = plaintorich(fdoc.text, terms, termoffsets); - string rich = plaintorich(fdoc.text); - -#if 0 - //Highlighting; pass a list of (search term, style name) to plaintorich - // and create the corresponding styles with different colors here - // We need to : - // - Break the query into terms : wait for the query analyzer - // - Break the text into words. This should use a version of - // textsplit with an option to keep the punctuation (see how to do - // this). We do want the same splitter code to be used here and - // when indexing. QStyleSheetItem *item = - new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" ); - item->setColor("red"); + new QStyleSheetItem( previewTextEdit->styleSheet(), "termtag" ); + item->setColor("blue"); item->setFontWeight(QFont::Bold); -#endif QString str = QString::fromUtf8(rich.c_str(), rich.length()); previewTextEdit->setText(str); + int para = 0, index = 1; + if (!termoffsets.empty()) { + index = (termoffsets.begin())->first; + LOGDEB1(("Setting cursor position to para %d, index %d\n",para,index)); + previewTextEdit->setCursorPosition(0, index); + } + previewTextEdit->ensureCursorVisible(); + previewTextEdit->getCursorPosition(¶, &index); + LOGDEB1(("PREVIEW Paragraphs: %d. Cpos: %d %d\n", + previewTextEdit->paragraphs(), para, index)); } @@ -181,7 +238,7 @@ void RecollMain::reslistTE_clicked(int par, int car) // first page of results void RecollMain::queryText_returnPressed() { - LOGDEB(("RecollMain::queryText_returnPressed()\n")); + LOGDEB1(("RecollMain::queryText_returnPressed()\n")); if (!rcldb->isopen()) { string dbdir; if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) { @@ -206,6 +263,7 @@ void RecollMain::queryText_returnPressed() if (!rcldb->setQuery(string((const char *)u8))) return; + list terms; listNextPB_clicked(); } @@ -234,7 +292,7 @@ void RecollMain::listPrevPB_clicked() // Fill up result list window with next screen of hits void RecollMain::listNextPB_clicked() { - LOGDEB(("listNextPB_clicked: winfirst %d\n", reslist_winfirst)); + LOGDEB1(("listNextPB_clicked: winfirst %d\n", reslist_winfirst)); if (reslist_winfirst < 0) reslist_winfirst = 0; @@ -284,7 +342,7 @@ void RecollMain::listNextPB_clicked() struct tm *tm = localtime(&mtime); strftime(datebuf, 99, "Modified: %F %T", tm); } - LOGDEB(("Abstract: %s\n", doc.abstract.c_str())); + LOGDEB1(("Abstract: %s\n", doc.abstract.c_str())); string result = "

" + string(perbuf) + " " + doc.title + "
" + doc.mimetype + " " + diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index a2d4675d..d1f1a72e 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.18 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -171,20 +171,19 @@ bool Rcl::Db::isopen() } // A small class to hold state while splitting text -class wsData { +class mySplitterCB : public TextSplitCB { public: Xapian::Document &doc; Xapian::termpos basepos; // Base for document section Xapian::termpos curpos; // Last position sent to callback - wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0) + mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0) {} + bool takeword(const std::string &term, int pos, int, int); }; // Callback for the document to word splitting class during indexation -static bool splitCb(void *cdata, const std::string &term, int pos) +bool mySplitterCB::takeword(const std::string &term, int pos, int, int) { - wsData *data = (wsData*)cdata; - // cerr << "splitCb: term " << term << endl; //string printable; //transcode(term, printable, "UTF-8", "ISO8859-1"); @@ -193,8 +192,8 @@ static bool splitCb(void *cdata, const std::string &term, int pos) try { // 1 is the value for wdfinc in index_text when called from omindex // TOBEDONE: check what this is used for - data->curpos = pos; - data->doc.add_posting(term, data->basepos + data->curpos, 1); + curpos = pos; + doc.add_posting(term, basepos + curpos, 1); } catch (...) { LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n")); return false; @@ -281,9 +280,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) Xapian::Document newdocument; - wsData splitData(newdocument); + mySplitterCB splitData(newdocument); - TextSplit splitter(splitCb, &splitData); + TextSplit splitter(&splitData); string noacc; if (!unac_cpp(doc.title, noacc)) { @@ -436,18 +435,16 @@ bool Rcl::Db::purge() #include -class wsQData { +class wsQData : public TextSplitCB { public: vector terms; + + bool takeword(const std::string &term, int , int, int) { + terms.push_back(term); + return true; + } }; -// Callback for the query-to-words splitting -static bool splitQCb(void *cdata, const std::string &term, int ) -{ - wsQData *data = (wsQData*)cdata; - data->terms.push_back(term); - return true; -} bool Rcl::Db::setQuery(const std::string &querystring) { @@ -457,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring) return false; wsQData splitData; - TextSplit splitter(splitQCb, &splitData); + TextSplit splitter(&splitData); string noacc; if (!dumb_string(querystring, noacc)) { @@ -475,6 +472,21 @@ bool Rcl::Db::setQuery(const std::string &querystring) return true; } +bool Rcl::Db::getQueryTerms(list& terms) +{ + Native *ndb = (Native *)pdata; + if (!ndb) + return false; + + terms.clear(); + Xapian::TermIterator it; + for (it = ndb->query.get_terms_begin(); it != ndb->query.get_terms_end(); + it++) { + terms.push_back(*it); + } + return true; +} + int Rcl::Db::getResCnt() { Native *ndb = (Native *)pdata; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index c7e8614b..b0b94aa4 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,8 +1,9 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.8 2005-01-31 14:31:09 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.9 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include // rcldb defines an interface for a 'real' text database. The current // implementation uses xapian only, and xapian-related code is in rcldb.cpp @@ -72,6 +73,7 @@ class Db { // Parse query string and initialize query bool setQuery(const std::string &q); + bool getQueryTerms(std::list& terms); // Get document at rank i. This is probably vastly inferior to the type // of interface in Xapian, but we have to start with something simple