From 82334f295785fc635fbc5d8926dac079b1734614 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 28 Jan 2005 15:25:40 +0000 Subject: [PATCH] ckpt --- src/Makefile | 11 +++ src/internfile/indextext.h | 23 +++++ src/internfile/myhtmlparse.cpp | 2 - src/qtgui/recollmain.ui | 140 +++++++++------------------ src/qtgui/recollmain.ui.h | 171 ++++++++++++++++++++++++--------- src/rcldb/rcldb.cpp | 118 ++++++++++++++++------- src/rcldb/rcldb.h | 13 ++- 7 files changed, 297 insertions(+), 181 deletions(-) create mode 100644 src/Makefile create mode 100644 src/internfile/indextext.h diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 00000000..36b78401 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,11 @@ +all: + cd lib;make + cd index;make + cd qtgui;rm -f recoll;make +clean: + cd common;make clean + cd index;make clean + cd qtgui;rm -f recoll;make clean + cd query;make clean + cd utils;make clean + diff --git a/src/internfile/indextext.h b/src/internfile/indextext.h new file mode 100644 index 00000000..8fb106d8 --- /dev/null +++ b/src/internfile/indextext.h @@ -0,0 +1,23 @@ +#ifndef _INDEXTEXT_H_INCLUDED_ +#define _INDEXTEXT_H_INCLUDED_ +/* @(#$Id: indextext.h,v 1.1 2005-01-28 15:25:39 dockes Exp $ (C) 2004 J.F.Dockes */ +/* Note: this only exists to help with using myhtmlparse.cc */ + +// Minimize changes to myhtmlparse.cpp +#include "debuglog.h" + +#include + +// lets hope that the charset includes ascii values... +static inline void +lowercase_term(std::string &term) +{ + std::string::iterator i = term.begin(); + while (i != term.end()) { + if (*i >= 'A' && *i <= 'Z') + *i = *i + 'a' - 'A'; + i++; + } +} + +#endif /* _INDEXTEXT_H_INCLUDED_ */ diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index db087822..47174b05 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -108,7 +108,6 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) break; case 'm': if (tag == "meta") { - LOGDEB(("Found META\n")); map::const_iterator i, j; if ((i = p.find("content")) != p.end()) { if ((j = p.find("name")) != p.end()) { @@ -135,7 +134,6 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) } } } else if ((j = p.find("http-equiv")) != p.end()) { - LOGDEB(("Found http-equiv\n")); string hequiv = j->second; lowercase_term(hequiv); if (hequiv == "content-type") { diff --git a/src/qtgui/recollmain.ui b/src/qtgui/recollmain.ui index 4edb92b6..47efb043 100644 --- a/src/qtgui/recollmain.ui +++ b/src/qtgui/recollmain.ui @@ -23,13 +23,13 @@ recoll - + unnamed - layout3 + layout10 @@ -37,7 +37,7 @@ - layout2 + layout8 @@ -62,6 +62,22 @@ Search + + + listPrevPb + + + Previous page + + + + + listNextPB + + + Next page + + spacer1 @@ -74,7 +90,7 @@ - 329 + 346 20 @@ -83,21 +99,7 @@ - splitter9 - - - - 7 - 7 - 1 - 1 - - - - - 0 - 0 - + splitter6 Horizontal @@ -108,8 +110,8 @@ - 7 - 7 + 5 + 5 2 0 @@ -123,13 +125,13 @@ - splitter8 + splitter5 - 7 - 7 - 3 + 5 + 5 + 5 0 @@ -145,7 +147,7 @@ 7 7 0 - 2 + 4 @@ -178,7 +180,7 @@ - + @@ -190,10 +192,7 @@ - - - @@ -213,48 +212,6 @@ - - - helpContentsAction - - - Contents - - - &Contents... - - - - - - - - helpIndexAction - - - Index - - - &Index... - - - - - - - - helpAboutAction - - - About - - - &About - - - - - @@ -263,24 +220,6 @@ RecollMain fileExit() - - helpIndexAction - activated() - RecollMain - helpIndex() - - - helpContentsAction - activated() - RecollMain - helpContents() - - - helpAboutAction - activated() - RecollMain - helpAbout() - resTextEdit clicked(int,int) @@ -305,18 +244,33 @@ RecollMain Search_clicked() + + listPrevPb + clicked() + RecollMain + listPrevPB_clicked() + + + listNextPB + clicked() + RecollMain + listNextPB_clicked() + recollmain.ui.h + + int reslist_current; + int reslist_winfirst; + fileExit() - helpIndex() - helpContents() - helpAbout() resTextEdit_clicked( int par, int car ) queryText_returnPressed() Search_clicked() + listPrevPB_clicked() + listNextPB_clicked() diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index 5bc46a61..7333648d 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -16,22 +16,6 @@ void RecollMain::fileExit() } -void RecollMain::helpIndex() -{ - -} - - -void RecollMain::helpContents() -{ - -} - - -void RecollMain::helpAbout() -{ - -} #include #include "rcldb.h" @@ -62,12 +46,27 @@ static string plaintorich(const string &in) return out; } -void RecollMain::resTextEdit_clicked( int par, int car ) +// Click in the result list window: display preview for selected document, +// and highlight entry. The paragraph number is doc number in window + 1 +void RecollMain::resTextEdit_clicked(int par, int car) { - fprintf(stderr, "Clicked at paragraph %d, char %d\n", par, car); + LOGDEB(("RecollMain::resTextEdi_clicked: par %d, char %d\n", par, car)); + if (reslist_winfirst == -1) + return; Rcl::Doc doc; doc.erase(); - if (rcldb->getDoc(par, doc)) { + if (reslist_current != -1) { + QColor color("white"); + resTextEdit->setParagraphBackgroundColor(reslist_current+1, color); + } + QColor color("lightblue"); + resTextEdit->setParagraphBackgroundColor(par, color); + + int reldocnum = par-1; + reslist_current = reldocnum; + previewTextEdit->clear(); + + if (rcldb->getDoc(reslist_winfirst + reldocnum, doc, 0)) { // Go to the file system to retrieve / convert the document text // for preview: @@ -86,7 +85,7 @@ void RecollMain::resTextEdit_clicked( int par, int car ) Rcl::Doc fdoc; if (!fun(rclconfig, fn, doc.mimetype, fdoc)) { QMessageBox::warning(0, "Recoll", - QString("Failed to convert document for preview!\n") + + QString("Failed to convert document for preview!\n") + fn.c_str() + " mimetype " + doc.mimetype.c_str()); return; @@ -108,46 +107,24 @@ void RecollMain::resTextEdit_clicked( int par, int car ) item->setColor("red"); item->setFontWeight(QFont::Bold); #endif - QString str = QString::fromUtf8(rich.c_str(), rich.length()); + QString str = QString::fromUtf8(rich.c_str(), rich.length()); previewTextEdit->setTextFormat(RichText); previewTextEdit->setText(str); } } +#include "pathut.h" + void RecollMain::queryText_returnPressed() { LOGDEB(("RecollMain::queryText_returnPressed()\n")); - resTextEdit->clear(); - previewTextEdit->clear(); + reslist_current = -1; + reslist_winfirst = -1; string rawq = queryText->text(); rcldb->setQuery(rawq); - Rcl::Doc doc; - - // Insert results if any in result list window - QString result; - resTextEdit->append(""); - for (int i = 0;; i++) { - doc.erase(); - if (!rcldb->getDoc(i, doc)) - break; - LOGDEB(("Url: %s\n", doc.url.c_str())); - LOGDEB(("Mimetype: \n", doc.mimetype.c_str())); - LOGDEB(("Mtime: \n", doc.mtime.c_str())); - LOGDEB(("Origcharset: \n", doc.origcharset.c_str())); - LOGDEB(("Title: \n", doc.title.c_str())); - LOGDEB(("Text: \n", doc.text.c_str())); - LOGDEB(("Keywords: \n", doc.keywords.c_str())); - LOGDEB(("Abstract: \n", doc.abstract.c_str())); - - result = "

" + doc.url + "

"; - resTextEdit->append(result); - } - resTextEdit->append("
"); - - // Display preview for 1st doc in list - resTextEdit_clicked(0, 0); + listNextPB_clicked(); } @@ -155,3 +132,101 @@ void RecollMain::Search_clicked() { queryText_returnPressed(); } + + +static const int respagesize = 10; +void RecollMain::listPrevPB_clicked() +{ + reslist_winfirst -= 2*respagesize; + listNextPB_clicked(); +} + +#ifndef MIN +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#endif + +void RecollMain::listNextPB_clicked() +{ + LOGDEB(("listNextPB_clicked: winfirst %d\n", reslist_winfirst)); + + if (reslist_winfirst < 0) + reslist_winfirst = 0; + else + reslist_winfirst += respagesize; + + // Insert results if any in result list window + bool gotone = false; + for (int i = 0; i < respagesize; i++) { + Rcl::Doc doc; + doc.erase(); + int percent; + if (!rcldb->getDoc(reslist_winfirst + i, doc, &percent)) + break; + int resCnt = rcldb->getResCnt(); + int last = MIN(resCnt, reslist_winfirst+respagesize); + if (i == 0) { + resTextEdit->clear(); + previewTextEdit->clear(); + resTextEdit->append("

"); + char line[80]; + sprintf(line, "

Displaying results %d-%d out of %d
", + reslist_winfirst+1, last, resCnt); + resTextEdit->append(line); + } + + gotone = true; + + LOGDEB1(("Url: %s\n", doc.url.c_str())); + LOGDEB1(("Mimetype: %s\n", doc.mimetype.c_str())); + LOGDEB1(("Mtime: %s\n", doc.mtime.c_str())); + LOGDEB1(("Origcharset: %s\n", doc.origcharset.c_str())); + LOGDEB1(("Title: %s\n", doc.title.c_str())); + LOGDEB1(("Text: %s\n", doc.text.c_str())); + LOGDEB1(("Keywords: %s\n", doc.keywords.c_str())); + LOGDEB1(("Abstract: %s\n", doc.abstract.c_str())); + + // Result list display. Standard Omega includes: + // - title or simple file name or url + // - abstract and keywords + // - url + // - relevancy percentage + keywords matched + // - date de modification + // - langue + // - taille + char perbuf[10]; + sprintf(perbuf, "%3d%%", percent); + if (doc.title.empty()) + doc.title = path_getsimple(doc.url); + char datebuf[100]; + datebuf[0] = 0; + if (!doc.mtime.empty()) { + time_t mtime = atol(doc.mtime.c_str()); + struct tm *tm = localtime(&mtime); + strftime(datebuf, 99, "Modified: %F %T", tm); + } + + string result = "

" + + string(perbuf) + " " + doc.title + "
" + + (!doc.mtime.empty() ? string(datebuf) + "
" : string("")) + + (!doc.abstract.empty() ? doc.abstract + "
" : string("")) + + (!doc.keywords.empty() ? doc.keywords + "
" : string("")) + + "" + doc.url + +"
" + + "

"; + QString str = QString::fromUtf8(result.c_str(), result.length()); + + resTextEdit->append(str); + } + + if (gotone) { + resTextEdit->append("
"); + resTextEdit->setCursorPosition(0,0); + resTextEdit->ensureCursorVisible(); + // Display preview for 1st doc in list + resTextEdit_clicked(1, 0); + } else { + // Restore first in win parameter that we shouln't have incremented + reslist_winfirst -= respagesize; + if (reslist_winfirst < 0) + reslist_winfirst = 0; + } +} diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 890b6703..0b30ad05 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.11 2005-01-28 09:37:37 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.12 2005-01-28 15:25:40 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -32,8 +32,14 @@ class Native { // Querying Xapian::Database db; Xapian::Query query; - Native() : isopen(false), iswritable(false) {} + Xapian::Enquire *enquire; + Xapian::MSet mset; + Native() : isopen(false), iswritable(false), enquire(0) { + } + ~Native() { + delete enquire; + } }; Rcl::Db::Db() @@ -185,6 +191,7 @@ static bool splitCb(void *cdata, const std::string &term, int pos) // Unaccent and lowercase data: use unac // for accents, and do it by hand for upper / lower. Note lowercasing is // only for ascii letters anyway, so it's just A-Z -> a-z +// Removing crlfs is so that we can use the text in the document data fields. bool dumb_string(const string &in, string &out) { string inter; @@ -193,10 +200,14 @@ bool dumb_string(const string &in, string &out) return false; out.reserve(inter.length()); for (unsigned int i = 0; i < inter.length(); i++) { - if (inter[i] >= 'A' && inter[i] <= 'Z') + if (inter[i] >= 'A' && inter[i] <= 'Z') { out += inter[i] + 'a' - 'A'; - else - out += inter[i]; + } else { + if (inter[i] == '\n' || inter[i] == '\r') + out += ' '; + else + out += inter[i]; + } } return true; } @@ -210,19 +221,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) Xapian::Document newdocument; - // Document data record. omindex has the following nl separated fields: - // - url - // - sample - // - caption (title limited to 100 chars) - // - mime type - string record = "url=file:/" + fn; - record += "\nmtime=" + doc.mtime; - record += "\nsample="; - record += "\ncaption=" + doc.title; - record += "\nmtype=" + doc.mimetype; - record += "\n"; - newdocument.set_data(record); - wsData splitData(newdocument); TextSplit splitter(splitCb, &splitData); @@ -260,6 +258,22 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) newdocument.add_term(pathterm); const char *fnc = fn.c_str(); + // Document data record. omindex has the following nl separated fields: + // - url + // - sample + // - caption (title limited to 100 chars) + // - mime type + string record = "url=file:/" + fn; + record += "\nmtype=" + doc.mimetype; + record += "\nmtime=" + doc.mtime; + record += "\norigcharset=" + doc.origcharset; + record += "\ncaption=" + doc.title; + record += "\nkeywords=" + doc.keywords; + record += "\nabstract=" + doc.abstract; + record += "\n"; + LOGDEB(("Newdocument data: %s\n", record.c_str())); + newdocument.set_data(record); + // If this document has already been indexed, update the existing // entry. try { @@ -268,8 +282,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) #endif ndb->wdb.replace_document(pathterm, newdocument); #if 0 - if (did < updated.size()) { - updated[did] = true; + if (did < ndb->updated.size()) { + ndb->updated[did] = true; LOGDEB(("%s updated\n", fnc)); } else { LOGDEB(("%s added\n", fnc)); @@ -299,6 +313,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) if (did == ndb->wdb.postlist_end(pathterm)) return true; Xapian::Document doc = ndb->wdb.get_document(*did); +#if 0 + ndb->updated[*did] = true; +#endif string data = doc.get_data(); //cerr << "DOCUMENT EXISTS " << data << endl; const char *cp = strstr(data.c_str(), "mtime="); @@ -332,6 +349,7 @@ static bool splitQCb(void *cdata, const std::string &term, int ) bool Rcl::Db::setQuery(const std::string &querystring) { + LOGDEB(("Rcl::Db::setQuery: %s\n", querystring.c_str())); wsQData splitData; TextSplit splitter(splitQCb, &splitData); @@ -345,32 +363,64 @@ bool Rcl::Db::setQuery(const std::string &querystring) ndb->query = Xapian::Query(Xapian::Query::OP_OR, splitData.terms.begin(), splitData.terms.end()); - + delete ndb->enquire; + ndb->enquire = new Xapian::Enquire(ndb->db); + ndb->enquire->set_query(ndb->query); + ndb->mset = Xapian::MSet(); return true; } - -bool Rcl::Db::getDoc(int i, Doc &doc) +int Rcl::Db::getResCnt() { - LOGDEB1(("Rcl::Db::getDoc: %d\n", i)); Native *ndb = (Native *)pdata; + if (!ndb || !ndb->enquire) { + LOGERR(("Rcl::Db::getResCnt: no query opened\n")); + return -1; + } + if (ndb->mset.size() <= 0) + return -1; + return ndb->mset.get_matches_lower_bound(); +} - Xapian::Enquire enquire(ndb->db); - enquire.set_query(ndb->query); - Xapian::MSet matches = enquire.get_mset(i, 1); - - LOGDEB1(("Rcl::Db::getDoc: Query '%s' Estimated results: %d\n", - ndb->query.get_description(), matches.get_matches_lower_bound())); - - if (matches.empty()) +bool Rcl::Db::getDoc(int i, Doc &doc, int *percent) +{ + LOGDEB(("Rcl::Db::getDoc: %d\n", i)); + Native *ndb = (Native *)pdata; + if (!ndb || !ndb->enquire) { + LOGERR(("Rcl::Db::getDoc: no query opened\n")); return false; + } - Xapian::Document xdoc = matches.begin().get_document(); + int first = ndb->mset.get_firstitem(); + int last = first + ndb->mset.size() -1; + + if (!(i >= first && i <= last)) { + LOGDEB1(("Fetching for first %d, count 10\n", i)); + ndb->mset = ndb->enquire->get_mset(i, 10); + if (ndb->mset.empty()) + return false; + first = ndb->mset.get_firstitem(); + last = first + ndb->mset.size() -1; + } + + LOGDEB1(("Rcl::Db::getDoc: Qry '%s' win [%d-%d] Estimated results: %d", + ndb->query.get_description().c_str(), + first, last, + ndb->mset.get_matches_lower_bound())); + + Xapian::Document xdoc = ndb->mset[i-first].get_document(); + if (percent) + *percent = ndb->mset.convert_to_percent(ndb->mset[i-first]); // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); + LOGDEB1(("Rcl::Db::getDoc: data: %s\n", data.c_str())); ConfSimple parms(&data); + parms.get(string("url"), doc.url); parms.get(string("mtype"), doc.mimetype); parms.get(string("mtime"), doc.mtime); - parms.get(string("url"), doc.url); + parms.get(string("origcharset"), doc.origcharset); + parms.get(string("caption"), doc.title); + parms.get(string("keywords"), doc.keywords); + parms.get(string("abstract"), doc.abstract); return true; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 4f012a09..335edb04 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,6 +1,6 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.5 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.6 2005-01-28 15:25:40 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -26,23 +26,26 @@ namespace Rcl { */ class Doc { public: + // This fields potentially go into the document data record std::string url; std::string mimetype; std::string mtime; // Modification time as decimal ascii std::string origcharset; std::string title; - std::string text; std::string keywords; std::string abstract; + + std::string text; void erase() { url.erase(); mimetype.erase(); mtime.erase(); origcharset.erase(); title.erase(); - text.erase(); keywords.erase(); abstract.erase(); + + text.erase(); } }; @@ -71,7 +74,9 @@ class Db { // Get document at rank i. This is probably vastly inferior to the type // of interface in Xapian, but we have to start with something simple // to experiment with the GUI - bool getDoc(int i, Doc &doc); + bool getDoc(int i, Doc &doc, int *percent = 0); + // Get results count + int getResCnt(); };