diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 45c6a422..41c83adb 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.12 2005-09-22 14:09:04 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.13 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -82,11 +82,6 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, { LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); - // It may happen that our cleanup would result in emitting the - // same term twice. We try to avoid this - static string prevterm; - static int prevpos = -1; - if (!cb) return false; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index d9c48508..c892eb4a 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -1,6 +1,6 @@ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.7 2005-10-10 13:25:23 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.8 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes */ #include #ifndef NO_NAMESPACES @@ -28,6 +28,10 @@ class TextSplitCB { */ class TextSplit { bool fq; // for query: Are we splitting for query or index ? + // It may happen that our cleanup would result in emitting the + // same term twice. We try to avoid this + string prevterm; + int prevpos; TextSplitCB *cb; int maxWordLength; bool emitterm(bool isspan, std::string &term, int pos, int bs, int be); @@ -38,7 +42,7 @@ class TextSplit { * Constructor: just store callback object */ TextSplit(TextSplitCB *t, bool forquery = false) - : fq(forquery), cb(t), maxWordLength(40) {} + : fq(forquery), prevpos(-1), cb(t), maxWordLength(40) {} /** * Split text, emit words and positions. */ diff --git a/src/filters/rcldoc b/src/filters/rcldoc index aaae036b..f65857a5 100755 --- a/src/filters/rcldoc +++ b/src/filters/rcldoc @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rcldoc,v 1.1 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rcldoc,v 1.2 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -35,6 +35,31 @@ fi infile="$1" +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + IFS=: ; set -- $PATH; unset IFS + for d in $* ; do test -x $d/$cmd && return 0; done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds awk antiword iconv + # check the input file existence if test ! -f "$infile" then diff --git a/src/filters/rclpdf b/src/filters/rclpdf index e696c14b..8f719d99 100755 --- a/src/filters/rclpdf +++ b/src/filters/rclpdf @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclpdf,v 1.1 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclpdf,v 1.2 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes # This is copied almost verbatim from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -33,6 +33,31 @@ then exit 1 fi +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + IFS=: ; set -- $PATH; unset IFS + for d in $* ; do test -x $d/$cmd && return 0; done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds pdftotext iconv awk + # output the result pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | iconv -f UTF-8 -t UTF-8 -c -s | @@ -80,5 +105,3 @@ BEGIN { # didn't really understand its use as iconv -c is supposed to fix the # encoding anyway -# exit normally -exit 0 diff --git a/src/filters/rclps b/src/filters/rclps index b1578919..dc155621 100755 --- a/src/filters/rclps +++ b/src/filters/rclps @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclps,v 1.1 2005-02-02 17:57:08 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclps,v 1.2 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -34,6 +34,31 @@ fi infile="$1" +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + IFS=: ; set -- $PATH; unset IFS + for d in $* ; do test -x $d/$cmd && return 0; done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds $decoder iconv awk + # check the input file existence if test ! -f "$infile" then @@ -69,5 +94,3 @@ END { printf("

\n"); }' | iconv -f iso-8859-1 -t UTF-8 -c -s -# exit normally -exit 0 diff --git a/src/filters/rclsoff b/src/filters/rclsoff index 6d8c3499..c0b88169 100755 --- a/src/filters/rclsoff +++ b/src/filters/rclsoff @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclsoff,v 1.1 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclsoff,v 1.2 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -28,6 +28,31 @@ fi infile="$1" +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + IFS=: ; set -- $PATH; unset IFS + for d in $* ; do test -x $d/$cmd && return 0; done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds awk iconv unzip + # check the input file existence if test ! -f "$infile" then diff --git a/src/qtgui/advsearch.ui b/src/qtgui/advsearch.ui index b7b5f292..6cd280bf 100644 --- a/src/qtgui/advsearch.ui +++ b/src/qtgui/advsearch.ui @@ -413,7 +413,7 @@ advsearch.ui.h - startSearch(AdvSearchData) + startSearch(Rcl::AdvSearchData) delFiltypPB_clicked() diff --git a/src/qtgui/advsearch.ui.h b/src/qtgui/advsearch.ui.h index e1dda969..a4757023 100644 --- a/src/qtgui/advsearch.ui.h +++ b/src/qtgui/advsearch.ui.h @@ -91,14 +91,14 @@ void advsearch::restrictFtCB_toggled(bool on) void advsearch::searchPB_clicked() { - AdvSearchData mydata; + Rcl::AdvSearchData mydata; mydata.allwords = string((const char*)(andWordsLE->text().utf8())); mydata.phrase = string((const char*)(phraseLE->text().utf8())); mydata.orwords = string((const char*)(orWordsLE->text().utf8())); mydata.nowords = string((const char*)(noWordsLE->text().utf8())); if (restrictFtCB->isOn() && noFiltypsLB->count() > 0) { for (unsigned int i = 0; i < yesFiltypsLB->count(); i++) { - QCString ctext = noFiltypsLB->item(i)->text().utf8(); + QCString ctext = yesFiltypsLB->item(i)->text().utf8(); mydata.filetypes.push_back(string((const char *)ctext)); } } diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp index 14384a14..97781dce 100644 --- a/src/qtgui/main.cpp +++ b/src/qtgui/main.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: main.cpp,v 1.9 2005-10-10 12:29:42 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: main.cpp,v 1.10 2005-10-19 10:21:48 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -10,8 +10,9 @@ static char rcsid[] = "@(#$Id: main.cpp,v 1.9 2005-10-10 12:29:42 dockes Exp $ ( #include -#include "recollmain.h" #include "rcldb.h" +using Rcl::AdvSearchData; + #include "rclconfig.h" #include "pathut.h" #include "recoll.h" @@ -19,11 +20,43 @@ static char rcsid[] = "@(#$Id: main.cpp,v 1.9 2005-10-10 12:29:42 dockes Exp $ ( #include "wipedir.h" #include "rclinit.h" +#include "recollmain.h" + RclConfig *rclconfig; Rcl::Db *rcldb; int recollNeedsExit; string tmpdir; +void getQueryStemming(bool &dostem, std::string &stemlang) +{ + string param; + if (rclconfig->getConfParam("querystemming", param)) + dostem = ConfTree::stringToBool(param); + else + dostem = false; + if (!rclconfig->getConfParam("querystemminglanguage", stemlang)) + stemlang = "english"; +} + +bool maybeOpenDb(string &reason) +{ + if (!rcldb) + return false; + if (!rcldb->isopen()) { + string dbdir; + if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) { + reason = "No db directory in configuration"; + return false; + } + dbdir = path_tildexpand(dbdir); + if (!rcldb->open(dbdir, Rcl::Db::DbRO)) { + reason = "Could not open database in " + + dbdir + " wait for indexing to complete?"; + return false; + } + } + return true; +} void recollCleanup() { @@ -86,7 +119,7 @@ int main( int argc, char ** argv ) rcldb = new Rcl::Db; - if (!rcldb->open(dbdir, Rcl::Db::DbRO)) { + if (!rcldb || !rcldb->open(dbdir, Rcl::Db::DbRO)) { startindexing = 1; QMessageBox::information(0, "Recoll", QString("Could not open database in ") + diff --git a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp index d2c41245..1b5ec4d1 100644 --- a/src/qtgui/plaintorich.cpp +++ b/src/qtgui/plaintorich.cpp @@ -1,3 +1,6 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.3 2005-10-19 10:21:48 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif #include diff --git a/src/qtgui/recoll.h b/src/qtgui/recoll.h index 66f37cbe..584d3165 100644 --- a/src/qtgui/recoll.h +++ b/src/qtgui/recoll.h @@ -1,29 +1,22 @@ #ifndef _RECOLL_H_INCLUDED_ #define _RECOLL_H_INCLUDED_ -/* @(#$Id: recoll.h,v 1.3 2005-10-17 13:36:53 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: recoll.h,v 1.4 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #include "rclconfig.h" #include "rcldb.h" #include "idxthread.h" -extern void recollCleanup(); - // Misc declarations in need of sharing between the UI files + +extern void recollCleanup(); +extern bool maybeOpenDb(std::string &reason); +extern void getQueryStemming(bool &dostem, std::string &stemlang); + extern RclConfig *rclconfig; extern Rcl::Db *rcldb; -extern string tmpdir; +extern std::string tmpdir; extern int recollNeedsExit; -// Holder for data collected by the advanced search dialog -struct AdvSearchData { - std::string allwords; - std::string phrase; - std::string orwords; - std::string nowords; - std::list filetypes; // restrict to types. Empty if inactive - std::string topdir; // restrict to subtree. Empty if inactive -}; - #endif /* _RECOLL_H_INCLUDED_ */ diff --git a/src/qtgui/recollmain.ui b/src/qtgui/recollmain.ui index 4643b202..34e4030b 100644 --- a/src/qtgui/recollmain.ui +++ b/src/qtgui/recollmain.ui @@ -282,6 +282,7 @@ preview/.ui/preview.h recoll.h + advsearch.h recollmain.ui.h @@ -290,6 +291,7 @@ bool dostem; std::string stemlang; Preview *curPreview; + advsearch *asearchform; fileExit() @@ -304,7 +306,7 @@ listNextPB_clicked() advSearchPB_clicked() previewClosed( Preview * w ) - startAdvSearch( AdvSearchData sdata ) + startAdvSearch( Rcl::AdvSearchData sdata ) init() diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index a509752c..38c78549 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -32,6 +32,8 @@ using std::pair; #include "smallut.h" #include "plaintorich.h" #include "unacpp.h" +#include "advsearch.h" + #ifndef MIN #define MIN(A,B) ((A) < (B) ? (A) : (B)) @@ -45,6 +47,7 @@ static const int respagesize = 8; void RecollMain::init() { curPreview = 0; + asearchform = 0; } // We want to catch ^Q everywhere to mean quit. @@ -266,31 +269,13 @@ void RecollMain::reslistTE_clicked(int par, int car) void RecollMain::queryText_returnPressed() { LOGDEB(("RecollMain::queryText_returnPressed()\n")); - if (!rcldb->isopen()) { - string dbdir; - if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) { - QMessageBox::critical(0, "Recoll", - QString("No db directory in configuration")); - exit(1); - } - dbdir = path_tildexpand(dbdir); - if (!rcldb->open(dbdir, Rcl::Db::DbRO)) { - QMessageBox::information(0, "Recoll", - QString("Could not open database in ") + - QString(dbdir) + " wait for indexing " + - "to complete?"); - return; - } - } - if (stemlang.empty()) { - string param; - if (rclconfig->getConfParam("querystemming", param)) - dostem = ConfTree::stringToBool(param); - else - dostem = false; - if (!rclconfig->getConfParam("querystemminglanguage", stemlang)) - stemlang = "english"; + string reason; + if (!maybeOpenDb(reason)) { + QMessageBox::critical(0, "Recoll", QString(reason.c_str())); + return; } + if (stemlang.empty()) + getQueryStemming(dostem, stemlang); reslist_current = -1; reslist_winfirst = -1; @@ -388,7 +373,7 @@ void RecollMain::listNextPB_clicked() strftime(datebuf, 99, "Modified: %F %T", tm); } string abst = stripMarkup(doc.abstract); - LOGDEB(("Abstract: {%s}\n", abst.c_str())); + LOGDEB1(("Abstract: {%s}\n", abst.c_str())); string result = "

" + string(perbuf) + " " + doc.title + "
" + doc.mimetype + " " + @@ -433,46 +418,40 @@ void RecollMain::previewClosed(Preview *w) delete w; } - - -#include "advsearch.h" - -advsearch *asearchform; - +// Open advanced search dialog. void RecollMain::advSearchPB_clicked() { if (asearchform == 0) { - // Couldn't find way to have a normal wm frame asearchform = new advsearch(this, "Advanced search", FALSE, WStyle_Customize | WStyle_NormalBorder | WStyle_Title | WStyle_SysMenu); asearchform->setSizeGripEnabled(FALSE); - connect(asearchform, SIGNAL(startSearch(AdvSearchData)), - this, SLOT(startAdvSearch(AdvSearchData))); + connect(asearchform, SIGNAL(startSearch(Rcl::AdvSearchData)), + this, SLOT(startAdvSearch(Rcl::AdvSearchData))); asearchform->show(); } else { asearchform->show(); } } -void RecollMain::startAdvSearch(AdvSearchData sdata) +// Execute and advanced search query +void RecollMain::startAdvSearch(Rcl::AdvSearchData sdata) { LOGDEB(("RecollMain::startAdvSearch\n")); - LOGDEB((" allwords: %s\n", sdata.allwords.c_str())); - LOGDEB((" phrase: %s\n", sdata.phrase.c_str())); - LOGDEB((" orwords: %s\n", sdata.orwords.c_str())); - LOGDEB((" nowords: %s\n", sdata.nowords.c_str())); - string ft; - for (list::iterator it = sdata.filetypes.begin(); - it != sdata.filetypes.end(); it++) { - ft += *it + " "; + string reason; + if (!maybeOpenDb(reason)) { + QMessageBox::critical(0, "Recoll", QString(reason.c_str())); + return; } - if (!ft.empty()) - LOGDEB(("Searched file types: %s\n", ft.c_str())); - if (!sdata.topdir.empty()) - LOGDEB(("Restricted to: %s\n", sdata.topdir.c_str())); + if (stemlang.empty()) + getQueryStemming(dostem, stemlang); + + reslist_current = -1; + reslist_winfirst = -1; + + if (!rcldb->setQuery(sdata, stemlang)) + return; + curPreview = 0; + listNextPB_clicked(); } - - - diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 9922d77e..174e0101 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.28 2005-04-06 10:20:11 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.29 2005-10-19 10:21:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -37,9 +37,10 @@ class Native { // Querying Xapian::Database db; - Xapian::Query query; + Xapian::Query query; // query descriptor: terms and subqueries + // joined by operators (or/and etc...) Xapian::Enquire *enquire; - Xapian::MSet mset; + Xapian::MSet mset; Native() : isopen(false), iswritable(false), enquire(0) { } @@ -206,8 +207,8 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int) } // Unaccent and lowercase data: use unac -// for accents, and do it by hand for upper / lower. Note lowercasing is -// only for ascii letters anyway, so it's just A-Z -> a-z +// for accents, and do it by hand for upper / lower. +// TOBEDONE: lowercasing is done only for ascii letters, just A-Z -> a-z // Removing crlfs is so that we can use the text in the document data fields. bool Rcl::dumb_string(const string &in, string &out) { @@ -404,15 +405,15 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) // If the db is up to date, set the update flags for all documents Xapian::PostingIterator doc; try { - Xapian::PostingIterator did0 = ndb->wdb.postlist_begin(pathterm); - for (Xapian::PostingIterator did = did0; - did != ndb->wdb.postlist_end(pathterm); did++) { + Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm); + for (Xapian::PostingIterator docid = docid0; + docid != ndb->wdb.postlist_end(pathterm); docid++) { - Xapian::Document doc = ndb->wdb.get_document(*did); + Xapian::Document doc = ndb->wdb.get_document(*docid); // Check the date once. no need to look at the others if the // db needs updating. - if (did == did0) { + if (docid == docid0) { string data = doc.get_data(); const char *cp = strstr(data.c_str(), "mtime="); cp += 6; @@ -424,8 +425,8 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) } // Db is up to date. Make a note that this document exists. - if (*did < ndb->updated.size()) - ndb->updated[*did] = true; + if (*docid < ndb->updated.size()) + ndb->updated[*docid] = true; } } catch (...) { return true; @@ -596,13 +597,13 @@ bool Rcl::Db::purge() // trying to delete an unexistant document ? // Flushing before trying the deletes seeems to work around the problem ndb->wdb.flush(); - for (Xapian::docid did = 1; did < ndb->updated.size(); ++did) { - if (!ndb->updated[did]) { + for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) { + if (!ndb->updated[docid]) { try { - ndb->wdb.delete_document(did); - LOGDEB(("Rcl::Db::purge: deleted document #%d\n", did)); + ndb->wdb.delete_document(docid); + LOGDEB(("Rcl::Db::purge: deleted document #%d\n", docid)); } catch (const Xapian::DocNotFoundError &) { - LOGDEB2(("Rcl::Db::purge: document #%d not found\n", did)); + LOGDEB2(("Rcl::Db::purge: document #%d not found\n", docid)); } } } @@ -628,6 +629,13 @@ class wsQData : public TextSplitCB { terms.push_back(term); return true; } + void dumball() { + for (vector::iterator it=terms.begin(); it !=terms.end();it++){ + string dumb; + Rcl::dumb_string(*it, dumb); + *it = dumb; + } + } }; @@ -638,11 +646,11 @@ static list stemexpand(Native *ndb, string term, const string& lang) try { Xapian::Stem stemmer(lang); string stem = stemmer.stem_word(term); - LOGDEB(("stemexpand: '%s' -> '%s'\n", term.c_str(), stem.c_str())); + LOGDEB(("stemexpand: '%s' stem-> '%s'\n", term.c_str(), stem.c_str())); // Try to fetch the doc from the stem db string stemdbdir = stemdbname(ndb->basedir, lang); Xapian::Database sdb(stemdbdir); - LOGDEB1(("Rcl::Db::stemexpand: %s lastdocid: %d\n", + LOGDEB1(("stemexpand: %s lastdocid: %d\n", stemdbdir.c_str(), sdb.get_lastdocid())); if (!sdb.term_exists(stem)) { LOGDEB1(("Rcl::Db::stemexpand: no term for %s\n", stem.c_str())); @@ -651,7 +659,7 @@ static list stemexpand(Native *ndb, string term, const string& lang) } Xapian::PostingIterator did = sdb.postlist_begin(stem); if (did == sdb.postlist_end(stem)) { - LOGDEB1(("Rcl::Db::stemexpand: no term(1) for %s\n",stem.c_str())); + LOGDEB1(("stemexpand: no term(1) for %s\n",stem.c_str())); explist.push_back(term); return explist; } @@ -669,7 +677,7 @@ static list stemexpand(Native *ndb, string term, const string& lang) if (find(explist.begin(), explist.end(), term) == explist.end()) { explist.push_back(term); } - LOGDEB(("Rcl::Db::stemexpand: %s -> %s\n", stem.c_str(), + LOGDEB(("stemexpand: %s -> %s\n", stem.c_str(), stringlistdisp(explist).c_str())); } catch (...) { LOGERR(("stemexpand: error accessing stem db\n")); @@ -679,6 +687,81 @@ static list stemexpand(Native *ndb, string term, const string& lang) return explist; } +// Turn string into possibly complex xapian query. There is little +// interpretation done on the string (no +term -term or filename:term +// stuff). We just separate words and phrases, and interpret +// capitalized terms as wanting no stem expansion +static void stringToXapianQueries(const string &iq, + const string& stemlang, + Native *ndb, + list &pqueries, + Rcl::Db::QueryOpts opts = Rcl::Db::QO_NONE) +{ + string qstring = iq; +#if 0 + // Unaccent and lowerterm. Note that lowerterming here may not be + // such a good idea because it forbids using capitalized words to + // indicate that a term should not use stem expansion, for + // example. + if (!Rcl::dumb_string(iqstring, qstring)) + return false; +#endif + + // Split into (possibly single word) phrases ("this is a phrase"): + list phrases; + ConfTree::stringToStrings(qstring, phrases); + + // Then process each phrase: split into terms and transform into + // appropriate Xapian Query + + for (list::iterator it=phrases.begin(); it !=phrases.end(); it++) { + LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str())); + + wsQData splitData; + TextSplit splitter(&splitData, true); + splitter.text_to_words(*it); + LOGDEB(("strToXapianQ: splitter term count: %d\n", + splitData.terms.size())); + switch(splitData.terms.size()) { + case 0: continue;// ?? + case 1: // Not a real phrase: one term + { + string term = splitData.terms.front(); + bool nostemexp = false; + // Yes this doesnt work with accented or non-european + // majuscules. TOBEDONE: something :) + if (term.length() > 0 && term[0] >= 'A' && term[0] <= 'Z') + nostemexp = true; + + LOGDEB(("Term: %s\n", term.c_str())); + + // Possibly perform stem compression/expansion + list exp; + string term1; + Rcl::dumb_string(term, term1); + if (!nostemexp && (opts & Rcl::Db::QO_STEM)) { + exp = stemexpand(ndb, term1, stemlang); + } else { + exp.push_back(term1); + } + + // Push either term or stem-expanded set + pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, + exp.begin(), exp.end())); + } + break; + + default: + // Phrase: no stem expansion + splitData.dumball(); + LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str())); + pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE, + splitData.terms.begin(), + splitData.terms.end())); + } + } +} + bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, const string& stemlang) { @@ -688,48 +771,8 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, if (!ndb) return false; - string qstring;; - if (!dumb_string(iqstring, qstring)) { - return false; - } - - // First split into (possibly single word) phrases ("this is a phrase"): - list phrases; - ConfTree::stringToStrings(qstring, phrases); - for (list::const_iterator i=phrases.begin(); - i != phrases.end();i++) { - LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str())); - } - list pqueries; - for (list::const_iterator it = phrases.begin(); - it != phrases.end(); it++) { - - wsQData splitData; - TextSplit splitter(&splitData, true); - splitter.text_to_words(*it); - LOGDEB1(("Rcl::Db::setquery: splitter term count: %d\n", - splitData.terms.size())); - switch(splitData.terms.size()) { - case 0: continue;// ?? - case 1: { - list exp; - if (opts & QO_STEM) - exp = stemexpand(ndb, splitData.terms.front(), stemlang); - else - exp.push_back(splitData.terms.front()); - pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, - exp.begin(), - exp.end())); - } - break; - default: - LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str())); - pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE, - splitData.terms.begin(), - splitData.terms.end())); - } - } + stringToXapianQueries(iqstring, stemlang, ndb, pqueries, opts); ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); delete ndb->enquire; @@ -739,6 +782,93 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, return true; } +bool Rcl::Db::setQuery(AdvSearchData &sdata, const string& stemlang) +{ + LOGDEB(("Rcl::Db::setQuery: adv:\n")); + LOGDEB((" allwords: %s\n", sdata.allwords.c_str())); + LOGDEB((" phrase: %s\n", sdata.phrase.c_str())); + LOGDEB((" orwords: %s\n", sdata.orwords.c_str())); + LOGDEB((" nowords: %s\n", sdata.nowords.c_str())); + string ft; + for (list::iterator it = sdata.filetypes.begin(); + it != sdata.filetypes.end(); it++) {ft += *it + " ";} + if (!ft.empty()) + LOGDEB((" searched file types: %s\n", ft.c_str())); + if (!sdata.topdir.empty()) + LOGDEB((" restricted to: %s\n", sdata.topdir.c_str())); + + Native *ndb = (Native *)pdata; + if (!ndb) + return false; + + list pqueries; + Xapian::Query xq; + + if (!sdata.allwords.empty()) { + stringToXapianQueries(sdata.allwords, stemlang, ndb, pqueries); + if (!pqueries.empty()) { + xq = Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(), + pqueries.end()); + pqueries.clear(); + } + } + + if (!sdata.orwords.empty()) { + stringToXapianQueries(sdata.orwords, stemlang, ndb, pqueries); + if (!pqueries.empty()) { + Xapian::Query nq; + nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), + pqueries.end()); + xq = xq.empty() ? nq : + Xapian::Query(Xapian::Query::OP_AND, xq, nq); + pqueries.clear(); + } + } + + if (!sdata.nowords.empty()) { + stringToXapianQueries(sdata.nowords, stemlang, ndb, pqueries); + if (!pqueries.empty()) { + Xapian::Query nq; + nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), + pqueries.end()); + xq = xq.empty() ? nq : + Xapian::Query(Xapian::Query::OP_AND_NOT, xq, nq); + pqueries.clear(); + } + } + + if (!sdata.phrase.empty()) { + Xapian::Query nq; + string s = string("\"") + sdata.phrase + string("\""); + stringToXapianQueries(s, stemlang, ndb, pqueries); + if (!pqueries.empty()) { + // There should be a single list element phrase query. + xq = xq.empty() ? *pqueries.begin() : + Xapian::Query(Xapian::Query::OP_AND, xq, *pqueries.begin()); + pqueries.clear(); + } + } + + if (!sdata.filetypes.empty()) { + Xapian::Query tq; + for (list::iterator it = sdata.filetypes.begin(); + it != sdata.filetypes.end(); it++) { + string term = "T" + *it; + LOGDEB(("Adding file type term: [%s]\n", term.c_str())); + tq = tq.empty() ? Xapian::Query(term) : + Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); + } + xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, xq, tq); + } + + ndb->query = xq; + delete ndb->enquire; + ndb->enquire = new Xapian::Enquire(ndb->db); + ndb->enquire->set_query(ndb->query); + ndb->mset = Xapian::MSet(); + return true; +} + bool Rcl::Db::getQueryTerms(list& terms) { Native *ndb = (Native *)pdata; @@ -766,6 +896,10 @@ int Rcl::Db::getResCnt() return ndb->mset.get_matches_lower_bound(); } +// Get document at rank i in query (i is the index in the whole result +// set, as in the enquire class. We check if the current mset has the +// doc, else ask for an other one. We use msets of 10 documents. Don't +// know if the whole thing makes sense at all but it seems to work. bool Rcl::Db::getDoc(int i, Doc &doc, int *percent) { LOGDEB1(("Rcl::Db::getDoc: %d\n", i)); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 85c1f300..fda82053 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,12 +1,13 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.13 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.14 2005-10-19 10:21:47 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #ifndef NO_NAMESPACES using std::string; +using std::list; #endif // rcldb defines an interface for a 'real' text database. The current @@ -24,7 +25,9 @@ using std::string; struct stat; +#ifndef NO_NAMESPACES namespace Rcl { +#endif /** * Dumb bunch holder for document attributes and data @@ -57,6 +60,19 @@ class Doc { } }; +/** + * Holder for the advanced query data + */ +class AdvSearchData { + public: + string allwords; + string phrase; + string orwords; + string nowords; + list filetypes; // restrict to types. Empty if inactive + string topdir; // restrict to subtree. Empty if inactive +}; + /** * Wrapper class for the native database. */ @@ -83,7 +99,8 @@ class Db { enum QueryOpts {QO_NONE=0, QO_STEM = 1}; bool setQuery(const string &q, QueryOpts opts = QO_NONE, const string& stemlang = "english"); - bool getQueryTerms(std::list& terms); + bool setQuery(AdvSearchData &q, const string& stemlang = "english"); + bool getQueryTerms(list& terms); // Get document at rank i. This is probably vastly inferior to the type // of interface in Xapian, but we have to start with something simple @@ -96,6 +113,9 @@ class Db { // Unaccent and lowercase data. extern bool dumb_string(const string &in, string &out); +#ifndef NO_NAMESPACES } +#endif // NO_NAMESPACES + #endif /* _DB_H_INCLUDED_ */