diff --git a/src/Makefile b/src/Makefile index 5bb5c3db..1df2fe3f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ all: cd lib;make cd index;make - cd qtgui;rm -f recoll;make + cd qtgui;qmake recoll.pro ; rm -f recoll;make clean: cd common;make clean cd index;make clean diff --git a/src/README b/src/README index 12e17bb3..48690010 100644 --- a/src/README +++ b/src/README @@ -1,4 +1,4 @@ - @(#$Id: README,v 1.1 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes + @(#$Id: README,v 1.2 2005-02-08 14:54:38 dockes Exp $ (C) 2004 J.F.Dockes Hello. @@ -13,9 +13,8 @@ It will become much better in the near future. What it has: - - Easy installation. No db, web server or exotic language necessary. The - binary packages are statically linked and should run almost as soon as - unpacked. The idea is that EVERYBODY should index their files because it + - Easy installation. No db, web server or exotic language necessary. + The idea is that EVERYBODY should index their files because it makes life easier. - Indexes text, pdf, html, postscript. Deals with compressed versions of same. 
diff --git a/src/qtgui/recoll.pro b/src/qtgui/recoll.pro index 54f9d33a..e02ce8f4 100644 --- a/src/qtgui/recoll.pro +++ b/src/qtgui/recoll.pro @@ -23,8 +23,7 @@ unix { UI_DIR = .ui MOC_DIR = .moc OBJECTS_DIR = .obj - LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \ - -lfontconfig -lfreetype -lexpat -lz + LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv INCLUDEPATH += ../common ../index ../query ../unac ../utils } diff --git a/src/qtgui/recollmain.ui b/src/qtgui/recollmain.ui index daa19bca..b8f644b4 100644 --- a/src/qtgui/recollmain.ui +++ b/src/qtgui/recollmain.ui @@ -311,6 +311,8 @@ recollmain.ui.h + std::string stemlang; + bool dostem; int reslist_current; int reslist_winfirst; diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index b56d5e49..13d937f4 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -259,14 +259,24 @@ void RecollMain::queryText_returnPressed() "to complete?"); return; } - } + if (stemlang.empty()) { + string param; + if (rclconfig->getConfParam("querystemming", param)) + dostem = ConfTree::stringToBool(param); + else + dostem = false; + if (!rclconfig->getConfParam("querystemminglanguage", stemlang)) + stemlang = "english"; + } + reslist_current = -1; reslist_winfirst = -1; QCString u8 = queryText->text().utf8(); - if (!rcldb->setQuery(string((const char *)u8))) + if (!rcldb->setQuery(string((const char *)u8), dostem ? 
+ Rcl::Db::QO_STEM : Rcl::Db::QO_NONE, stemlang)) return; list terms; listNextPB_clicked(); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 51fa78c7..0781dad4 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.22 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -452,10 +452,49 @@ class wsQData : public TextSplitCB { } }; +#include -bool Rcl::Db::setQuery(const std::string &iqstring) +// Expand term to list of all terms which expand to the same term. +// This is currently awfully inefficient as we actually stem the whole +// db term list ! Need to build an efficient structure when finishing +// indexing, but good enough for testing +static list stemexpand(Native *ndb, string term, const string& lang) { - LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str())); + list explist; + try { + Xapian::Stem stemmer(lang); + string stem = stemmer.stem_word(term); + LOGDEB(("stemexpand: term '%s' stem '%s'\n", + term.c_str(), stem.c_str())); + Xapian::TermIterator it; + for (it = ndb->db.allterms_begin(); + it != ndb->db.allterms_end(); it++) { + string stem1 = stemmer.stem_word(*it); + if (stem == stem1) + explist.push_back(*it); + } + if (explist.size() == 0) + explist.push_back(term); + if (1) { + string expanded; + for (list::const_iterator it = explist.begin(); + it != explist.end(); it++) { + expanded += *it + " "; + } + LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str())); + } + } catch (...) { + LOGERR(("Stemming failed: no stemmer for %s ? 
\n", lang.c_str())); + explist.push_back(term); + } + return explist; +} + +bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, + const string& stemlang) +{ + LOGDEB(("Rcl::Db::setQuery: q: '%s', opts 0x%x, stemlang %s\n", + iqstring.c_str(), (unsigned int)opts, stemlang.c_str())); Native *ndb = (Native *)pdata; if (!ndb) return false; @@ -465,13 +504,14 @@ bool Rcl::Db::setQuery(const std::string &iqstring) return false; } - // First extract phrases: + // First split into (possibly single word) phrases ("this is a phrase"): list phrases; ConfTree::stringToStrings(qstring, phrases); for (list::const_iterator i=phrases.begin(); i != phrases.end();i++) { LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str())); } + list pqueries; for (list::const_iterator it = phrases.begin(); it != phrases.end(); it++) { @@ -482,8 +522,16 @@ bool Rcl::Db::setQuery(const std::string &iqstring) LOGDEB(("Splitter term count: %d\n", splitData.terms.size())); switch(splitData.terms.size()) { case 0: continue;// ?? 
- case 1: - pqueries.push_back(Xapian::Query(splitData.terms.front())); + case 1: { + list exp; + if (opts & QO_STEM) + exp = stemexpand(ndb, splitData.terms.front(), stemlang); + else + exp.push_back(splitData.terms.front()); + pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, + exp.begin(), + exp.end())); + } break; default: LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str())); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 0bbc43d1..8a78e197 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,6 +1,6 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.10 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.11 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -76,7 +76,9 @@ class Db { // Query-related functions // Parse query string and initialize query - bool setQuery(const string &q); + enum QueryOpts {QO_NONE=0, QO_STEM = 1}; + bool setQuery(const string &q, QueryOpts opts = QO_NONE, + const string& stemlang = "english"); bool getQueryTerms(std::list& terms); // Get document at rank i. This is probably vastly inferior to the type diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 59866153..72798205 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -1,20 +1,38 @@ -# @(#$Id: recoll.conf,v 1.2 2005-02-04 09:30:44 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: recoll.conf,v 1.3 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes -# Recoll default configuration file: this will index your home directory +# Recoll default configuration file. This should be copied to +# ~/.recoll/recoll.conf + +# Space-separated list of directories to index topdirs = ~ +# Use stemming of query terms or not (ie: expand search for floors to +# floor, flooring, etc... 
There is currently a serious performance hit for +# this (at query time), but you can try it; it may be acceptable depending +# on your database size. +querystemming = 0 +querystemminglanguage = english + +# Name of file suffix to mime-type map file. mimemapfile = mimemap +# Name of mime-type to filter type/name map file. mimeconffile = mimeconf +# Where to store the database. dbdir = ~/.recoll/xapiandb +# Default character set. Values found inside files, e.g. the content tag in HTML +# documents, will override this. It can be specified per directory (see +# below). Used when converting to UTF-8 (internal storage format). defaultcharset = iso-8859-1 defaultlanguage = french + +# Guessing charsets usually does not work well. guesscharset = 0 # You could specify different parameters for a subdirectory like this: (no # tilde substitution there for now, sorry) #[/home/me/englishdocs/plain] -#defaultlanguage = english +#defaultcharset = iso-8859-2