diff --git a/src/Makefile b/src/Makefile
index 5bb5c3db..1df2fe3f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,7 +1,7 @@
all:
cd lib;make
cd index;make
- cd qtgui;rm -f recoll;make
+ cd qtgui;qmake recoll.pro ; rm -f recoll;make
clean:
cd common;make clean
cd index;make clean
diff --git a/src/README b/src/README
index 12e17bb3..48690010 100644
--- a/src/README
+++ b/src/README
@@ -1,4 +1,4 @@
- @(#$Id: README,v 1.1 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes
+ @(#$Id: README,v 1.2 2005-02-08 14:54:38 dockes Exp $ (C) 2004 J.F.Dockes
Hello.
@@ -13,9 +13,8 @@ It will become much better in the near future.
What it has:
- - Easy installation. No db, web server or exotic language necessary. The
- binary packages are statically linked and should run almost as soon as
- unpacked. The idea is that EVERYBODY should index their files because it
+ - Easy installation. No db, web server or exotic language necessary.
+ The idea is that EVERYBODY should index their files because it
makes life easier.
- Indexes text, pdf, html, postscript. Deals with compressed versions of
same.
diff --git a/src/qtgui/recoll.pro b/src/qtgui/recoll.pro
index 54f9d33a..e02ce8f4 100644
--- a/src/qtgui/recoll.pro
+++ b/src/qtgui/recoll.pro
@@ -23,8 +23,7 @@ unix {
UI_DIR = .ui
MOC_DIR = .moc
OBJECTS_DIR = .obj
- LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \
- -lfontconfig -lfreetype -lexpat -lz
+ LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
INCLUDEPATH += ../common ../index ../query ../unac ../utils
}
diff --git a/src/qtgui/recollmain.ui b/src/qtgui/recollmain.ui
index daa19bca..b8f644b4 100644
--- a/src/qtgui/recollmain.ui
+++ b/src/qtgui/recollmain.ui
@@ -311,6 +311,8 @@
recollmain.ui.h
+ std::string stemlang;
+ bool dostem;
int reslist_current;
int reslist_winfirst;
diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h
index b56d5e49..13d937f4 100644
--- a/src/qtgui/recollmain.ui.h
+++ b/src/qtgui/recollmain.ui.h
@@ -259,14 +259,24 @@ void RecollMain::queryText_returnPressed()
"to complete?");
return;
}
-
}
+ if (stemlang.empty()) {
+ string param;
+ if (rclconfig->getConfParam("querystemming", param))
+ dostem = ConfTree::stringToBool(param);
+ else
+ dostem = false;
+ if (!rclconfig->getConfParam("querystemminglanguage", stemlang))
+ stemlang = "english";
+ }
+
reslist_current = -1;
reslist_winfirst = -1;
QCString u8 = queryText->text().utf8();
- if (!rcldb->setQuery(string((const char *)u8)))
+ if (!rcldb->setQuery(string((const char *)u8), dostem ?
+ Rcl::Db::QO_STEM : Rcl::Db::QO_NONE, stemlang))
return;
list terms;
listNextPB_clicked();
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 51fa78c7..0781dad4 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.22 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include
#include
@@ -452,10 +452,49 @@ class wsQData : public TextSplitCB {
}
};
+#include
-bool Rcl::Db::setQuery(const std::string &iqstring)
+// Expand term to the list of all db terms sharing its stem in language lang
+// (just term itself if none match or if stemming throws). Awfully inefficient
+// for now: every call stems the whole db term list. An efficient structure
+// should be built when indexing finishes, but this is good enough for testing.
+static list stemexpand(Native *ndb, string term, const string& lang)
{
- LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str()));
+ list explist;
+ try {
+ Xapian::Stem stemmer(lang);
+ string stem = stemmer.stem_word(term);
+ LOGDEB(("stemexpand: term '%s' stem '%s'\n",
+ term.c_str(), stem.c_str()));
+ Xapian::TermIterator it;
+ for (it = ndb->db.allterms_begin();
+ it != ndb->db.allterms_end(); it++) {
+ string stem1 = stemmer.stem_word(*it);
+ if (stem == stem1)
+ explist.push_back(*it);
+ }
+ if (explist.size() == 0)
+ explist.push_back(term);
+ if (1) {
+ string expanded;
+ for (list::const_iterator it = explist.begin();
+ it != explist.end(); it++) {
+ expanded += *it + " ";
+ }
+ LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str()));
+ }
+ } catch (...) {
+ LOGERR(("Stemming failed: no stemmer for %s ? \n", lang.c_str()));
+ explist.push_back(term);
+ }
+ return explist;
+}
+
+bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
+ const string& stemlang)
+{
+ LOGDEB(("Rcl::Db::setQuery: q: '%s', opts 0x%x, stemlang %s\n",
+ iqstring.c_str(), (unsigned int)opts, stemlang.c_str()));
Native *ndb = (Native *)pdata;
if (!ndb)
return false;
@@ -465,13 +504,14 @@ bool Rcl::Db::setQuery(const std::string &iqstring)
return false;
}
- // First extract phrases:
+ // First split into (possibly single word) phrases ("this is a phrase"):
list phrases;
ConfTree::stringToStrings(qstring, phrases);
for (list::const_iterator i=phrases.begin();
i != phrases.end();i++) {
LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str()));
}
+
list pqueries;
for (list::const_iterator it = phrases.begin();
it != phrases.end(); it++) {
@@ -482,8 +522,16 @@ bool Rcl::Db::setQuery(const std::string &iqstring)
LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
switch(splitData.terms.size()) {
case 0: continue;// ??
- case 1:
- pqueries.push_back(Xapian::Query(splitData.terms.front()));
+ case 1: {
+ list exp;
+ if (opts & QO_STEM)
+ exp = stemexpand(ndb, splitData.terms.front(), stemlang);
+ else
+ exp.push_back(splitData.terms.front());
+ pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
+ exp.begin(),
+ exp.end()));
+ }
break;
default:
LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str()));
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 0bbc43d1..8a78e197 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.10 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.11 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes */
#include
#include
@@ -76,7 +76,9 @@ class Db {
// Query-related functions
// Parse query string and initialize query
- bool setQuery(const string &q);
+ enum QueryOpts {QO_NONE=0, QO_STEM = 1};
+ bool setQuery(const string &q, QueryOpts opts = QO_NONE,
+ const string& stemlang = "english");
bool getQueryTerms(std::list& terms);
// Get document at rank i. This is probably vastly inferior to the type
diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf
index 59866153..72798205 100644
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@@ -1,20 +1,38 @@
-# @(#$Id: recoll.conf,v 1.2 2005-02-04 09:30:44 dockes Exp $ (C) 2004 J.F.Dockes
+# @(#$Id: recoll.conf,v 1.3 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes
-# Recoll default configuration file: this will index your home directory
+# Recoll default configuration file. This should be copied to
+# ~/.recoll/recoll.conf
+
+# Space-separated list of directories to index
topdirs = ~
+# Use stemming of query terms or not (i.e. expand a search for floors to
+# floor, flooring, etc.). There is currently a serious performance hit for
+# this (at query time), but you can try it: it may be acceptable depending
+# on your database size.
+querystemming = 0
+querystemminglanguage = english
+
+# Name of the file which maps file name suffixes to mime types.
mimemapfile = mimemap
+# Name of the file which maps mime types to filter types/names.
mimeconffile = mimeconf
+# Where to store the database.
dbdir = ~/.recoll/xapiandb
+# Default character set. Values found inside files, e.g. the content tag in
+# HTML documents, will override this. It can be specified per directory (see
+# below). Used when converting to UTF-8 (the internal storage format).
defaultcharset = iso-8859-1
defaultlanguage = french
+
+# Guessing charsets usually does not work well
guesscharset = 0
# You could specify different parameters for a subdirectory like this: (no
# tilde substitution there for now, sorry)
#[/home/me/englishdocs/plain]
-#defaultlanguage = english
+#defaultcharset = iso-8859-2