*** empty log message ***

This commit is contained in:
dockes 2005-02-08 14:54:38 +00:00
parent 2e35f674a6
commit fe550bf0e8
8 changed files with 98 additions and 20 deletions

View File

@ -1,7 +1,7 @@
all: all:
cd lib;make cd lib;make
cd index;make cd index;make
cd qtgui;rm -f recoll;make cd qtgui;qmake recoll.pro ; rm -f recoll;make
clean: clean:
cd common;make clean cd common;make clean
cd index;make clean cd index;make clean

View File

@ -1,4 +1,4 @@
@(#$Id: README,v 1.1 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes @(#$Id: README,v 1.2 2005-02-08 14:54:38 dockes Exp $ (C) 2004 J.F.Dockes
Hello. Hello.
@ -13,9 +13,8 @@ It will become much better in the near future.
What it has: What it has:
- Easy installation. No db, web server or exotic language necessary. The - Easy installation. No db, web server or exotic language necessary.
binary packages are statically linked and should run almost as soon as The idea is that EVERYBODY should index their files because it
unpacked. The idea is that EVERYBODY should index their files because it
makes life easier. makes life easier.
- Indexes text, pdf, html, postscript. Deals with compressed versions of - Indexes text, pdf, html, postscript. Deals with compressed versions of
same. same.

View File

@ -23,8 +23,7 @@ unix {
UI_DIR = .ui UI_DIR = .ui
MOC_DIR = .moc MOC_DIR = .moc
OBJECTS_DIR = .obj OBJECTS_DIR = .obj
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \ LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
-lfontconfig -lfreetype -lexpat -lz
INCLUDEPATH += ../common ../index ../query ../unac ../utils INCLUDEPATH += ../common ../index ../query ../unac ../utils
} }

View File

@ -311,6 +311,8 @@
<include location="local" impldecl="in implementation">recollmain.ui.h</include> <include location="local" impldecl="in implementation">recollmain.ui.h</include>
</includes> </includes>
<variables> <variables>
<variable>std::string stemlang;</variable>
<variable>bool dostem;</variable>
<variable>int reslist_current;</variable> <variable>int reslist_current;</variable>
<variable>int reslist_winfirst;</variable> <variable>int reslist_winfirst;</variable>
</variables> </variables>

View File

@ -259,14 +259,24 @@ void RecollMain::queryText_returnPressed()
"to complete?"); "to complete?");
return; return;
} }
} }
if (stemlang.empty()) {
string param;
if (rclconfig->getConfParam("querystemming", param))
dostem = ConfTree::stringToBool(param);
else
dostem = false;
if (!rclconfig->getConfParam("querystemminglanguage", stemlang))
stemlang = "english";
}
reslist_current = -1; reslist_current = -1;
reslist_winfirst = -1; reslist_winfirst = -1;
QCString u8 = queryText->text().utf8(); QCString u8 = queryText->text().utf8();
if (!rcldb->setQuery(string((const char *)u8))) if (!rcldb->setQuery(string((const char *)u8), dostem ?
Rcl::Db::QO_STEM : Rcl::Db::QO_NONE, stemlang))
return; return;
list<string> terms; list<string> terms;
listNextPB_clicked(); listNextPB_clicked();

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.22 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -452,10 +452,49 @@ class wsQData : public TextSplitCB {
} }
}; };
#include <xapian/stem.h>
bool Rcl::Db::setQuery(const std::string &iqstring) // Expand term to list of all terms which expand to the same term.
// This is currently awfully inefficient, as we actually stem the whole
// db term list! We need to build an efficient structure when finishing
// indexing, but this is good enough for testing.
// Build the list of database terms whose stem, computed with the stemmer
// for language `lang`, is equal to the stem of `term`.
//   ndb  - native handle; its `db` member supplies the term iterators
//          (allterms_begin() / allterms_end()) walked below.
//   term - the query term to expand (taken by value).
//   lang - language name handed to the Xapian::Stem constructor.
// Returns the matching terms; if none matched, or if an exception was
// thrown, the returned list contains `term` itself.
static list<string> stemexpand(Native *ndb, string term, const string& lang)
{ {
// NOTE(review): the first statement on the next line logs a setQuery
// message and uses `iqstring`, which is not a parameter of this function;
// it looks like residue from the old column of the diff view -- confirm.
LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str())); list<string> explist;
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
LOGDEB(("stemexpand: term '%s' stem '%s'\n",
term.c_str(), stem.c_str()));
// Walk every term returned by the database's all-terms iterator, stem
// each one, and keep those whose stem equals the stem of `term`.
Xapian::TermIterator it;
for (it = ndb->db.allterms_begin();
it != ndb->db.allterms_end(); it++) {
string stem1 = stemmer.stem_word(*it);
if (stem == stem1)
explist.push_back(*it);
}
// No term in the database shares the stem: fall back to the term itself.
if (explist.size() == 0)
explist.push_back(term);
// Always-enabled debug block: concatenate the expansion (space separated)
// so that it can be written to the debug log in a single message.
if (1) {
string expanded;
for (list<string>::const_iterator it = explist.begin();
it != explist.end(); it++) {
expanded += *it + " ";
}
LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str()));
}
} catch (...) {
// Any exception thrown above lands here and is reported as a missing
// stemmer. explist is not cleared, so terms collected before the
// exception are kept and `term` is appended after them.
LOGERR(("Stemming failed: no stemmer for %s ? \n", lang.c_str()));
explist.push_back(term);
}
return explist;
}
bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
const string& stemlang)
{
LOGDEB(("Rcl::Db::setQuery: q: '%s', opts 0x%x, stemlang %s\n",
iqstring.c_str(), (unsigned int)opts, stemlang.c_str()));
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;
if (!ndb) if (!ndb)
return false; return false;
@ -465,13 +504,14 @@ bool Rcl::Db::setQuery(const std::string &iqstring)
return false; return false;
} }
// First extract phrases: // First split into (possibly single word) phrases ("this is a phrase"):
list<string> phrases; list<string> phrases;
ConfTree::stringToStrings(qstring, phrases); ConfTree::stringToStrings(qstring, phrases);
for (list<string>::const_iterator i=phrases.begin(); for (list<string>::const_iterator i=phrases.begin();
i != phrases.end();i++) { i != phrases.end();i++) {
LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str())); LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str()));
} }
list<Xapian::Query> pqueries; list<Xapian::Query> pqueries;
for (list<string>::const_iterator it = phrases.begin(); for (list<string>::const_iterator it = phrases.begin();
it != phrases.end(); it++) { it != phrases.end(); it++) {
@ -482,8 +522,16 @@ bool Rcl::Db::setQuery(const std::string &iqstring)
LOGDEB(("Splitter term count: %d\n", splitData.terms.size())); LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
switch(splitData.terms.size()) { switch(splitData.terms.size()) {
case 0: continue;// ?? case 0: continue;// ??
case 1: case 1: {
pqueries.push_back(Xapian::Query(splitData.terms.front())); list<string> exp;
if (opts & QO_STEM)
exp = stemexpand(ndb, splitData.terms.front(), stemlang);
else
exp.push_back(splitData.terms.front());
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(),
exp.end()));
}
break; break;
default: default:
LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str())); LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str()));

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.10 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.11 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -76,7 +76,9 @@ class Db {
// Query-related functions // Query-related functions
// Parse query string and initialize query // Parse query string and initialize query
bool setQuery(const string &q); enum QueryOpts {QO_NONE=0, QO_STEM = 1};
bool setQuery(const string &q, QueryOpts opts = QO_NONE,
const string& stemlang = "english");
bool getQueryTerms(std::list<string>& terms); bool getQueryTerms(std::list<string>& terms);
// Get document at rank i. This is probably vastly inferior to the type // Get document at rank i. This is probably vastly inferior to the type

View File

@ -1,20 +1,38 @@
# @(#$Id: recoll.conf,v 1.2 2005-02-04 09:30:44 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: recoll.conf,v 1.3 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll default configuration file: this will index your home directory # Recoll default configuration file. This should be copied to
# ~/.recoll/recoll.conf
# Space-separated list of directories to index
topdirs = ~ topdirs = ~
# Whether to stem query terms (i.e. expand a search for "floors" to
# "floor", "flooring", etc.). There is currently a serious performance hit
# for this at query time, but you can try it: it may be acceptable
# depending on your database size.
querystemming = 0
querystemminglanguage = english
# Name of file suffix to mime-type map file.
mimemapfile = mimemap mimemapfile = mimemap
# Name of mime-type to filter type/name map file.
mimeconffile = mimeconf mimeconffile = mimeconf
# Where to store the database.
dbdir = ~/.recoll/xapiandb dbdir = ~/.recoll/xapiandb
# Default character set. Values found inside files, e.g. the content tag in
# HTML documents, will override this. It can be specified per directory (see
# below). Used when converting to UTF-8 (the internal storage format).
defaultcharset = iso-8859-1 defaultcharset = iso-8859-1
defaultlanguage = french defaultlanguage = french
# Guessing charsets usually does not work well
guesscharset = 0 guesscharset = 0
# You could specify different parameters for a subdirectory like this: (no # You could specify different parameters for a subdirectory like this: (no
# tilde substitution there for now, sorry) # tilde substitution there for now, sorry)
#[/home/me/englishdocs/plain] #[/home/me/englishdocs/plain]
#defaultlanguage = english #defaultcharset = iso-8859-2