*** empty log message ***

This commit is contained in:
dockes 2005-02-08 14:54:38 +00:00
parent 2e35f674a6
commit fe550bf0e8
8 changed files with 98 additions and 20 deletions

View File

@ -1,7 +1,7 @@
all:
cd lib;make
cd index;make
cd qtgui;rm -f recoll;make
cd qtgui;qmake recoll.pro ; rm -f recoll;make
clean:
cd common;make clean
cd index;make clean

View File

@ -1,4 +1,4 @@
@(#$Id: README,v 1.1 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes
@(#$Id: README,v 1.2 2005-02-08 14:54:38 dockes Exp $ (C) 2004 J.F.Dockes
Hello.
@ -13,9 +13,8 @@ It will become much better in the near future.
What it has:
- Easy installation. No db, web server or exotic language necessary. The
binary packages are statically linked and should run almost as soon as
unpacked. The idea is that EVERYBODY should index their files because it
- Easy installation. No db, web server or exotic language necessary.
The idea is that EVERYBODY should index their files because it
makes life easier.
- Indexes text, pdf, html, postscript. Deals with compressed versions of
same.

View File

@ -23,8 +23,7 @@ unix {
UI_DIR = .ui
MOC_DIR = .moc
OBJECTS_DIR = .obj
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \
-lfontconfig -lfreetype -lexpat -lz
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
INCLUDEPATH += ../common ../index ../query ../unac ../utils
}

View File

@ -311,6 +311,8 @@
<include location="local" impldecl="in implementation">recollmain.ui.h</include>
</includes>
<variables>
<variable>std::string stemlang;</variable>
<variable>bool dostem;</variable>
<variable>int reslist_current;</variable>
<variable>int reslist_winfirst;</variable>
</variables>

View File

@ -259,14 +259,24 @@ void RecollMain::queryText_returnPressed()
"to complete?");
return;
}
}
if (stemlang.empty()) {
string param;
if (rclconfig->getConfParam("querystemming", param))
dostem = ConfTree::stringToBool(param);
else
dostem = false;
if (!rclconfig->getConfParam("querystemminglanguage", stemlang))
stemlang = "english";
}
reslist_current = -1;
reslist_winfirst = -1;
QCString u8 = queryText->text().utf8();
if (!rcldb->setQuery(string((const char *)u8)))
if (!rcldb->setQuery(string((const char *)u8), dostem ?
Rcl::Db::QO_STEM : Rcl::Db::QO_NONE, stemlang))
return;
list<string> terms;
listNextPB_clicked();

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.22 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -452,10 +452,49 @@ class wsQData : public TextSplitCB {
}
};
#include <xapian/stem.h>
bool Rcl::Db::setQuery(const std::string &iqstring)
// Expand a term to the list of all index terms which share its stem.
//
// The stem of 'term' is computed with a Xapian stemmer for language 'lang',
// then every term returned by the database's allterms iterator is stemmed
// and kept if its stem is equal to the stem of 'term'.
//
// Parameters:
//   ndb:  native database wrapper; its 'db' member supplies the term list.
//   term: the query term to expand.
//   lang: stemmer language name, passed to the Xapian::Stem constructor.
// Returns: the list of matching terms, never empty: 'term' itself is used
//   as a fallback when nothing matches or when an exception is caught.
//
// This is currently awfully inefficient as we actually stem the whole
// db term list! Need to build an efficient structure when finishing
// indexing, but good enough for testing.
static list<string> stemexpand(Native *ndb, string term, const string& lang)
{
// NOTE(review): 'iqstring' is not a parameter or local of this function;
// this line looks like a leftover from the old setQuery() body -- confirm.
LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str()));
list<string> explist;
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
LOGDEB(("stemexpand: term '%s' stem '%s'\n",
term.c_str(), stem.c_str()));
// Walk the whole term list, keeping the terms with the same stem
Xapian::TermIterator it;
for (it = ndb->db.allterms_begin();
it != ndb->db.allterms_end(); it++) {
string stem1 = stemmer.stem_word(*it);
if (stem == stem1)
explist.push_back(*it);
}
// No term in the db has this stem: fall back to the original term
if (explist.size() == 0)
explist.push_back(term);
// Always-enabled debug trace of the expansion result
if (1) {
string expanded;
for (list<string>::const_iterator it = explist.begin();
it != explist.end(); it++) {
expanded += *it + " ";
}
LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str()));
}
} catch (...) {
// Any exception ends up here, not only a missing stemmer. Terms pushed
// before the exception stay in the list; the original term is appended.
LOGERR(("Stemming failed: no stemmer for %s ? \n", lang.c_str()));
explist.push_back(term);
}
return explist;
}
bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
const string& stemlang)
{
LOGDEB(("Rcl::Db::setQuery: q: '%s', opts 0x%x, stemlang %s\n",
iqstring.c_str(), (unsigned int)opts, stemlang.c_str()));
Native *ndb = (Native *)pdata;
if (!ndb)
return false;
@ -465,13 +504,14 @@ bool Rcl::Db::setQuery(const std::string &iqstring)
return false;
}
// First extract phrases:
// First split into (possibly single word) phrases ("this is a phrase"):
list<string> phrases;
ConfTree::stringToStrings(qstring, phrases);
for (list<string>::const_iterator i=phrases.begin();
i != phrases.end();i++) {
LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str()));
}
list<Xapian::Query> pqueries;
for (list<string>::const_iterator it = phrases.begin();
it != phrases.end(); it++) {
@ -482,8 +522,16 @@ bool Rcl::Db::setQuery(const std::string &iqstring)
LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
switch(splitData.terms.size()) {
case 0: continue;// ??
case 1:
pqueries.push_back(Xapian::Query(splitData.terms.front()));
case 1: {
list<string> exp;
if (opts & QO_STEM)
exp = stemexpand(ndb, splitData.terms.front(), stemlang);
else
exp.push_back(splitData.terms.front());
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(),
exp.end()));
}
break;
default:
LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str()));

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.10 2005-02-08 11:59:08 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.11 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -76,7 +76,9 @@ class Db {
// Query-related functions
// Parse query string and initialize query
bool setQuery(const string &q);
enum QueryOpts {QO_NONE=0, QO_STEM = 1};
bool setQuery(const string &q, QueryOpts opts = QO_NONE,
const string& stemlang = "english");
bool getQueryTerms(std::list<string>& terms);
// Get document at rank i. This is probably vastly inferior to the type

View File

@ -1,20 +1,38 @@
# @(#$Id: recoll.conf,v 1.2 2005-02-04 09:30:44 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: recoll.conf,v 1.3 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll default configuration file: this will index your home directory
# Recoll default configuration file. This should be copied to
# ~/.recoll/recoll.conf
# Space-separated list of directories to index
topdirs = ~
# Use stemming of query terms or not (e.g. expand a search for floors to
# floor, flooring, etc.). There is currently a serious performance hit for
# this (at query time), but you can try it: it may be acceptable depending
# on your database size.
querystemming = 0
querystemminglanguage = english
# Name of file suffix to mime-type map file.
mimemapfile = mimemap
# Name of mime-type to filter type/name map file.
mimeconffile = mimeconf
# Where to store the database.
dbdir = ~/.recoll/xapiandb
# Default character set. Values found inside files, e.g. the content tag in
# html documents, will override this. It can be specified per directory (see
# below). Used when converting to utf-8 (internal storage format).
defaultcharset = iso-8859-1
defaultlanguage = french
# Guessing charsets usually does not work well
guesscharset = 0
# You could specify different parameters for a subdirectory like this: (no
# tilde substitution there for now, sorry)
#[/home/me/englishdocs/plain]
#defaultlanguage = english
#defaultcharset = iso-8859-2