implemented stem databases

This commit is contained in:
dockes 2005-02-10 15:21:12 +00:00
parent 3fc0738c81
commit 1a897c47b3
9 changed files with 282 additions and 89 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.5 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.6 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -34,7 +34,7 @@ using namespace std;
/**
* Bunch holder for data used while indexing a directory tree
*/
class DbIndexer {
class DbIndexer : public FsTreeWalkerCB {
FsTreeWalker walker;
RclConfig *config;
string dbdir;
@ -46,7 +46,7 @@ class DbIndexer {
: config(cnf), dbdir(dbd), topdirs(top)
{ }
~DbIndexer() {
virtual ~DbIndexer() {
if (tmpdir.length()) {
wipedir(tmpdir);
if (rmdir(tmpdir.c_str()) < 0) {
@ -55,9 +55,9 @@ class DbIndexer {
}
}
}
friend FsTreeWalker::Status
indexfile(void *, const std::string &, const struct stat *,
FsTreeWalker::CbFlag);
FsTreeWalker::Status
processone(const std::string &, const struct stat *, FsTreeWalker::CbFlag);
bool index();
};
@ -79,7 +79,7 @@ bool DbIndexer::index()
it != topdirs->end(); it++) {
LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
dbdir.c_str()));
if (walker.walk(*it, indexfile, this) != FsTreeWalker::FtwOk) {
if (walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
LOGERR(("DbIndexer::index: error while indexing %s\n",
it->c_str()));
db.close();
@ -87,6 +87,18 @@ bool DbIndexer::index()
}
}
db.purge();
// Create stemming databases
string slangs;
if (config->getConfParam("indexstemminglanguages", slangs)) {
list<string> langs;
ConfTree::stringToStrings(slangs, langs);
for (list<string>::const_iterator it = langs.begin();
it != langs.end(); it++) {
db.createStemDb(*it);
}
}
if (!db.close()) {
LOGERR(("DbIndexer::index: error closing database in %s\n",
dbdir.c_str()));
@ -105,26 +117,24 @@ bool DbIndexer::index()
* the actual indexing work.
*/
FsTreeWalker::Status
indexfile(void *cdata, const std::string &fn, const struct stat *stp,
FsTreeWalker::CbFlag flg)
DbIndexer::processone(const std::string &fn, const struct stat *stp,
FsTreeWalker::CbFlag flg)
{
DbIndexer *me = (DbIndexer *)cdata;
// If we're changing directories, possibly adjust parameters.
if (flg == FsTreeWalker::FtwDirEnter ||
flg == FsTreeWalker::FtwDirReturn) {
me->config->setKeyDir(fn);
config->setKeyDir(fn);
return FsTreeWalker::FtwOk;
}
// Check db up to date ?
if (!me->db.needUpdate(fn, stp)) {
if (!db.needUpdate(fn, stp)) {
LOGDEB(("indexfile: up to date: %s\n", fn.c_str()));
return FsTreeWalker::FtwOk;
}
Rcl::Doc doc;
if (!internfile(fn, me->config, doc, me->tmpdir))
if (!internfile(fn, config, doc, tmpdir))
return FsTreeWalker::FtwOk;
// Set up common fields:
@ -133,7 +143,7 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
doc.mtime = ascdate;
// Do database-specific work to update document data
if (!me->db.add(fn, doc))
if (!db.add(fn, doc))
return FsTreeWalker::FtwError;
return FsTreeWalker::FtwOk;

View File

@ -86,11 +86,7 @@ class myTextSplitCB : public TextSplitCB {
static string plaintorich(const string &in, const list<string>& terms,
list<pair<int, int> >&termoffsets)
{
{string t;
for (list<string>::const_iterator it = terms.begin();
it != terms.end();it++) t += "'" + *it + "' ";
LOGDEB(("plaintorich: terms: %s\n", t.c_str()));
}
LOGDEB(("plaintorich: terms: %s\n", stringlistdisp(terms).c_str()));
myTextSplitCB cb(terms);
TextSplit splitter(&cb, true);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.24 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -16,8 +16,11 @@ using namespace std;
#include "unacpp.h"
#include "conftree.h"
#include "debuglog.h"
#include "pathut.h"
#include "smallut.h"
#include "xapian.h"
#include <xapian/stem.h>
// Data for a xapian database. There could actually be 2 different
// ones for indexing or query as there is not much in common.
@ -25,6 +28,8 @@ class Native {
public:
bool isopen;
bool iswritable;
string basedir;
// Indexing
Xapian::WritableDatabase wdb;
vector<bool> updated;
@ -102,9 +107,6 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
ndb->iswritable = true;
break;
case DbTrunc:
ndb->wdb =
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
ndb->iswritable = true;
break;
case DbRO:
default:
@ -113,6 +115,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
break;
}
ndb->isopen = true;
ndb->basedir = dir;
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
@ -399,17 +402,152 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
return true;
}
/// Compute name of stem db for given base database and language
static string stemdbname(const string& basename, string lang)
{
    // The stem database lives in a "stem_<lang>" entry under the
    // main database directory.
    const string leaf = string("stem_") + lang;
    string dbpath(basename);
    path_cat(dbpath, leaf);
    return dbpath;
}
// Predicate: true when c is a 7-bit value which is not a lowercase
// ascii letter. Values of 128 and above (e.g. bytes of multibyte UTF-8
// sequences) are let through, i.e. yield false.
inline static bool
p_notlowerorutf(unsigned int c)
{
    const bool belowlower = c < 'a';
    const bool sevenbitabovelower = c > 'z' && c < 128;
    return belowlower || sevenbitabovelower;
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
* with documents indexed by a single term (the stem), and with the list of
* parent terms in the document data.
*/
bool Rcl::Db::createStemDb(const string& lang)
{
LOGDEB(("Rcl::Db::createStemDb(%s)\n", lang.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false || ndb->iswritable == false)
return false;
// First build the in-memory stem database:
// We walk the list of all terms, and stem each.
// If the stem is identical to the term, no need to create an entry
// Else, we add an entry to the multimap.
// At the end, we only save stem-terms associations with several terms, the
// others are not useful
multimap<string, string> assocs;
// Statistics
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
int stemconst=0; // Stem == term
int stemdiff=0; // Count of all different stems
int stemmultiple = 0; // Count of stems with multiple derivatives
try {
Xapian::Stem stemmer(lang);
Xapian::TermIterator it;
for (it = ndb->wdb.allterms_begin();
it != ndb->wdb.allterms_end(); it++) {
// If it has any non-lowercase 7bit char, cant be stemmable
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
++nostem;
// LOGDEB(("stemskipped: '%s', because of 0x%x\n",
// (*it).c_str(), *sit));
continue;
}
string stem = stemmer.stem_word(*it);
//cerr << "word " << *it << " stem " << stem << endl;
if (stem == *it) {
++stemconst;
continue;
}
assocs.insert(pair<string,string>(stem, *it));
}
} catch (...) {
LOGERR(("Stem database build failed: no stemmer for %s ? \n",
lang.c_str()));
return false;
}
// Create xapian database for stem relations
string stemdbdir = stemdbname(ndb->basedir, lang);
string ermsg = "NOERROR";
Xapian::WritableDatabase sdb;
try {
sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
} catch (const string &s) {
ermsg = s;
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
if (ermsg != "NOERROR") {
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
stemdbdir.c_str(), ermsg.c_str()));
return false;
}
// Enter pseud-docs in db. Walk the multimap, only enter
// associations where there are several parent terms
string stem;
list<string> derivs;
for (multimap<string,string>::const_iterator it = assocs.begin();
it != assocs.end(); it++) {
if (stem == it->first) {
// Staying with same stem
derivs.push_back(it->second);
// cerr << " " << it->second << endl;
} else {
// Changing stems
++stemdiff;
if (derivs.size() > 1) {
// Previous stem has multiple derivatives. Enter in db
++stemmultiple;
Xapian::Document newdocument;
newdocument.add_term(stem);
// The doc data is just parents=blank-separated-list
string record = "parents=";
for (list<string>::const_iterator it = derivs.begin();
it != derivs.end(); it++) {
record += *it + " ";
}
record += "\n";
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
try {
sdb.replace_document(stem, newdocument);
} catch (...) {
LOGERR(("Rcl::Db::createstemdb: replace failed\n"));
return false;
}
}
derivs.clear();
stem = it->first;
derivs.push_back(it->second);
// cerr << "\n" << stem << " " << it->second;
}
}
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
return true;
}
/**
* This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the
* stem database while we are at it.
*/
bool Rcl::Db::purge()
{
LOGDEB(("Rcl::Db::purge\n"));
// There seems to be problems with the document delete code, when
// we do this, the database is not actually updated. Especially,
// if we delete a bunch of docs, so that there is a hole in the
// docids at the beginning, we can't add anything (appears to work
// and does nothing). Maybe related to the exceptions below when
// trying to delete an unexistant document ?
// Flushing before trying the deletes seeems to work around the problem
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
@ -418,6 +556,13 @@ bool Rcl::Db::purge()
if (ndb->isopen == false || ndb->iswritable == false)
return false;
// There seem to be problems with the document delete code: when
// we do this, the database is not actually updated. In particular,
// if we delete a bunch of docs, so that there is a hole in the
// docids at the beginning, we can't add anything (it appears to work
// but does nothing). Maybe related to the exceptions below when
// trying to delete a nonexistent document?
// Flushing before trying the deletes seems to work around the problem
ndb->wdb.flush();
for (Xapian::docid did = 1; did < ndb->updated.size(); ++did) {
if (!ndb->updated[did]) {
@ -429,6 +574,7 @@ bool Rcl::Db::purge()
}
}
}
ndb->wdb.flush();
return true;
}
@ -446,46 +592,57 @@ class wsQData : public TextSplitCB {
return s;
}
bool takeword(const std::string &term, int , int, int) {
LOGDEB(("Takeword: %s\n", term.c_str()));
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
terms.push_back(term);
return true;
}
};
#include <xapian/stem.h>
// Expand term to list of all terms which expand to the same term.
// This is currently awfully inefficient as we actually stem the whole
// db term list ! Need to build an efficient structure when finishing
// indexing, but good enough for testing
// Expand term to list of all terms which stem to the same term.
static list<string> stemexpand(Native *ndb, string term, const string& lang)
{
list<string> explist;
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
LOGDEB(("stemexpand: term '%s' stem '%s'\n",
term.c_str(), stem.c_str()));
Xapian::TermIterator it;
for (it = ndb->db.allterms_begin();
it != ndb->db.allterms_end(); it++) {
string stem1 = stemmer.stem_word(*it);
if (stem == stem1)
explist.push_back(*it);
}
if (explist.size() == 0)
LOGDEB(("stemexpand: '%s' -> '%s'\n", term.c_str(), stem.c_str()));
// Try to fetch the doc from the stem db
string stemdbdir = stemdbname(ndb->basedir, lang);
Xapian::Database sdb(stemdbdir);
LOGDEB1(("Rcl::Db::stemexpand: %s lastdocid: %d\n",
stemdbdir.c_str(), sdb.get_lastdocid()));
if (!sdb.term_exists(stem)) {
LOGDEB1(("Rcl::Db::stemexpand: no term for %s\n", stem.c_str()));
explist.push_back(term);
if (1) {
string expanded;
for (list<string>::const_iterator it = explist.begin();
it != explist.end(); it++) {
expanded += *it + " ";
}
LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str()));
return explist;
}
Xapian::PostingIterator did = sdb.postlist_begin(stem);
if (did == sdb.postlist_end(stem)) {
LOGDEB1(("Rcl::Db::stemexpand: no term(1) for %s\n",stem.c_str()));
explist.push_back(term);
return explist;
}
Xapian::Document doc = sdb.get_document(*did);
string data = doc.get_data();
// No need for a conftree, but we need to massage the data a little
string::size_type pos = data.find_first_of("=");
++pos;
string::size_type pos1 = data.find_last_of("\n");
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
explist.push_back(term);
return explist;
}
ConfTree::stringToStrings(data.substr(pos, pos1-pos), explist);
if (find(explist.begin(), explist.end(), term) == explist.end()) {
explist.push_back(term);
}
LOGDEB(("Rcl::Db::stemexpand: %s -> %s\n", stem.c_str(),
stringlistdisp(explist).c_str()));
} catch (...) {
LOGERR(("Stemming failed: no stemmer for %s ? \n", lang.c_str()));
LOGERR(("stemexpand: error accessing stem db\n"));
explist.push_back(term);
return explist;
}
return explist;
}
@ -519,7 +676,8 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
wsQData splitData;
TextSplit splitter(&splitData, true);
splitter.text_to_words(*it);
LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
LOGDEB1(("Rcl::Db::setquery: splitter term count: %d\n",
splitData.terms.size()));
switch(splitData.terms.size()) {
case 0: continue;// ??
case 1: {
@ -578,7 +736,7 @@ int Rcl::Db::getResCnt()
bool Rcl::Db::getDoc(int i, Doc &doc, int *percent)
{
LOGDEB(("Rcl::Db::getDoc: %d\n", i));
LOGDEB1(("Rcl::Db::getDoc: %d\n", i));
Native *ndb = (Native *)pdata;
if (!ndb || !ndb->enquire) {
LOGERR(("Rcl::Db::getDoc: no query opened\n"));

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.11 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.12 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -72,6 +72,7 @@ class Db {
bool add(const string &filename, const Doc &doc);
bool needUpdate(const string &filename, const struct stat *stp);
bool purge();
bool createStemDb(const string &lang);
// Query-related functions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.5 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_EXECMD
#include <unistd.h>
@ -131,7 +131,7 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
close(pipeout[0]);
if (pipeout[1] >= 0)
close(pipeout[1]);
LOGDEB(("ExecCmd::doexec: father got status 0x%x\n", status));
LOGDEB1(("ExecCmd::doexec: father got status 0x%x\n", status));
return status;
} else {
if (input) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.2 2004-12-12 08:58:12 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_FSTREEWALK
@ -53,8 +53,8 @@ int FsTreeWalker::getErrCnt()
return data->errors;
}
FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
void *cdata)
FsTreeWalker::Status FsTreeWalker::walk(const string &top,
FsTreeWalkerCB& cb)
{
Status status = FtwOk;
struct stat st;
@ -68,12 +68,12 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
return FtwError;
}
if (S_ISDIR(st.st_mode)) {
if ((status = fun(cdata, top, &st, FtwDirEnter)) &
if ((status = cb.processone(top, &st, FtwDirEnter)) &
(FtwStop|FtwError)) {
return status;
}
} else if (S_ISREG(st.st_mode)) {
return fun(cdata, top, &st, FtwRegular);
return cb.processone(top, &st, FtwRegular);
} else {
return status;
}
@ -110,17 +110,17 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
}
if (S_ISDIR(st.st_mode)) {
if (data->options & FtwNoRecurse) {
status = fun(cdata, fn, &st, FtwDirEnter);
status = cb.processone(fn, &st, FtwDirEnter);
} else {
status=walk(fn, fun, cdata);
status=walk(fn, cb);
}
if (status & (FtwStop|FtwError))
goto out;
if ((status = fun(cdata, top, &st, FtwDirReturn))
if ((status = cb.processone(top, &st, FtwDirReturn))
& (FtwStop|FtwError))
goto out;
} else if (S_ISREG(st.st_mode)) {
if ((status = fun(cdata, fn, &st, FtwRegular)) &
if ((status = cb.processone(fn, &st, FtwRegular)) &
(FtwStop|FtwError)) {
goto out;
}
@ -143,17 +143,22 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
using namespace std;
FsTreeWalker::Status walkfunc(void *, const string &path,
const struct stat *st,
FsTreeWalker::CbFlag flg)
{
if (flg == FsTreeWalker::FtwDirEnter) {
cout << "[Entering " << path << "]" << endl;
} else if (flg == FsTreeWalker::FtwRegular) {
cout << path << endl;
class myCB : public FsTreeWalkerCB {
public:
FsTreeWalker::Status processone(const string &path,
const struct stat *st,
FsTreeWalker::CbFlag flg)
{
if (flg == FsTreeWalker::FtwDirEnter) {
cout << "[Entering " << path << "]" << endl;
} else if (flg == FsTreeWalker::FtwDirReturn) {
cout << "[Returning to " << path << "]" << endl;
} else if (flg == FsTreeWalker::FtwRegular) {
cout << path << endl;
}
return FsTreeWalker::FtwOk;
}
return FsTreeWalker::FtwOk;
}
};
int main(int argc, const char **argv)
{
@ -162,7 +167,8 @@ int main(int argc, const char **argv)
exit(1);
}
FsTreeWalker walker;
walker.walk(argv[1], walkfunc, 0);
myCB cb;
walker.walk(argv[1], cb);
if (walker.getErrCnt() > 0)
cout << walker.getReason();
}

View File

@ -1,9 +1,14 @@
#ifndef _FSTREEWALK_H_INCLUDED_
#define _FSTREEWALK_H_INCLUDED_
/* @(#$Id: fstreewalk.h,v 1.1 2004-12-10 18:13:13 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: fstreewalk.h,v 1.2 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#ifndef NO_NAMESPACES
using std::string;
#endif
class FsTreeWalkerCB;
class FsTreeWalker {
public:
@ -12,12 +17,9 @@ class FsTreeWalker {
FtwStatAll = FtwError|FtwStop};
enum Options {FtwOptNone = 0, FtwNoRecurse = 1, FtwFollow = 2};
typedef Status (*CbType)(void *cdata,
const std::string &, const struct stat *, CbFlag);
FsTreeWalker(Options opts = FtwOptNone);
~FsTreeWalker();
Status walk(const std::string &dir, CbType fun, void *cdata);
Status walk(const std::string &dir, FsTreeWalkerCB& cb);
std::string getReason();
int getErrCnt();
private:
@ -25,4 +27,11 @@ class FsTreeWalker {
Internal *data;
};
// Callback interface for FsTreeWalker::walk(). Client code derives from
// this class, implements processone(), and passes the object to walk().
class FsTreeWalkerCB {
public:
// Virtual destructor: the class is meant to be derived from.
virtual ~FsTreeWalkerCB() {}
// Called by the walker with the path of the current entry, its stat
// data, and a flag telling what the call is about (FtwDirEnter,
// FtwDirReturn or FtwRegular). The walker stops and returns the status
// if it has the FtwStop or FtwError bit set.
virtual FsTreeWalker::Status
processone(const string &, const struct stat *, FsTreeWalker::CbFlag)
= 0;
};
#endif /* _FSTREEWALK_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.4 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_SMALLUT
#include <string>
@ -48,6 +48,17 @@ bool maketmpdir(string& tdir)
return true;
}
// Build a printable form of a string list: each element is enclosed in
// square brackets, and the elements are separated by single spaces
// (no trailing space). An empty list yields an empty string.
string stringlistdisp(const list<string>& sl)
{
    string out;
    list<string>::const_iterator cur = sl.begin();
    while (cur != sl.end()) {
        if (cur != sl.begin())
            out += ' ';
        out += '[';
        out += *cur;
        out += ']';
        ++cur;
    }
    return out;
}
int stringicmp(const string & s1, const string& s2)
{
string::const_iterator it1 = s1.begin();

View File

@ -1,14 +1,16 @@
#ifndef _SMALLUT_H_INCLUDED_
#define _SMALLUT_H_INCLUDED_
/* @(#$Id: smallut.h,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: smallut.h,v 1.4 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
using std::string;
using std::list;
extern int stringicmp(const string& s1, const string& s2);
extern int stringlowercmp(const string& alreadylower, const string& s2);
extern int stringuppercmp(const string& alreadyupper, const string& s2);
extern bool maketmpdir(string& tdir);
extern string stringlistdisp(const list<string>& strs);
#endif /* _SMALLUT_H_INCLUDED_ */