implemented stem databases
This commit is contained in:
parent
3fc0738c81
commit
1a897c47b3
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.5 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.6 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -34,7 +34,7 @@ using namespace std;
|
|||||||
/**
|
/**
|
||||||
* Bunch holder for data used while indexing a directory tree
|
* Bunch holder for data used while indexing a directory tree
|
||||||
*/
|
*/
|
||||||
class DbIndexer {
|
class DbIndexer : public FsTreeWalkerCB {
|
||||||
FsTreeWalker walker;
|
FsTreeWalker walker;
|
||||||
RclConfig *config;
|
RclConfig *config;
|
||||||
string dbdir;
|
string dbdir;
|
||||||
@ -46,7 +46,7 @@ class DbIndexer {
|
|||||||
: config(cnf), dbdir(dbd), topdirs(top)
|
: config(cnf), dbdir(dbd), topdirs(top)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
~DbIndexer() {
|
virtual ~DbIndexer() {
|
||||||
if (tmpdir.length()) {
|
if (tmpdir.length()) {
|
||||||
wipedir(tmpdir);
|
wipedir(tmpdir);
|
||||||
if (rmdir(tmpdir.c_str()) < 0) {
|
if (rmdir(tmpdir.c_str()) < 0) {
|
||||||
@ -55,9 +55,9 @@ class DbIndexer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
friend FsTreeWalker::Status
|
|
||||||
indexfile(void *, const std::string &, const struct stat *,
|
FsTreeWalker::Status
|
||||||
FsTreeWalker::CbFlag);
|
processone(const std::string &, const struct stat *, FsTreeWalker::CbFlag);
|
||||||
|
|
||||||
bool index();
|
bool index();
|
||||||
};
|
};
|
||||||
@ -79,7 +79,7 @@ bool DbIndexer::index()
|
|||||||
it != topdirs->end(); it++) {
|
it != topdirs->end(); it++) {
|
||||||
LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
|
LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
|
||||||
dbdir.c_str()));
|
dbdir.c_str()));
|
||||||
if (walker.walk(*it, indexfile, this) != FsTreeWalker::FtwOk) {
|
if (walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
|
||||||
LOGERR(("DbIndexer::index: error while indexing %s\n",
|
LOGERR(("DbIndexer::index: error while indexing %s\n",
|
||||||
it->c_str()));
|
it->c_str()));
|
||||||
db.close();
|
db.close();
|
||||||
@ -87,6 +87,18 @@ bool DbIndexer::index()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
db.purge();
|
db.purge();
|
||||||
|
|
||||||
|
// Create stemming databases
|
||||||
|
string slangs;
|
||||||
|
if (config->getConfParam("indexstemminglanguages", slangs)) {
|
||||||
|
list<string> langs;
|
||||||
|
ConfTree::stringToStrings(slangs, langs);
|
||||||
|
for (list<string>::const_iterator it = langs.begin();
|
||||||
|
it != langs.end(); it++) {
|
||||||
|
db.createStemDb(*it);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!db.close()) {
|
if (!db.close()) {
|
||||||
LOGERR(("DbIndexer::index: error closing database in %s\n",
|
LOGERR(("DbIndexer::index: error closing database in %s\n",
|
||||||
dbdir.c_str()));
|
dbdir.c_str()));
|
||||||
@ -105,26 +117,24 @@ bool DbIndexer::index()
|
|||||||
* the actual indexing work.
|
* the actual indexing work.
|
||||||
*/
|
*/
|
||||||
FsTreeWalker::Status
|
FsTreeWalker::Status
|
||||||
indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||||
FsTreeWalker::CbFlag flg)
|
FsTreeWalker::CbFlag flg)
|
||||||
{
|
{
|
||||||
DbIndexer *me = (DbIndexer *)cdata;
|
|
||||||
|
|
||||||
// If we're changing directories, possibly adjust parameters.
|
// If we're changing directories, possibly adjust parameters.
|
||||||
if (flg == FsTreeWalker::FtwDirEnter ||
|
if (flg == FsTreeWalker::FtwDirEnter ||
|
||||||
flg == FsTreeWalker::FtwDirReturn) {
|
flg == FsTreeWalker::FtwDirReturn) {
|
||||||
me->config->setKeyDir(fn);
|
config->setKeyDir(fn);
|
||||||
return FsTreeWalker::FtwOk;
|
return FsTreeWalker::FtwOk;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check db up to date ?
|
// Check db up to date ?
|
||||||
if (!me->db.needUpdate(fn, stp)) {
|
if (!db.needUpdate(fn, stp)) {
|
||||||
LOGDEB(("indexfile: up to date: %s\n", fn.c_str()));
|
LOGDEB(("indexfile: up to date: %s\n", fn.c_str()));
|
||||||
return FsTreeWalker::FtwOk;
|
return FsTreeWalker::FtwOk;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::Doc doc;
|
Rcl::Doc doc;
|
||||||
if (!internfile(fn, me->config, doc, me->tmpdir))
|
if (!internfile(fn, config, doc, tmpdir))
|
||||||
return FsTreeWalker::FtwOk;
|
return FsTreeWalker::FtwOk;
|
||||||
|
|
||||||
// Set up common fields:
|
// Set up common fields:
|
||||||
@ -133,7 +143,7 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
|||||||
doc.mtime = ascdate;
|
doc.mtime = ascdate;
|
||||||
|
|
||||||
// Do database-specific work to update document data
|
// Do database-specific work to update document data
|
||||||
if (!me->db.add(fn, doc))
|
if (!db.add(fn, doc))
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
|
|
||||||
return FsTreeWalker::FtwOk;
|
return FsTreeWalker::FtwOk;
|
||||||
|
|||||||
@ -86,11 +86,7 @@ class myTextSplitCB : public TextSplitCB {
|
|||||||
static string plaintorich(const string &in, const list<string>& terms,
|
static string plaintorich(const string &in, const list<string>& terms,
|
||||||
list<pair<int, int> >&termoffsets)
|
list<pair<int, int> >&termoffsets)
|
||||||
{
|
{
|
||||||
{string t;
|
LOGDEB(("plaintorich: terms: %s\n", stringlistdisp(terms).c_str()));
|
||||||
for (list<string>::const_iterator it = terms.begin();
|
|
||||||
it != terms.end();it++) t += "'" + *it + "' ";
|
|
||||||
LOGDEB(("plaintorich: terms: %s\n", t.c_str()));
|
|
||||||
}
|
|
||||||
|
|
||||||
myTextSplitCB cb(terms);
|
myTextSplitCB cb(terms);
|
||||||
TextSplit splitter(&cb, true);
|
TextSplit splitter(&cb, true);
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.24 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -16,8 +16,11 @@ using namespace std;
|
|||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
#include "conftree.h"
|
#include "conftree.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "pathut.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
|
#include <xapian/stem.h>
|
||||||
|
|
||||||
// Data for a xapian database. There could actually be 2 different
|
// Data for a xapian database. There could actually be 2 different
|
||||||
// ones for indexing or query as there is not much in common.
|
// ones for indexing or query as there is not much in common.
|
||||||
@ -25,6 +28,8 @@ class Native {
|
|||||||
public:
|
public:
|
||||||
bool isopen;
|
bool isopen;
|
||||||
bool iswritable;
|
bool iswritable;
|
||||||
|
string basedir;
|
||||||
|
|
||||||
// Indexing
|
// Indexing
|
||||||
Xapian::WritableDatabase wdb;
|
Xapian::WritableDatabase wdb;
|
||||||
vector<bool> updated;
|
vector<bool> updated;
|
||||||
@ -102,9 +107,6 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
|||||||
ndb->iswritable = true;
|
ndb->iswritable = true;
|
||||||
break;
|
break;
|
||||||
case DbTrunc:
|
case DbTrunc:
|
||||||
ndb->wdb =
|
|
||||||
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
|
|
||||||
ndb->iswritable = true;
|
|
||||||
break;
|
break;
|
||||||
case DbRO:
|
case DbRO:
|
||||||
default:
|
default:
|
||||||
@ -113,6 +115,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ndb->isopen = true;
|
ndb->isopen = true;
|
||||||
|
ndb->basedir = dir;
|
||||||
return true;
|
return true;
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg();
|
ermsg = e.get_msg();
|
||||||
@ -399,17 +402,152 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Compute name of stem db for given base database and language
|
||||||
|
static string stemdbname(const string& basename, string lang)
|
||||||
|
{
|
||||||
|
string nm = basename;
|
||||||
|
path_cat(nm, string("stem_") + lang);
|
||||||
|
return nm;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Is char non-lowercase ascii ?
|
||||||
|
inline static bool
|
||||||
|
p_notlowerorutf(unsigned int c)
|
||||||
|
{
|
||||||
|
if (c < 'a' || (c > 'z' && c < 128))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create database of stem to parents associations for a given language.
|
||||||
|
* We walk the list of all terms, stem them, and create another Xapian db
|
||||||
|
* with documents indexed by a single term (the stem), and with the list of
|
||||||
|
* parent terms in the document data.
|
||||||
|
*/
|
||||||
|
bool Rcl::Db::createStemDb(const string& lang)
|
||||||
|
{
|
||||||
|
LOGDEB(("Rcl::Db::createStemDb(%s)\n", lang.c_str()));
|
||||||
|
if (pdata == 0)
|
||||||
|
return false;
|
||||||
|
Native *ndb = (Native *)pdata;
|
||||||
|
if (ndb->isopen == false || ndb->iswritable == false)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// First build the in-memory stem database:
|
||||||
|
// We walk the list of all terms, and stem each.
|
||||||
|
// If the stem is identical to the term, no need to create an entry
|
||||||
|
// Else, we add an entry to the multimap.
|
||||||
|
// At the end, we only save stem-terms associations with several terms, the
|
||||||
|
// others are not useful
|
||||||
|
multimap<string, string> assocs;
|
||||||
|
// Statistics
|
||||||
|
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
||||||
|
int stemconst=0; // Stem == term
|
||||||
|
int stemdiff=0; // Count of all different stems
|
||||||
|
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||||
|
try {
|
||||||
|
Xapian::Stem stemmer(lang);
|
||||||
|
Xapian::TermIterator it;
|
||||||
|
for (it = ndb->wdb.allterms_begin();
|
||||||
|
it != ndb->wdb.allterms_end(); it++) {
|
||||||
|
// If it has any non-lowercase 7bit char, cant be stemmable
|
||||||
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
|
if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) {
|
||||||
|
++nostem;
|
||||||
|
// LOGDEB(("stemskipped: '%s', because of 0x%x\n",
|
||||||
|
// (*it).c_str(), *sit));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
string stem = stemmer.stem_word(*it);
|
||||||
|
//cerr << "word " << *it << " stem " << stem << endl;
|
||||||
|
if (stem == *it) {
|
||||||
|
++stemconst;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
assocs.insert(pair<string,string>(stem, *it));
|
||||||
|
}
|
||||||
|
} catch (...) {
|
||||||
|
LOGERR(("Stem database build failed: no stemmer for %s ? \n",
|
||||||
|
lang.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create xapian database for stem relations
|
||||||
|
string stemdbdir = stemdbname(ndb->basedir, lang);
|
||||||
|
string ermsg = "NOERROR";
|
||||||
|
Xapian::WritableDatabase sdb;
|
||||||
|
try {
|
||||||
|
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||||
|
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
ermsg = e.get_msg();
|
||||||
|
} catch (const string &s) {
|
||||||
|
ermsg = s;
|
||||||
|
} catch (const char *s) {
|
||||||
|
ermsg = s;
|
||||||
|
} catch (...) {
|
||||||
|
ermsg = "Caught unknown exception";
|
||||||
|
}
|
||||||
|
if (ermsg != "NOERROR") {
|
||||||
|
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
|
||||||
|
stemdbdir.c_str(), ermsg.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enter pseud-docs in db. Walk the multimap, only enter
|
||||||
|
// associations where there are several parent terms
|
||||||
|
string stem;
|
||||||
|
list<string> derivs;
|
||||||
|
for (multimap<string,string>::const_iterator it = assocs.begin();
|
||||||
|
it != assocs.end(); it++) {
|
||||||
|
if (stem == it->first) {
|
||||||
|
// Staying with same stem
|
||||||
|
derivs.push_back(it->second);
|
||||||
|
// cerr << " " << it->second << endl;
|
||||||
|
} else {
|
||||||
|
// Changing stems
|
||||||
|
++stemdiff;
|
||||||
|
if (derivs.size() > 1) {
|
||||||
|
// Previous stem has multiple derivatives. Enter in db
|
||||||
|
++stemmultiple;
|
||||||
|
Xapian::Document newdocument;
|
||||||
|
newdocument.add_term(stem);
|
||||||
|
// The doc data is just parents=blank-separated-list
|
||||||
|
string record = "parents=";
|
||||||
|
for (list<string>::const_iterator it = derivs.begin();
|
||||||
|
it != derivs.end(); it++) {
|
||||||
|
record += *it + " ";
|
||||||
|
}
|
||||||
|
record += "\n";
|
||||||
|
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
|
||||||
|
newdocument.set_data(record);
|
||||||
|
try {
|
||||||
|
sdb.replace_document(stem, newdocument);
|
||||||
|
} catch (...) {
|
||||||
|
LOGERR(("Rcl::Db::createstemdb: replace failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
derivs.clear();
|
||||||
|
stem = it->first;
|
||||||
|
derivs.push_back(it->second);
|
||||||
|
// cerr << "\n" << stem << " " << it->second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||||
|
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is called at the end of an indexing session, to delete the
|
||||||
|
* documents for files that are no longer there. We also build the
|
||||||
|
* stem database while we are at it.
|
||||||
|
*/
|
||||||
bool Rcl::Db::purge()
|
bool Rcl::Db::purge()
|
||||||
{
|
{
|
||||||
LOGDEB(("Rcl::Db::purge\n"));
|
LOGDEB(("Rcl::Db::purge\n"));
|
||||||
// There seems to be problems with the document delete code, when
|
|
||||||
// we do this, the database is not actually updated. Especially,
|
|
||||||
// if we delete a bunch of docs, so that there is a hole in the
|
|
||||||
// docids at the beginning, we can't add anything (appears to work
|
|
||||||
// and does nothing). Maybe related to the exceptions below when
|
|
||||||
// trying to delete an unexistant document ?
|
|
||||||
// Flushing before trying the deletes seeems to work around the problem
|
|
||||||
|
|
||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return false;
|
return false;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
@ -418,6 +556,13 @@ bool Rcl::Db::purge()
|
|||||||
if (ndb->isopen == false || ndb->iswritable == false)
|
if (ndb->isopen == false || ndb->iswritable == false)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
// There seems to be problems with the document delete code, when
|
||||||
|
// we do this, the database is not actually updated. Especially,
|
||||||
|
// if we delete a bunch of docs, so that there is a hole in the
|
||||||
|
// docids at the beginning, we can't add anything (appears to work
|
||||||
|
// and does nothing). Maybe related to the exceptions below when
|
||||||
|
// trying to delete an unexistant document ?
|
||||||
|
// Flushing before trying the deletes seeems to work around the problem
|
||||||
ndb->wdb.flush();
|
ndb->wdb.flush();
|
||||||
for (Xapian::docid did = 1; did < ndb->updated.size(); ++did) {
|
for (Xapian::docid did = 1; did < ndb->updated.size(); ++did) {
|
||||||
if (!ndb->updated[did]) {
|
if (!ndb->updated[did]) {
|
||||||
@ -429,6 +574,7 @@ bool Rcl::Db::purge()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ndb->wdb.flush();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -446,46 +592,57 @@ class wsQData : public TextSplitCB {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
bool takeword(const std::string &term, int , int, int) {
|
bool takeword(const std::string &term, int , int, int) {
|
||||||
LOGDEB(("Takeword: %s\n", term.c_str()));
|
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
||||||
terms.push_back(term);
|
terms.push_back(term);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#include <xapian/stem.h>
|
|
||||||
|
|
||||||
// Expand term to list of all terms which expand to the same term.
|
// Expand term to list of all terms which stem to the same term.
|
||||||
// This is currently awfully inefficient as we actually stem the whole
|
|
||||||
// db term list ! Need to build an efficient structure when finishing
|
|
||||||
// indexing, but good enough for testing
|
|
||||||
static list<string> stemexpand(Native *ndb, string term, const string& lang)
|
static list<string> stemexpand(Native *ndb, string term, const string& lang)
|
||||||
{
|
{
|
||||||
list<string> explist;
|
list<string> explist;
|
||||||
try {
|
try {
|
||||||
Xapian::Stem stemmer(lang);
|
Xapian::Stem stemmer(lang);
|
||||||
string stem = stemmer.stem_word(term);
|
string stem = stemmer.stem_word(term);
|
||||||
LOGDEB(("stemexpand: term '%s' stem '%s'\n",
|
LOGDEB(("stemexpand: '%s' -> '%s'\n", term.c_str(), stem.c_str()));
|
||||||
term.c_str(), stem.c_str()));
|
// Try to fetch the doc from the stem db
|
||||||
Xapian::TermIterator it;
|
string stemdbdir = stemdbname(ndb->basedir, lang);
|
||||||
for (it = ndb->db.allterms_begin();
|
Xapian::Database sdb(stemdbdir);
|
||||||
it != ndb->db.allterms_end(); it++) {
|
LOGDEB1(("Rcl::Db::stemexpand: %s lastdocid: %d\n",
|
||||||
string stem1 = stemmer.stem_word(*it);
|
stemdbdir.c_str(), sdb.get_lastdocid()));
|
||||||
if (stem == stem1)
|
if (!sdb.term_exists(stem)) {
|
||||||
explist.push_back(*it);
|
LOGDEB1(("Rcl::Db::stemexpand: no term for %s\n", stem.c_str()));
|
||||||
}
|
|
||||||
if (explist.size() == 0)
|
|
||||||
explist.push_back(term);
|
explist.push_back(term);
|
||||||
if (1) {
|
return explist;
|
||||||
string expanded;
|
|
||||||
for (list<string>::const_iterator it = explist.begin();
|
|
||||||
it != explist.end(); it++) {
|
|
||||||
expanded += *it + " ";
|
|
||||||
}
|
|
||||||
LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str()));
|
|
||||||
}
|
}
|
||||||
|
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
||||||
|
if (did == sdb.postlist_end(stem)) {
|
||||||
|
LOGDEB1(("Rcl::Db::stemexpand: no term(1) for %s\n",stem.c_str()));
|
||||||
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
Xapian::Document doc = sdb.get_document(*did);
|
||||||
|
string data = doc.get_data();
|
||||||
|
// No need for a conftree, but we need to massage the data a little
|
||||||
|
string::size_type pos = data.find_first_of("=");
|
||||||
|
++pos;
|
||||||
|
string::size_type pos1 = data.find_last_of("\n");
|
||||||
|
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
||||||
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
|
}
|
||||||
|
ConfTree::stringToStrings(data.substr(pos, pos1-pos), explist);
|
||||||
|
if (find(explist.begin(), explist.end(), term) == explist.end()) {
|
||||||
|
explist.push_back(term);
|
||||||
|
}
|
||||||
|
LOGDEB(("Rcl::Db::stemexpand: %s -> %s\n", stem.c_str(),
|
||||||
|
stringlistdisp(explist).c_str()));
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
LOGERR(("Stemming failed: no stemmer for %s ? \n", lang.c_str()));
|
LOGERR(("stemexpand: error accessing stem db\n"));
|
||||||
explist.push_back(term);
|
explist.push_back(term);
|
||||||
|
return explist;
|
||||||
}
|
}
|
||||||
return explist;
|
return explist;
|
||||||
}
|
}
|
||||||
@ -519,7 +676,8 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts,
|
|||||||
wsQData splitData;
|
wsQData splitData;
|
||||||
TextSplit splitter(&splitData, true);
|
TextSplit splitter(&splitData, true);
|
||||||
splitter.text_to_words(*it);
|
splitter.text_to_words(*it);
|
||||||
LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
|
LOGDEB1(("Rcl::Db::setquery: splitter term count: %d\n",
|
||||||
|
splitData.terms.size()));
|
||||||
switch(splitData.terms.size()) {
|
switch(splitData.terms.size()) {
|
||||||
case 0: continue;// ??
|
case 0: continue;// ??
|
||||||
case 1: {
|
case 1: {
|
||||||
@ -578,7 +736,7 @@ int Rcl::Db::getResCnt()
|
|||||||
|
|
||||||
bool Rcl::Db::getDoc(int i, Doc &doc, int *percent)
|
bool Rcl::Db::getDoc(int i, Doc &doc, int *percent)
|
||||||
{
|
{
|
||||||
LOGDEB(("Rcl::Db::getDoc: %d\n", i));
|
LOGDEB1(("Rcl::Db::getDoc: %d\n", i));
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
if (!ndb || !ndb->enquire) {
|
if (!ndb || !ndb->enquire) {
|
||||||
LOGERR(("Rcl::Db::getDoc: no query opened\n"));
|
LOGERR(("Rcl::Db::getDoc: no query opened\n"));
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.11 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.12 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -72,6 +72,7 @@ class Db {
|
|||||||
bool add(const string &filename, const Doc &doc);
|
bool add(const string &filename, const Doc &doc);
|
||||||
bool needUpdate(const string &filename, const struct stat *stp);
|
bool needUpdate(const string &filename, const struct stat *stp);
|
||||||
bool purge();
|
bool purge();
|
||||||
|
bool createStemDb(const string &lang);
|
||||||
|
|
||||||
// Query-related functions
|
// Query-related functions
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.5 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#ifndef TEST_EXECMD
|
#ifndef TEST_EXECMD
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
@ -131,7 +131,7 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
|||||||
close(pipeout[0]);
|
close(pipeout[0]);
|
||||||
if (pipeout[1] >= 0)
|
if (pipeout[1] >= 0)
|
||||||
close(pipeout[1]);
|
close(pipeout[1]);
|
||||||
LOGDEB(("ExecCmd::doexec: father got status 0x%x\n", status));
|
LOGDEB1(("ExecCmd::doexec: father got status 0x%x\n", status));
|
||||||
return status;
|
return status;
|
||||||
} else {
|
} else {
|
||||||
if (input) {
|
if (input) {
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.2 2004-12-12 08:58:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef TEST_FSTREEWALK
|
#ifndef TEST_FSTREEWALK
|
||||||
@ -53,8 +53,8 @@ int FsTreeWalker::getErrCnt()
|
|||||||
return data->errors;
|
return data->errors;
|
||||||
}
|
}
|
||||||
|
|
||||||
FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
|
FsTreeWalker::Status FsTreeWalker::walk(const string &top,
|
||||||
void *cdata)
|
FsTreeWalkerCB& cb)
|
||||||
{
|
{
|
||||||
Status status = FtwOk;
|
Status status = FtwOk;
|
||||||
struct stat st;
|
struct stat st;
|
||||||
@ -68,12 +68,12 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
|
|||||||
return FtwError;
|
return FtwError;
|
||||||
}
|
}
|
||||||
if (S_ISDIR(st.st_mode)) {
|
if (S_ISDIR(st.st_mode)) {
|
||||||
if ((status = fun(cdata, top, &st, FtwDirEnter)) &
|
if ((status = cb.processone(top, &st, FtwDirEnter)) &
|
||||||
(FtwStop|FtwError)) {
|
(FtwStop|FtwError)) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
} else if (S_ISREG(st.st_mode)) {
|
} else if (S_ISREG(st.st_mode)) {
|
||||||
return fun(cdata, top, &st, FtwRegular);
|
return cb.processone(top, &st, FtwRegular);
|
||||||
} else {
|
} else {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
@ -110,17 +110,17 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
|
|||||||
}
|
}
|
||||||
if (S_ISDIR(st.st_mode)) {
|
if (S_ISDIR(st.st_mode)) {
|
||||||
if (data->options & FtwNoRecurse) {
|
if (data->options & FtwNoRecurse) {
|
||||||
status = fun(cdata, fn, &st, FtwDirEnter);
|
status = cb.processone(fn, &st, FtwDirEnter);
|
||||||
} else {
|
} else {
|
||||||
status=walk(fn, fun, cdata);
|
status=walk(fn, cb);
|
||||||
}
|
}
|
||||||
if (status & (FtwStop|FtwError))
|
if (status & (FtwStop|FtwError))
|
||||||
goto out;
|
goto out;
|
||||||
if ((status = fun(cdata, top, &st, FtwDirReturn))
|
if ((status = cb.processone(top, &st, FtwDirReturn))
|
||||||
& (FtwStop|FtwError))
|
& (FtwStop|FtwError))
|
||||||
goto out;
|
goto out;
|
||||||
} else if (S_ISREG(st.st_mode)) {
|
} else if (S_ISREG(st.st_mode)) {
|
||||||
if ((status = fun(cdata, fn, &st, FtwRegular)) &
|
if ((status = cb.processone(fn, &st, FtwRegular)) &
|
||||||
(FtwStop|FtwError)) {
|
(FtwStop|FtwError)) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
@ -143,17 +143,22 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, CbType fun,
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
FsTreeWalker::Status walkfunc(void *, const string &path,
|
class myCB : public FsTreeWalkerCB {
|
||||||
const struct stat *st,
|
public:
|
||||||
FsTreeWalker::CbFlag flg)
|
FsTreeWalker::Status processone(const string &path,
|
||||||
{
|
const struct stat *st,
|
||||||
if (flg == FsTreeWalker::FtwDirEnter) {
|
FsTreeWalker::CbFlag flg)
|
||||||
cout << "[Entering " << path << "]" << endl;
|
{
|
||||||
} else if (flg == FsTreeWalker::FtwRegular) {
|
if (flg == FsTreeWalker::FtwDirEnter) {
|
||||||
cout << path << endl;
|
cout << "[Entering " << path << "]" << endl;
|
||||||
|
} else if (flg == FsTreeWalker::FtwDirReturn) {
|
||||||
|
cout << "[Returning to " << path << "]" << endl;
|
||||||
|
} else if (flg == FsTreeWalker::FtwRegular) {
|
||||||
|
cout << path << endl;
|
||||||
|
}
|
||||||
|
return FsTreeWalker::FtwOk;
|
||||||
}
|
}
|
||||||
return FsTreeWalker::FtwOk;
|
};
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, const char **argv)
|
int main(int argc, const char **argv)
|
||||||
{
|
{
|
||||||
@ -162,7 +167,8 @@ int main(int argc, const char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
FsTreeWalker walker;
|
FsTreeWalker walker;
|
||||||
walker.walk(argv[1], walkfunc, 0);
|
myCB cb;
|
||||||
|
walker.walk(argv[1], cb);
|
||||||
if (walker.getErrCnt() > 0)
|
if (walker.getErrCnt() > 0)
|
||||||
cout << walker.getReason();
|
cout << walker.getReason();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,9 +1,14 @@
|
|||||||
#ifndef _FSTREEWALK_H_INCLUDED_
|
#ifndef _FSTREEWALK_H_INCLUDED_
|
||||||
#define _FSTREEWALK_H_INCLUDED_
|
#define _FSTREEWALK_H_INCLUDED_
|
||||||
/* @(#$Id: fstreewalk.h,v 1.1 2004-12-10 18:13:13 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: fstreewalk.h,v 1.2 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
using std::string;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
class FsTreeWalkerCB;
|
||||||
|
|
||||||
class FsTreeWalker {
|
class FsTreeWalker {
|
||||||
public:
|
public:
|
||||||
@ -12,12 +17,9 @@ class FsTreeWalker {
|
|||||||
FtwStatAll = FtwError|FtwStop};
|
FtwStatAll = FtwError|FtwStop};
|
||||||
enum Options {FtwOptNone = 0, FtwNoRecurse = 1, FtwFollow = 2};
|
enum Options {FtwOptNone = 0, FtwNoRecurse = 1, FtwFollow = 2};
|
||||||
|
|
||||||
typedef Status (*CbType)(void *cdata,
|
|
||||||
const std::string &, const struct stat *, CbFlag);
|
|
||||||
|
|
||||||
FsTreeWalker(Options opts = FtwOptNone);
|
FsTreeWalker(Options opts = FtwOptNone);
|
||||||
~FsTreeWalker();
|
~FsTreeWalker();
|
||||||
Status walk(const std::string &dir, CbType fun, void *cdata);
|
Status walk(const std::string &dir, FsTreeWalkerCB& cb);
|
||||||
std::string getReason();
|
std::string getReason();
|
||||||
int getErrCnt();
|
int getErrCnt();
|
||||||
private:
|
private:
|
||||||
@ -25,4 +27,11 @@ class FsTreeWalker {
|
|||||||
Internal *data;
|
Internal *data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class FsTreeWalkerCB {
|
||||||
|
public:
|
||||||
|
virtual ~FsTreeWalkerCB() {}
|
||||||
|
virtual FsTreeWalker::Status
|
||||||
|
processone(const string &, const struct stat *, FsTreeWalker::CbFlag)
|
||||||
|
= 0;
|
||||||
|
};
|
||||||
#endif /* _FSTREEWALK_H_INCLUDED_ */
|
#endif /* _FSTREEWALK_H_INCLUDED_ */
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: smallut.cpp,v 1.4 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#ifndef TEST_SMALLUT
|
#ifndef TEST_SMALLUT
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -48,6 +48,17 @@ bool maketmpdir(string& tdir)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string stringlistdisp(const list<string>& sl)
|
||||||
|
{
|
||||||
|
string s;
|
||||||
|
for (list<string>::const_iterator it = sl.begin(); it!= sl.end(); it++)
|
||||||
|
s += "[" + *it + "] ";
|
||||||
|
if (!s.empty())
|
||||||
|
s.erase(s.length()-1);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int stringicmp(const string & s1, const string& s2)
|
int stringicmp(const string & s1, const string& s2)
|
||||||
{
|
{
|
||||||
string::const_iterator it1 = s1.begin();
|
string::const_iterator it1 = s1.begin();
|
||||||
|
|||||||
@ -1,14 +1,16 @@
|
|||||||
#ifndef _SMALLUT_H_INCLUDED_
|
#ifndef _SMALLUT_H_INCLUDED_
|
||||||
#define _SMALLUT_H_INCLUDED_
|
#define _SMALLUT_H_INCLUDED_
|
||||||
/* @(#$Id: smallut.h,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: smallut.h,v 1.4 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <list>
|
||||||
using std::string;
|
using std::string;
|
||||||
|
using std::list;
|
||||||
|
|
||||||
extern int stringicmp(const string& s1, const string& s2);
|
extern int stringicmp(const string& s1, const string& s2);
|
||||||
extern int stringlowercmp(const string& alreadylower, const string& s2);
|
extern int stringlowercmp(const string& alreadylower, const string& s2);
|
||||||
extern int stringuppercmp(const string& alreadyupper, const string& s2);
|
extern int stringuppercmp(const string& alreadyupper, const string& s2);
|
||||||
|
|
||||||
extern bool maketmpdir(string& tdir);
|
extern bool maketmpdir(string& tdir);
|
||||||
|
extern string stringlistdisp(const list<string>& strs);
|
||||||
|
|
||||||
#endif /* _SMALLUT_H_INCLUDED_ */
|
#endif /* _SMALLUT_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user