allow independent creation / deletion of stem dbs

This commit is contained in:
dockes 2006-01-09 16:53:31 +00:00
parent c4ce5cf691
commit dac569ab51
9 changed files with 364 additions and 122 deletions

View File

@ -1,29 +1,29 @@
#*
*.cache
*.core
*.o *.o
*~ *~
*.core .#*
*.cache .#*
#*
.moc .moc
.obj .obj
.ui .ui
.#*
CVS CVS
alldeps
.#*
autom4*
TAGS TAGS
alldeps
autom4*
config.cache config.cache
config.log config.log
config.status config.status
excludefile excludefile
lib/librcl.a
makesrcdist.sh makesrcdist.sh
recollinstall
mk/localdefs mk/localdefs
sysconf
qtgui/Makefile qtgui/Makefile
qtgui/preview/Makefile qtgui/preview/Makefile
qtgui/preview/preview.pro qtgui/preview/preview.pro
qtgui/preview/pvmain.cpp qtgui/preview/pvmain.cpp
lib/librcl.a recollinstall
sampleconf/recoll.conf sampleconf/recoll.conf
sysconf
wxgui wxgui

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
#include <iostream> #include <iostream>
#include <list> #include <list>
#include <map> #include <map>
#include <algorithm>
#include "pathut.h" #include "pathut.h"
#include "conftree.h" #include "conftree.h"
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
// filesystem anymore. // filesystem anymore.
db.purge(); db.purge();
// Create stemming databases // Create stemming databases. We also remove those which are not
// configured.
string slangs; string slangs;
if (config->getConfParam("indexstemminglanguages", slangs)) { if (config->getConfParam("indexstemminglanguages", slangs)) {
list<string> langs; list<string> langs;
stringToStrings(slangs, langs); stringToStrings(slangs, langs);
for (list<string>::const_iterator it = langs.begin();
it != langs.end(); it++) { // Get the list of existing stem dbs from the database (some may have
// been manually created, we just keep those from the config
list<string> dblangs = db.getStemLangs();
list<string>::const_iterator it;
for (it = dblangs.begin(); it != dblangs.end(); it++) {
if (find(langs.begin(), langs.end(), *it) == langs.end())
db.deleteStemDb(*it);
}
for (it = langs.begin(); it != langs.end(); it++) {
db.createStemDb(*it); db.createStemDb(*it);
} }
} }
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
return true; return true;
} }
/**
 * Create the stem database for one language.
 * init() is run first; if it fails nothing else is attempted.
 * @param lang stemming language name, passed on to the database object
 * @return false if init() or the database-level createStemDb() fails
 */
bool DbIndexer::createStemDb(const string &lang)
{
    // && short-circuits: db.createStemDb() is only reached when init()
    // returned true.
    return init() && db.createStemDb(lang);
}
/**
Index individual files, out of a full tree run. No database purging
*/
bool DbIndexer::indexFiles(const list<string> &filenames) bool DbIndexer::indexFiles(const list<string> &filenames)
{ {
if (!init()) if (!init())

View File

@ -1,6 +1,6 @@
#ifndef _INDEXER_H_INCLUDED_ #ifndef _INDEXER_H_INCLUDED_
#define _INDEXER_H_INCLUDED_ #define _INDEXER_H_INCLUDED_
/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -24,7 +24,9 @@ class DbIndexer;
class ConfIndexer { class ConfIndexer {
public: public:
enum runStatus {IndexerOk, IndexerError}; enum runStatus {IndexerOk, IndexerError};
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {} ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0)
{
}
virtual ~ConfIndexer(); virtual ~ConfIndexer();
/** Worker function: doe the actual indexing */ /** Worker function: doe the actual indexing */
bool index(bool resetbefore = false); bool index(bool resetbefore = false);
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
/** Index a list of files. No db cleaning or stemdb updating */ /** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<std::string> &files); bool indexFiles(const std::list<std::string> &files);
/** Create stem database for given language */
bool createStemDb(const string &lang);
/** Tree walker callback method */ /** Tree walker callback method */
FsTreeWalker::Status FsTreeWalker::Status
processone(const std::string &, const struct stat *, processone(const std::string &, const struct stat *,

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
@ -19,10 +19,12 @@ using namespace std;
#include "pathut.h" #include "pathut.h"
// Globals for exit cleanup
ConfIndexer *confindexer; ConfIndexer *confindexer;
DbIndexer *dbindexer; DbIndexer *dbindexer;
bool indexfiles(RclConfig *config, const list<string> &filenames) // Index a list of files
static bool indexfiles(RclConfig *config, const list<string> &filenames)
{ {
if (filenames.empty()) if (filenames.empty())
return true; return true;
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
return dbindexer->indexFiles(filenames); return dbindexer->indexFiles(filenames);
} }
// Create an additional stem database for language <lang>, using the
// database directory found in the configuration ("dbdir" parameter).
// The indexer object is kept in the global dbindexer pointer.
static bool createstemdb(RclConfig *config, const string &lang)
{
    // Note that we do not bother to check for multiple databases,
    // which are currently a fiction anyway.
    string confdir;
    const bool havedir = config->getConfParam("dbdir", confdir);
    if (!havedir) {
	LOGERR(("createstemdb: no database directory in configuration\n"));
	return false;
    }

    // The configured value may start with a tilde: expand it before use
    string dbdir = path_tildexpand(confdir);
    dbindexer = new DbIndexer(config, dbdir);
    return dbindexer->createStemDb(lang);
}
static void cleanup() static void cleanup()
{ {
delete confindexer; delete confindexer;
@ -63,15 +80,19 @@ static int op_flags;
#define OPT_z 0x2 #define OPT_z 0x2
#define OPT_h 0x4 #define OPT_h 0x4
#define OPT_i 0x8 #define OPT_i 0x8
#define OPT_s 0x10
static const char usage [] = static const char usage [] =
"\n"
"recollindex [-hz] \n" "recollindex [-hz] \n"
" Normal index run\n"
"recollindex -i <filename [filename ...]>\n" "recollindex -i <filename [filename ...]>\n"
" Index individual files. No db purge or stem database updates\n"
"recollindex -s <lang>\n"
" Build stem database for language <lang>\n"
"Options:\n" "Options:\n"
" -h : print this message\n" " -h : print this message\n"
" -z : reset database before starting indexation\n\n" " -z : reset database before starting indexation\n\n"
" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
" database updates in this case\n"
; ;
static void static void
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
case 'z': op_flags |= OPT_z; break; case 'z': op_flags |= OPT_z; break;
case 'h': op_flags |= OPT_h; break; case 'h': op_flags |= OPT_h; break;
case 'i': op_flags |= OPT_i; break; case 'i': op_flags |= OPT_i; break;
case 's': op_flags |= OPT_s; break;
default: Usage(); break; default: Usage(); break;
} }
b1: argc--; argv++; b1: argc--; argv++;
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
string reason; string reason;
RclConfig *config = recollinit(cleanup, sigcleanup, reason); RclConfig *config = recollinit(cleanup, sigcleanup, reason);
if (config == 0 || !config->ok()) { if (config == 0 || !config->ok()) {
cerr << "Configuration problem: " << reason << endl; cerr << "Configuration problem: " << reason << endl;
exit(1); exit(1);
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
} }
} }
exit(!indexfiles(config, filenames)); exit(!indexfiles(config, filenames));
} else if (op_flags & OPT_s) {
if (argc != 1)
Usage();
string lang = *argv++; argc--;
exit(!createstemdb(config, lang));
} else { } else {
confindexer = new ConfIndexer(config); confindexer = new ConfIndexer(config);
bool rezero(op_flags & OPT_z); bool rezero(op_flags & OPT_z);

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -23,6 +23,7 @@ using namespace std;
#include "smallut.h" #include "smallut.h"
#include "pathhash.h" #include "pathhash.h"
#include "utf8iter.h" #include "utf8iter.h"
#include "wipedir.h"
#include "xapian.h" #include "xapian.h"
#include <xapian/stem.h> #include <xapian/stem.h>
@ -67,23 +68,24 @@ Rcl::Db::~Db()
ndb->iswritable)); ndb->iswritable));
if (ndb->isopen == false) if (ndb->isopen == false)
return; return;
string ermsg; const char *ermsg = "Unknown error";
try { try {
LOGDEB(("Rcl::Db::~Db: closing native database\n")); LOGDEB(("Rcl::Db::~Db: closing native database\n"));
if (ndb->iswritable == true) if (ndb->iswritable == true) {
ndb->wdb.flush(); ndb->wdb.flush();
}
delete ndb; delete ndb;
return; return;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str())); LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
} }
bool Rcl::Db::open(const string& dir, OpenMode mode) bool Rcl::Db::open(const string& dir, OpenMode mode)
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
LOGERR(("Rcl::Db::open: already open\n")); LOGERR(("Rcl::Db::open: already open\n"));
return false; return false;
} }
string ermsg; const char *ermsg = "Unknown";
try { try {
switch (mode) { switch (mode) {
case DbUpd: case DbUpd:
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
ndb->basedir = dir; ndb->basedir = dir;
return true; return true;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n", LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
dir.c_str(), ermsg.c_str())); dir.c_str(), ermsg));
return false; return false;
} }
@ -148,7 +150,7 @@ bool Rcl::Db::close()
ndb->iswritable)); ndb->iswritable));
if (ndb->isopen == false) if (ndb->isopen == false)
return true; return true;
string ermsg; const char *ermsg = "Unknown";
try { try {
if (ndb->iswritable == true) { if (ndb->iswritable == true) {
ndb->wdb.flush(); ndb->wdb.flush();
@ -159,16 +161,15 @@ bool Rcl::Db::close()
if (pdata) if (pdata)
return true; return true;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
ermsg.c_str()));
return false; return false;
} }
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
// Callback for the document to word splitting class during indexation // Callback for the document to word splitting class during indexation
bool mySplitterCB::takeword(const std::string &term, int pos, int, int) bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
{ {
// cerr << "splitCb: term " << term << endl; #if 0
//string printable; LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
//transcode(term, printable, "UTF-8", "ISO-8859-1"); string printable;
//cerr << "Adding " << printable << endl; if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
LOGDEB((" [%s]\n", printable.c_str()));
}
#endif
const char *ermsg;
try { try {
// 1 is the value for wdfinc in index_text when called from omindex // Note: 1 is the within document frequency increment. It would
// TOBEDONE: check what this is used for // be possible to assign different weigths to doc parts (ie title)
// by using a higher value
curpos = pos; curpos = pos;
doc.add_posting(term, basepos + curpos, 1); doc.add_posting(term, basepos + curpos, 1);
} catch (...) {
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
return false;
}
return true; return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) {
ermsg= "Unknown error";
}
LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
return false;
} }
// Unaccent and lowercase data, replace \n\r with spaces // Unaccent and lowercase data, replace \n\r with spaces
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
return true; return true;
} }
/* omindex direct */ /* From omindex direct */
/* Truncate a string to a given maxlength, avoiding cutting off midword /* Truncate a string to a given maxlength, avoiding cutting off midword
* if reasonably possible. */ * if reasonably possible. */
string string
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
output += " ..."; output += " ...";
} }
// No need to replace newlines with spaces, we do this in dumb_string()
// replace newlines with spaces
size_t i = 0;
while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
return output; return output;
} }
// Truncate longer path and uniquize with hash . The goad for this is // Truncate longer path and uniquize with hash . The goal for this is
// to avoid xapian max term length limitations, not to gain space (we // to avoid xapian max term length limitations, not to gain space (we
// gain very little even with very short maxlens like 30) // gain very little even with very short maxlens like 30)
#define HASHPATH
#define PATHHASHLEN 150 #define PATHHASHLEN 150
// Add document in internal form to the database: index the terms in // Add document in internal form to the database: index the terms in
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Split and index file name. This supposes that it's either ascii // Split and index file name. This supposes that it's either ascii
// or utf-8. If this fails, we just go on. We need a config // or utf-8. If this fails, we just go on. We need a config
// parameter for file name charset // parameter for file name charset.
// Do we really want to fold case here ?
if (dumb_string(fn, noacc)) { if (dumb_string(fn, noacc)) {
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split body and index terms // Split and index body
if (!dumb_string(doc.text, noacc)) { if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n")); LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false; return false;
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split keywords and index terms // Split and index keywords
if (!dumb_string(doc.keywords, noacc)) { if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n")); LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false; return false;
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split abstract and index terms // Split and index abstract
if (!dumb_string(doc.abstract, noacc)) { if (!dumb_string(doc.abstract, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n")); LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false; return false;
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Path name // Path name
string hash; string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN); pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str())); LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
string pathterm = "P" + hash; string pathterm = "P" + hash;
newdocument.add_term(pathterm); newdocument.add_term(pathterm);
// File path + internal path: document unique identifier for // Internal path: with path, makes unique identifier for documents
// documents inside multidocument files. // inside multidocument files.
string uniterm; string uniterm;
if (!doc.ipath.empty()) { if (!doc.ipath.empty()) {
uniterm = "Q" + hash + "|" + doc.ipath; uniterm = "Q" + hash + "|" + doc.ipath;
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
string record = "url=file://" + fn; string record = "url=file://" + fn;
record += "\nmtype=" + doc.mimetype; record += "\nmtype=" + doc.mimetype;
record += "\nfmtime=" + doc.fmtime; record += "\nfmtime=" + doc.fmtime;
if (!doc.dmtime.empty()) if (!doc.dmtime.empty()) {
record += "\ndmtime=" + doc.dmtime; record += "\ndmtime=" + doc.dmtime;
}
record += "\norigcharset=" + doc.origcharset; record += "\norigcharset=" + doc.origcharset;
record += "\ncaption=" + doc.title; record += "\ncaption=" + doc.title;
record += "\nkeywords=" + doc.keywords; record += "\nkeywords=" + doc.keywords;
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
record += "\nipath=" + doc.ipath; record += "\nipath=" + doc.ipath;
} }
record += "\n"; record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str())); LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record); newdocument.set_data(record);
const char *fnc = fn.c_str(); const char *fnc = fn.c_str();
// Add db entry or update existing entry: // Add db entry or update existing entry:
try { try {
Xapian::docid did = Xapian::docid did =
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
} }
} catch (...) { } catch (...) {
// FIXME: is this ever actually needed? // FIXME: is this ever actually needed?
try {
ndb->wdb.add_document(newdocument); ndb->wdb.add_document(newdocument);
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
fnc)); fnc));
} catch (...) {
LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
return false;
}
} }
return true; return true;
} }
// Test if given filename has changed since last indexed:
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
{ {
if (pdata == 0) if (pdata == 0)
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// If no document exist with this path, we do need update // If no document exist with this path, we do need update
string hash; string hash;
#ifdef HASHPATH
pathHash(filename, hash, PATHHASHLEN); pathHash(filename, hash, PATHHASHLEN);
#else
hash = filename;
#endif
string pathterm = "P" + hash; string pathterm = "P" + hash;
if (!ndb->wdb.term_exists(pathterm)) { const char *ermsg;
LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
return true;
}
// Look for all documents with this path. We need to look at all // Look for all documents with this path. We need to look at all
// to set their existence flag. We check the update time on the // to set their existence flag. We check the update time on the
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// file changed) // file changed)
Xapian::PostingIterator doc; Xapian::PostingIterator doc;
try { try {
if (!ndb->wdb.term_exists(pathterm)) {
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
return true;
}
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm); Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
for (Xapian::PostingIterator docid = docid0; for (Xapian::PostingIterator docid = docid0;
docid != ndb->wdb.postlist_end(pathterm); docid++) { docid != ndb->wdb.postlist_end(pathterm); docid++) {
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
if (*docid < ndb->updated.size()) if (*docid < ndb->updated.size())
ndb->updated[*docid] = true; ndb->updated[*docid] = true;
} }
return false;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) { } catch (...) {
ermsg= "Unknown error";
}
LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
return true; return true;
} }
return false; const static string stemdirstem = "stem_";
}
/// Compute name of stem db for given base database and language:
/// the stem directory prefix followed by the language name, joined to
/// the base database path with path_cat().
static string stemdbname(const string& basename, string lang)
{
    const string leaf = stemdirstem + lang;
    return path_cat(basename, leaf);
}
// Is char non-lowercase ascii ? // Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool inline static bool
p_notlowerorutf(unsigned int c) p_notlowerorutf(unsigned int c)
{ {
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
return false; return false;
} }
/**
* Delete stem db for given language
*/
bool Rcl::Db::deleteStemDb(const string& lang)
{
LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false)
return false;
string dir = stemdbname(ndb->basedir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
/** /**
* Create database of stem to parents associations for a given language. * Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db * We walk the list of all terms, stem them, and create another Xapian db
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
if (pdata == 0) if (pdata == 0)
return false; return false;
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;
if (ndb->isopen == false || ndb->iswritable == false) if (ndb->isopen == false)
return false; return false;
// First build the in-memory stem database: // First build the in-memory stem database:
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
} }
assocs.insert(pair<string,string>(stem, *it)); assocs.insert(pair<string,string>(stem, *it));
} }
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) { } catch (...) {
LOGERR(("Stem database build failed: no stemmer for %s ? \n", LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str())); lang.c_str()));
return false; return false;
} }
class DirWiper {
public:
string dir;
bool do_it;
DirWiper(string d) : dir(d), do_it(true) {}
~DirWiper() {
if (do_it) {
wipedir(dir);
rmdir(dir.c_str());
}
}
};
// Create xapian database for stem relations // Create xapian database for stem relations
string stemdbdir = stemdbname(ndb->basedir, lang); string stemdbdir = stemdbname(ndb->basedir, lang);
string ermsg = "NOERROR"; // We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb; Xapian::WritableDatabase sdb;
try { try {
sdb = Xapian::WritableDatabase(stemdbdir, sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE); Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
} }
if (ermsg != "NOERROR") { if (ermsg != "NOERROR") {
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n", LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
stemdbdir.c_str(), ermsg.c_str())); stemdbdir.c_str(), ermsg));
return false; return false;
} }
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
} }
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true; return true;
} }
/**
 * Get the list of languages for which a stem database exists.
 *
 * The base database directory is searched for entries matching the stem
 * directory prefix followed by anything; each result is reduced to its
 * last path element and the prefix is stripped, leaving the language name.
 *
 * @return the language names; an empty list if the database is not
 *         initialized or not open.
 */
list<string> Rcl::Db::getStemLangs()
{
    list<string> langs;
    LOGDEB(("Rcl::Db::getStemLang\n"));
    if (pdata == 0)
	return langs;
    Native *ndb = (Native *)pdata;
    // Same guard as deleteStemDb() and createStemDb(): basedir is assigned
    // by open(), so do not glob on it when the database is not open.
    if (ndb->isopen == false)
	return langs;

    const string pattern = stemdirstem + "*";
    langs = path_dirglob(ndb->basedir, pattern);
    for (list<string>::iterator it = langs.begin(); it != langs.end(); ++it) {
	// Keep the last path element only, then drop the directory prefix
	const string leaf = path_basename(*it);
	*it = leaf.substr(stemdirstem.length(), string::npos);
    }
    return langs;
}
/** /**
* This is called at the end of an indexing session, to delete the * This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the * documents for files that are no longer there. We also build the
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
// and does nothing). Maybe related to the exceptions below when // and does nothing). Maybe related to the exceptions below when
// trying to delete an unexistant document ? // trying to delete an unexistant document ?
// Flushing before trying the deletes seeems to work around the problem // Flushing before trying the deletes seeems to work around the problem
try {
ndb->wdb.flush(); ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
}
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) { for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
if (!ndb->updated[docid]) { if (!ndb->updated[docid]) {
try { try {
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
} }
} }
} }
try {
ndb->wdb.flush(); ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
}
return true; return true;
} }
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
}; };
//
// Turn string into list of xapian queries. There is little // Turn string into list of xapian queries. There is little
// interpretation done on the string (no +term -term or filename:term // interpretation done on the string (no +term -term or filename:term
// stuff). We just separate words and phrases, and interpret // stuff). We just separate words and phrases, and interpret
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;
string hash; string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN); pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
string pathterm = "P" + hash; string pathterm = "P" + hash;
// Look for all documents with this path, searching for the one
// with the appropriate ipath. This is very inefficient.
const char *ermsg = "";
try {
if (!ndb->db.term_exists(pathterm)) { if (!ndb->db.term_exists(pathterm)) {
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
pathterm.c_str(), pathterm.length())); pathterm.c_str(), pathterm.length()));
return false; return false;
} }
// Look for all documents with this path, searching for the one
// with the appropriate ipath. This is very inefficient.
try {
for (Xapian::PostingIterator docid = for (Xapian::PostingIterator docid =
ndb->db.postlist_begin(pathterm); ndb->db.postlist_begin(pathterm);
docid != ndb->db.postlist_end(pathterm); docid++) { docid != ndb->db.postlist_end(pathterm); docid++) {
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath) if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
return true; return true;
} }
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) { } catch (...) {
return false; ermsg = "Caught unknown exception";
}
if (*ermsg) {
LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
} }
return false; return false;
} }

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -102,6 +102,7 @@ public:
bool needUpdate(const string &filename, const struct stat *stp); bool needUpdate(const string &filename, const struct stat *stp);
bool purge(); bool purge();
bool createStemDb(const string &lang); bool createStemDb(const string &lang);
bool deleteStemDb(const string &lang);
// Query-related functions // Query-related functions
@ -127,6 +128,10 @@ public:
/** Get results count for current query */ /** Get results count for current query */
int getResCnt(); int getResCnt();
/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();
/** Things we don't want to have here. */
friend class Rcl::DbPops; friend class Rcl::DbPops;
private: private:

View File

@ -15,7 +15,7 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \ $(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
-DTEST_FSTREEWALK fstreewalk.cpp -DTEST_FSTREEWALK fstreewalk.cpp
PATHUT_OBJS= trpathut.o pathut.o PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB)
trpathut : $(PATHUT_OBJS) trpathut : $(PATHUT_OBJS)
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS) $(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h trpathut.o : pathut.cpp pathut.h

View File

@ -1,15 +1,21 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_PATHUT #ifndef TEST_PATHUT
#include <unistd.h> #include <unistd.h>
#include <sys/param.h>
#include <pwd.h> #include <pwd.h>
#include <iostream> #include <iostream>
#include <list>
#include <stack>
#include "pathut.h" #include "pathut.h"
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
using std::string; using std::string;
using std::list;
using std::stack;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
void path_catslash(std::string &s) { void path_catslash(std::string &s) {
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
return simple; return simple;
} }
/**
 * Return the simple file name for a path (as computed by path_getsimple()),
 * with the given suffix removed when the name ends with it.
 *
 * @param s    input path
 * @param suff suffix to strip; nothing is stripped if it is empty or if
 *             the simple name is not strictly longer than the suffix
 */
string path_basename(const string &s, const string &suff)
{
    const string simple = path_getsimple(s);
    const string::size_type sufflen = suff.length();

    // No suffix, or name too short to hold suffix plus something else
    if (sufflen == 0 || simple.length() <= sufflen)
	return simple;

    // Strip only when the suffix sits exactly at the end of the name
    const string::size_type stemlen = simple.length() - sufflen;
    if (simple.compare(stemlen, sufflen, suff) == 0)
	return simple.substr(0, stemlen);
    return simple;
}
string path_home() string path_home()
{ {
uid_t uid = getuid(); uid_t uid = getuid();
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
return o; return o;
} }
#include <smallut.h>
// Build a canonical absolute form of the input path, working on the text
// of the path only inside this function:
//  - an empty input is returned as is;
//  - a path not starting with '/' is first appended to the current
//    working directory (an empty string is returned if getcwd() fails);
//  - the result is split on '/' through stringToTokens(), empty and "."
//    elements are dropped, and each ".." removes the previously kept
//    element (a ".." with nothing left to remove is ignored);
//  - the kept elements are joined back, each prefixed with '/'. If none
//    is left, "/" is returned.
extern std::string path_canon(const std::string &is)
{
    if (is.empty())
        return is;

    string abspath(is);
    if (abspath[0] != '/') {
        // Relative path: anchor it on the current directory
        char cwd[MAXPATHLEN];
        if (getcwd(cwd, MAXPATHLEN) == 0)
            return "";
        abspath = path_cat(string(cwd), abspath);
    }

    list<string> tokens;
    stringToTokens(abspath, tokens, "/");

    // Elements which survive the "." / ".." processing, in order
    list<string> kept;
    list<string>::const_iterator tok = tokens.begin();
    while (tok != tokens.end()) {
        if (*tok == "..") {
            if (!kept.empty())
                kept.pop_back();
        } else if (!tok->empty() && *tok != ".") {
            kept.push_back(*tok);
        }
        tok++;
    }

    if (kept.empty())
        return "/";

    string canon;
    for (list<string>::const_iterator el = kept.begin();
         el != kept.end(); el++) {
        canon += "/";
        canon += *el;
    }
    return canon;
}
#include <glob.h>
#include <sys/stat.h>
// Return the list of paths matching a glob(3) pattern inside a directory.
//
// @param dir     directory in which to search; joined with the pattern
//                through path_cat() to build the expression given to glob().
// @param pattern glob expression for the file names.
// @return the matched paths exactly as glob() reports them, or an empty
//         list when glob() returns a non-zero status (error or no match).
list<std::string> path_dirglob(const std::string &dir,
                               const std::string pattern)
{
    list<string> res;
    // Value-initialized so that the globfree() call below is well defined
    // whatever glob() did or did not store before returning.
    glob_t mglob = glob_t();
    string mypat = path_cat(dir, pattern);
    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
        // gl_pathc is a size_t: use an unsigned index of the same type
        for (size_t i = 0; i < mglob.gl_pathc; i++) {
            res.push_back(mglob.gl_pathv[i]);
        }
    }
    // glob() can leave partial results allocated even when it reports an
    // error, so the structure is released on every path.
    globfree(&mglob);
    return res;
}
#else // TEST_PATHUT #else // TEST_PATHUT
#include <iostream> #include <iostream>
@ -108,7 +184,7 @@ using namespace std;
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2", const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
"/dir1/dir2", "/dir1/dir2",
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c", "./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
"/dir/.c", "/dir/.c", "/dir/toto.txt", "toto.txt1"
}; };
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub", const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
// Test driver for the path utilities. Each group of checks sits in its own
// #if section; only the last one (#if 1) is enabled: it expects exactly two
// arguments, a directory and a glob pattern, and prints the paths returned
// by path_dirglob(), one per line.
// NOTE(review): most lines below appear to hold two revisions of the same
// statement side by side -- confirm against the real file before building.
int main(int argc, const char **argv) int main(int argc, const char **argv)
{ {
string s;
list<string>::const_iterator it;
// Disabled: path_getfather() / path_getsimple() / path_basename() over tstvec
#if 0 #if 0
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl; cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
} }
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl; cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
}
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Basename " <<
path_basename(tstvec[i], ".txt") << endl;
} }
#endif #endif
string s;
// Disabled: path_tildexpand() over ttvec
#if 0
for (int i = 0; i < nttvec; i++) { for (int i = 0; i < nttvec; i++) {
cout << "tildexp: '" << ttvec[i] << "' -> '" << cout << "tildexp: '" << ttvec[i] << "' -> '" <<
path_tildexpand(ttvec[i]) << "'" << endl; path_tildexpand(ttvec[i]) << "'" << endl;
} }
#endif
// Disabled: path_canon() over a local list of sample paths
#if 0
const string canontst[] = {"/dir1/../../..", "/////", "",
"/dir1/../../.././/////dir2///////",
"../../",
"../../../../../../../../../../"
};
unsigned int nttvec = sizeof(canontst) / sizeof(string);
for (unsigned int i = 0; i < nttvec; i++) {
cout << "canon: '" << canontst[i] << "' -> '" <<
path_canon(canontst[i]) << "'" << endl;
}
#endif
// Enabled: list the matches of <pattern> inside <dir>
#if 1
if (argc != 3) {
// Wrong argument count: print the usage on stderr and exit with status 1
fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
exit(1);
}
string dir=argv[1], pattern=argv[2];
list<string> matched = path_dirglob(dir, pattern);
for (it = matched.begin(); it != matched.end();it++) {
cout << *it << endl;
}
#endif
return 0; return 0;
} }

View File

@ -1,14 +1,19 @@
#ifndef _PATHUT_H_INCLUDED_ #ifndef _PATHUT_H_INCLUDED_
#define _PATHUT_H_INCLUDED_ #define _PATHUT_H_INCLUDED_
/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list>
extern void path_catslash(std::string &s); extern void path_catslash(std::string &s);
extern std::string path_cat(const std::string &s1, const std::string &s2); extern std::string path_cat(const std::string &s1, const std::string &s2);
extern std::string path_getsimple(const std::string &s); extern std::string path_getsimple(const std::string &s);
/** Return path_getsimple(s) with a trailing suff removed. The suffix is
 *  only stripped when it is non-empty and the simple name is strictly
 *  longer than it and ends with it. */
extern std::string path_basename(const std::string &s, const std::string &suff="");
extern std::string path_getfather(const std::string &s);
extern std::string path_home(); extern std::string path_home();
extern std::string path_tildexpand(const std::string &s); extern std::string path_tildexpand(const std::string &s);
/** Return an absolute form of s with empty, "." and ".." elements
 *  processed out. A path not starting with '/' is taken relative to the
 *  current directory. Returns s itself when it is empty, and an empty
 *  string if the current directory can't be read. */
extern std::string path_canon(const std::string &s);
/** Return the paths which glob() matches for pattern joined to dir
 *  (through path_cat()). The list is empty when glob() returns a
 *  non-zero status. */
extern std::list<std::string> path_dirglob(const std::string &dir,
const std::string pattern);
#endif /* _PATHUT_H_INCLUDED_ */ #endif /* _PATHUT_H_INCLUDED_ */