allow independent creation / deletion of stem dbs

2006-01-09 16:53:31 +00:00 · 2006-01-09 16:53:31 +00:00 · dac569ab51
commit dac569ab51
parent c4ce5cf691
9 changed files with 364 additions and 122 deletions
--- a/src/excludefile
+++ b/src/excludefile
@ -1,29 +1,29 @@
+#*
+*.cache
+*.core
 *.o
 *~
-*.core
-*.cache
-#*
+.#*
+.#*
 .moc
 .obj
 .ui
-.#*
 CVS
-alldeps
-.#*
-autom4*
 TAGS
+alldeps
+autom4*
 config.cache
 config.log
 config.status
 excludefile
+lib/librcl.a
 makesrcdist.sh
-recollinstall
 mk/localdefs
-sysconf
 qtgui/Makefile
 qtgui/preview/Makefile
 qtgui/preview/preview.pro
 qtgui/preview/pvmain.cpp
-lib/librcl.a
+recollinstall
 sampleconf/recoll.conf
+sysconf
 wxgui
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
 #include <iostream>
 #include <list>
 #include <map>
+#include <algorithm>

 #include "pathut.h"
 #include "conftree.h"
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
    // filesystem anymore.
    db.purge();

-    // Create stemming databases
+    // Create stemming databases. We also remove those which are not
+    // configured.
    string slangs;
    if (config->getConfParam("indexstemminglanguages", slangs)) {
 	list<string> langs;
 	stringToStrings(slangs, langs);
-	for (list<string>::const_iterator it = langs.begin(); 
-	     it != langs.end(); it++) {
+
+	// Get the list of existing stem dbs from the database (some may have 
+	// been manually created, we just keep those from the config
+	list<string> dblangs = db.getStemLangs();
+	list<string>::const_iterator it;
+	for (it = dblangs.begin(); it != dblangs.end(); it++) {
+	    if (find(langs.begin(), langs.end(), *it) == langs.end())
+		db.deleteStemDb(*it);
+	}
+	for (it = langs.begin(); it != langs.end(); it++) {
 	    db.createStemDb(*it);
 	}
    }
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
    return true;
 }

+// Build the stem database for lang. Nothing is attempted (and false is
+// returned) when init() fails.
+bool DbIndexer::createStemDb(const string &lang)
+{
+    return init() && db.createStemDb(lang);
+}
+
+/** 
+ Index individual files, out of a full tree run. No database purging
+*/
 bool DbIndexer::indexFiles(const list<string> &filenames)
 {
    if (!init())
--- a/src/index/indexer.h
+++ b/src/index/indexer.h
@ -1,6 +1,6 @@
 #ifndef _INDEXER_H_INCLUDED_
 #define _INDEXER_H_INCLUDED_
-/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #include <list>
@ -24,7 +24,9 @@ class DbIndexer;
 class ConfIndexer {
 public:
    enum runStatus {IndexerOk, IndexerError};
-    ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
+    ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) 
+	{
+	}
    virtual ~ConfIndexer();
    /** Worker function: doe the actual indexing */
    bool index(bool resetbefore = false);
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
    /** Index a list of files. No db cleaning or stemdb updating */
    bool indexFiles(const std::list<std::string> &files);

+    /** Create stem database for given language */
+    bool createStemDb(const string &lang);
+
    /**  Tree walker callback method */
    FsTreeWalker::Status 
 	processone(const std::string &, const struct stat *, 
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif

 #include <stdio.h>
@ -19,10 +19,12 @@ using namespace std;
 #include "pathut.h"


+// Globals for exit cleanup
 ConfIndexer *confindexer;
 DbIndexer *dbindexer;

-bool indexfiles(RclConfig *config, const list<string> &filenames)
+// Index a list of files 
+static bool indexfiles(RclConfig *config, const list<string> &filenames)
 {
    if (filenames.empty())
 	return true;
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
    return dbindexer->indexFiles(filenames);
 }

+// Build the stem database for language lang inside the database
+// directory named by the "dbdir" configuration parameter.
+static bool createstemdb(RclConfig *config, const string &lang)
+{
+    // Multiple databases are not checked for: they are currently a
+    // fiction anyway.
+    string confdir;
+    const bool found = config->getConfParam("dbdir", confdir);
+    if (!found) {
+	LOGERR(("createstemdb: no database directory in configuration\n"));
+	return false;
+    }
+    confdir = path_tildexpand(confdir);
+    // The indexer goes into the file-level global declared for exit cleanup
+    dbindexer = new DbIndexer(config, confdir);
+    return dbindexer->createStemDb(lang);
+}
+
 static void cleanup()
 {
    delete confindexer;
@ -63,15 +80,19 @@ static int     op_flags;
 #define OPT_z     0x2 
 #define OPT_h     0x4 
 #define OPT_i     0x8
+#define OPT_s     0x10

 static const char usage [] =
+"\n"
 "recollindex [-hz] \n"
+"    Normal index run\n"
 "recollindex -i <filename [filename ...]>\n"
+"    Index individual files. No db purge or stem database updates\n"
+"recollindex -s <lang>\n"
+"    Build stem database for language <lang>\n"
 "Options:\n"
 " -h : print this message\n"
 " -z : reset database before starting indexation\n\n"
-" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
-"           database updates in this case\n"
 ;

 static void
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
 	    case 'z': op_flags |= OPT_z; break;
 	    case 'h': op_flags |= OPT_h; break;
 	    case 'i': op_flags |= OPT_i; break;
+	    case 's': op_flags |= OPT_s; break;
 	    default: Usage(); break;
 	    }
    b1: argc--; argv++;
@ -108,7 +130,6 @@ int main(int argc, const char **argv)

    string reason;
    RclConfig *config = recollinit(cleanup, sigcleanup, reason);
-
    if (config == 0 || !config->ok()) {
 	cerr << "Configuration problem: " << reason << endl;
 	exit(1);
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
 	    }
 	}
 	exit(!indexfiles(config, filenames));
+    } else if (op_flags & OPT_s) {
+	if (argc != 1) 
+	    Usage();
+	string lang = *argv++; argc--;
+	exit(!createstemdb(config, lang));
    } else {
 	confindexer = new ConfIndexer(config);
 	bool rezero(op_flags & OPT_z);
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -23,6 +23,7 @@ using namespace std;
 #include "smallut.h"
 #include "pathhash.h"
 #include "utf8iter.h"
+#include "wipedir.h"

 #include "xapian.h"
 #include <xapian/stem.h>
@ -67,23 +68,24 @@ Rcl::Db::~Db()
 	    ndb->iswritable));
    if (ndb->isopen == false)
 	return;
-    string ermsg;
+    const char *ermsg = "Unknown error";
    try {
 	LOGDEB(("Rcl::Db::~Db: closing native database\n"));
-	if (ndb->iswritable == true)
+	if (ndb->iswritable == true) {
 	    ndb->wdb.flush();
+	}
 	delete ndb;
 	return;
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
 	ermsg = "Caught unknown exception";
    }
-    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
+    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
 }

 bool Rcl::Db::open(const string& dir, OpenMode mode)
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 	LOGERR(("Rcl::Db::open: already open\n"));
 	return false;
    }
-    string ermsg;
+    const char *ermsg = "Unknown";
    try {
 	switch (mode) {
 	case DbUpd:
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 	ndb->basedir = dir;
 	return true;
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
 	ermsg = "Caught unknown exception";
    }
    LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n", 
-	    dir.c_str(), ermsg.c_str()));
+	    dir.c_str(), ermsg));
    return false;
 }

@ -148,7 +150,7 @@ bool Rcl::Db::close()
 	    ndb->iswritable));
    if (ndb->isopen == false)
 	return true;
-    string ermsg;
+    const char *ermsg = "Unknown";
    try {
 	if (ndb->iswritable == true) {
 	    ndb->wdb.flush();
@ -159,16 +161,15 @@ bool Rcl::Db::close()
 	if (pdata)
 	    return true;
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
 	ermsg = "Caught unknown exception";
    }
-    LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", 
-	    ermsg.c_str()));
+    LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
    return false;
 }

@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
 // Callback for the document to word splitting class during indexation
 bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
 {
-    // cerr << "splitCb: term " << term << endl;
-    //string printable;
-    //transcode(term, printable, "UTF-8", "ISO-8859-1");
-    //cerr << "Adding " << printable << endl;
+#if 0
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
+    string printable;
+    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
+	LOGDEB(("                                [%s]\n", printable.c_str()));
+    }
+#endif

+    string ermsg; // owned copy: the exception dies with its catch block
     try {
-	// 1 is the value for wdfinc in index_text when called from omindex
-	// TOBEDONE: check what this is used for
+	// Note: 1 is the within document frequency increment. It would 
+	// be possible to assign different weights to doc parts (ie title)
+	// by using a higher value
 	curpos = pos;
 	doc.add_posting(term, basepos + curpos, 1);
-    } catch (...) {
-	LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
-	return false;
-    }
 	return true;
+    } catch (const Xapian::Error &e) {
+	ermsg = e.get_msg();
+    } catch (...) {
+	ermsg = "Unknown error";
+    }
+    LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg.c_str()));
+    return false;
 }

 // Unaccent and lowercase data, replace \n\r with spaces
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
    return true;
 }

-/* omindex direct */
+/* From omindex direct */
 /* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. */
 string
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)

 	output += " ...";
    }
-
-    // replace newlines with spaces
-    size_t i = 0;    
-    while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
+    // No need to replace newlines with spaces, we do this in dumb_string()
    return output;
 }

-// Truncate longer path and uniquize with hash . The goad for this is
+// Truncate longer path and uniquize with hash . The goal for this is
 // to avoid xapian max term length limitations, not to gain space (we
 // gain very little even with very short maxlens like 30)
-#define HASHPATH
 #define PATHHASHLEN 150

 // Add document in internal form to the database: index the terms in
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)

    // Split and index file name. This supposes that it's either ascii
    // or utf-8. If this fails, we just go on. We need a config
-    // parameter for file name charset
+    // parameter for file name charset.
+    // Do we really want to fold case here ?
    if (dumb_string(fn, noacc)) {
 	splitter.text_to_words(noacc);
 	splitData.basepos += splitData.curpos + 100;
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;

-    // Split body and index terms
+    // Split and index body
    if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;

-    // Split keywords and index terms
+    // Split and index keywords
    if (!dumb_string(doc.keywords, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;

-    // Split abstract and index terms
+    // Split and index abstract
    if (!dumb_string(doc.abstract, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)

    // Path name
    string hash;
-#ifdef HASHPATH
    pathHash(fn, hash, PATHHASHLEN);
-#else
-    hash = fn;
-#endif
    LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
-
    string pathterm  = "P" + hash;
    newdocument.add_term(pathterm);

-    // File path + internal path: document unique identifier for
-    // documents inside multidocument files.
+    // Internal path: with path, makes unique identifier for documents
+    // inside multidocument files.
    string uniterm;
    if (!doc.ipath.empty()) {
 	uniterm  = "Q" + hash + "|" + doc.ipath;
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    string record = "url=file://" + fn;
    record += "\nmtype=" + doc.mimetype;
    record += "\nfmtime=" + doc.fmtime;
-    if (!doc.dmtime.empty())
+    if (!doc.dmtime.empty()) {
 	record += "\ndmtime=" + doc.dmtime;
+    }
    record += "\norigcharset=" + doc.origcharset;
    record += "\ncaption=" + doc.title;
    record += "\nkeywords=" + doc.keywords;
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	record += "\nipath=" + doc.ipath;
    }
    record += "\n";
-
    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
    newdocument.set_data(record);

    const char *fnc = fn.c_str();
-   
    // Add db entry or update existing entry:
    try {
 	Xapian::docid did = 
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	}
    } catch (...) {
 	// FIXME: is this ever actually needed?
+	try {
 	    ndb->wdb.add_document(newdocument);
 	    LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", 
 		    fnc));
+	} catch (...) {
+	    LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
+	    return false;
+	}
    }
    return true;
 }

+// Test if given filename has changed since last indexed:
 bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 {
    if (pdata == 0)
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)

    // If no document exist with this path, we do need update
    string hash;
-#ifdef HASHPATH
    pathHash(filename, hash, PATHHASHLEN);
-#else
-    hash = filename;
-#endif
    string pathterm  = "P" + hash;
-    if (!ndb->wdb.term_exists(pathterm)) {
-	LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
-	return true;
-    }
+    const char *ermsg;

    // Look for all documents with this path. We need to look at all
    // to set their existence flag.  We check the update time on the
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
    // file changed)
    Xapian::PostingIterator doc;
    try {
+	if (!ndb->wdb.term_exists(pathterm)) {
+	    LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
+	    return true;
+	}
+
 	Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
 	for (Xapian::PostingIterator docid = docid0;
 	     docid != ndb->wdb.postlist_end(pathterm); docid++) {
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 	    if (*docid < ndb->updated.size())
 		ndb->updated[*docid] = true;
 	}
+	return false;
+    } catch (const Xapian::Error &e) {
+	ermsg = e.get_msg().c_str();
    } catch (...) {
+	ermsg= "Unknown error";
+    }
+    LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
    return true;
 }

-    return false;
-}
-
+const static string stemdirstem = "stem_";
 /// Compute name of stem db for given base database and language
 static string stemdbname(const string& basename, string lang)
 {
-    string nm = path_cat(basename, string("stem_") + lang);
+    string nm = path_cat(basename, stemdirstem + lang);
    return nm;
 }

-// Is char non-lowercase ascii ?
+// Deciding if we try to stem the term. If it has numerals or capitals
+// we don't
 inline static bool
 p_notlowerorutf(unsigned int c)
 {
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
    return false;
 }

+/**
+ * Remove the stem database directory for language lang: wipedir() is
+ * run on it, then rmdir(). The native database must exist and be open.
+ * @return true only if both wipedir() and rmdir() returned 0.
+ */
+bool Rcl::Db::deleteStemDb(const string& lang)
+{
+    LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
+    Native *ndb = (Native *)pdata;
+    if (ndb == 0 || !ndb->isopen)
+	return false;
+
+    const string stemdir = stemdbname(ndb->basedir, lang);
+    // rmdir() is only attempted once wipedir() has returned 0
+    if (wipedir(stemdir) != 0)
+	return false;
+    return rmdir(stemdir.c_str()) == 0;
+}
+
 /**
 * Create database of stem to parents associations for a given language.
 * We walk the list of all terms, stem them, and create another Xapian db
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
-    if (ndb->isopen == false || ndb->iswritable == false)
+    if (ndb->isopen == false)
 	return false;

    // First build the in-memory stem database:
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
 	    }
 	    assocs.insert(pair<string,string>(stem, *it));
 	}
+    } catch (const Xapian::Error &e) {
+	LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
+	return false;
    } catch (...) {
-	LOGERR(("Stem database build failed: no stemmer for %s ? \n", 
+	LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", 
 		lang.c_str()));
 	return false;
    }

+    // Scope guard: unless do_it has been cleared, the destructor calls
+    // wipedir() on dir and then rmdir()s it.
+    struct DirWiper {
+	string dir;
+	bool do_it;
+	DirWiper(string d) : dir(d), do_it(true) {}
+	~DirWiper() {
+	    if (!do_it)
+		return;
+	    wipedir(dir);
+	    rmdir(dir.c_str());
+	}
+    };
    // Create xapian database for stem relations
    string stemdbdir = stemdbname(ndb->basedir, lang);
-    string ermsg = "NOERROR";
+    // We want to get rid of the db dir in case of error. This gets disarmed
+    // just before success return.
+    DirWiper wiper(stemdbdir);
+    const char *ermsg = "NOERROR";
    Xapian::WritableDatabase sdb;
    try {
 	sdb = Xapian::WritableDatabase(stemdbdir, 
 				       Xapian::DB_CREATE_OR_OVERWRITE);
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
    }
    if (ermsg != "NOERROR") {
 	LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n", 
-		stemdbdir.c_str(), ermsg.c_str()));
+		stemdbdir.c_str(), ermsg));
 	return false;
    }

@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
    }
    LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", 
 	    assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
+    wiper.do_it = false;
    return true;
 }

+/**
+ * List the languages for which a stem database entry exists: glob for
+ * stemdirstem + "*" under the database base directory and keep what
+ * follows that prefix in each matched name. The list is empty when
+ * there is no native database object.
+ */
+list<string> Rcl::Db::getStemLangs()
+{
+    LOGDEB(("Rcl::Db::getStemLang\n"));
+    list<string> langs;
+    Native *ndb = (Native *)pdata;
+    if (ndb == 0)
+	return langs;
+
+    langs = path_dirglob(ndb->basedir, stemdirstem + "*");
+    list<string>::iterator cur = langs.begin();
+    while (cur != langs.end()) {
+	// Reduce to the simple file name, then drop the leading prefix
+	string name = path_basename(*cur);
+	*cur = name.substr(stemdirstem.length(), string::npos);
+	cur++;
+    }
+    return langs;
+}
+
+
 /**
 * This is called at the end of an indexing session, to delete the
 *  documents for files that are no longer there. We also build the
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
    // and does nothing). Maybe related to the exceptions below when
    // trying to delete an unexistant document ?
    // Flushing before trying the deletes seeems to work around the problem
+    try {
 	ndb->wdb.flush();
+    } catch (...) {
+	LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
+    }
    for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
 	if (!ndb->updated[docid]) {
 	    try {
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
 	    }
 	}
    }
+    try {
 	ndb->wdb.flush();
+    } catch (...) {
+	LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
+    }
    return true;
 }

@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
 };


-//
 // Turn string into list of xapian queries. There is little
 // interpretation done on the string (no +term -term or filename:term
 // stuff). We just separate words and phrases, and interpret
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
    Native *ndb = (Native *)pdata;

    string hash;
-#ifdef HASHPATH
    pathHash(fn, hash, PATHHASHLEN);
-#else
-    hash = fn;
-#endif
    string pathterm  = "P" + hash;
+
+    // Look for all documents with this path, searching for the one
+    // with the appropriate ipath. This is very inefficient.
+    const char *ermsg = "";
+    try {
 	if (!ndb->db.term_exists(pathterm)) {
 	    LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", 
 		    pathterm.c_str(), pathterm.length()));
 	    return false;
 	}
-
-    // Look for all documents with this path, searching for the one
-    // with the appropriate ipath. This is very inefficient.
-    try {
 	for (Xapian::PostingIterator docid = 
 		 ndb->db.postlist_begin(pathterm);
 	     docid != ndb->db.postlist_end(pathterm); docid++) {
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
 	    if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
 		return true;
 	}
+    } catch (const Xapian::Error &e) {
+	ermsg = e.get_msg().c_str();
+    } catch (const string &s) {
+	ermsg = s.c_str();
+    } catch (const char *s) {
+	ermsg = s;
    } catch (...) {
-	return false;
+	ermsg = "Caught unknown exception";
+    }
+    if (*ermsg) {
+	LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
    }
    return false;
 }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -1,6 +1,6 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #include <list>
@ -102,6 +102,7 @@ public:
    bool needUpdate(const string &filename, const struct stat *stp);
    bool purge();
    bool createStemDb(const string &lang);
+    bool deleteStemDb(const string &lang);

    // Query-related functions

@ -127,6 +128,10 @@ public:
    /** Get results count for current query */
    int getResCnt();

+    /** Get a list of existing stemming databases */
+    std::list<std::string> getStemLangs();
+
+    /** Things we don't want to have here. */
    friend class Rcl::DbPops;

 private:
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@ -15,7 +15,7 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
 	$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
 	       -DTEST_FSTREEWALK fstreewalk.cpp

-PATHUT_OBJS= trpathut.o pathut.o 
+PATHUT_OBJS= trpathut.o pathut.o  $(BIGLIB)
 trpathut : $(PATHUT_OBJS) 
 	$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
 trpathut.o : pathut.cpp pathut.h
--- a/src/utils/pathut.cpp
+++ b/src/utils/pathut.cpp
@ -1,15 +1,21 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif

 #ifndef TEST_PATHUT
 #include <unistd.h>
+#include <sys/param.h>
 #include <pwd.h>
+
 #include <iostream>
+#include <list>
+#include <stack>

 #include "pathut.h"
 #ifndef NO_NAMESPACES
 using std::string;
+using std::list;
+using std::stack;
 #endif /* NO_NAMESPACES */

 void path_catslash(std::string &s) {
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
    return simple;
 }

+// Simple name of s, as computed by path_getsimple(), with the trailing
+// suff removed when suff is non-empty, strictly shorter than the name,
+// and the name ends with it.
+string path_basename(const string &s, const string &suff)
+{
+    string name = path_getsimple(s);
+    const string::size_type slen = suff.length();
+    // Nothing to strip: no suffix, or it would consume the whole name
+    if (slen == 0 || name.length() <= slen)
+	return name;
+    const string::size_type at = name.rfind(suff);
+    if (at == string::npos || at + slen != name.length())
+	return name;
+    return name.substr(0, at);
+}
+
 string path_home()
 {
    uid_t uid = getuid();
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
    return o;
 }

+#include <smallut.h>
+// Compute an absolute, cleaned-up form of path is: a relative input is
+// first joined to the current directory, empty and "." elements are
+// dropped, and each ".." removes the previously kept element (if any).
+// An empty input is returned as is; "" is returned if getcwd() fails.
+extern std::string path_canon(const std::string &is)
+{
+    if (is.empty())
+	return is;
+
+    string path = is;
+    if (path[0] != '/') {
+	// Relative path: anchor it at the current working directory
+	char cwd[MAXPATHLEN];
+	if (getcwd(cwd, MAXPATHLEN) == 0)
+	    return "";
+	path = path_cat(string(cwd), path);
+    }
+
+    list<string> parts;
+    stringToTokens(path, parts, "/");
+
+    // Resolve "." and ".." while copying the elements
+    list<string> kept;
+    list<string>::const_iterator it;
+    for (it = parts.begin(); it != parts.end(); it++) {
+	if (it->empty() || *it == ".")
+	    continue;
+	if (*it != "..") {
+	    kept.push_back(*it);
+	} else if (!kept.empty()) {
+	    kept.pop_back();
+	}
+    }
+
+    // Nothing left means the root directory
+    if (kept.empty())
+	return "/";
+    string ret;
+    for (it = kept.begin(); it != kept.end(); it++) {
+	ret += "/";
+	ret += *it;
+    }
+    return ret;
+}
+
+#include <glob.h>
+#include <sys/stat.h>
+/**
+ * Return the paths matched by glob() for the pattern built with
+ * path_cat(dir, pattern).
+ * @param dir directory under which to look.
+ * @param pattern shell glob pattern for the entry names.
+ * @return the matched paths as produced by glob(); empty when glob()
+ *   returns non-zero (no match or error).
+ */
+list<std::string> path_dirglob(const std::string &dir, 
+				    const std::string pattern)
+{
+    list<string> res;
+    // Value-initialize so that globfree() is safe whatever glob() did
+    glob_t mglob = glob_t();
+    string mypat = path_cat(dir, pattern);
+    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
+	// gl_pathc is a size_t: use an unsigned index
+	for (size_t i = 0; i < mglob.gl_pathc; i++) {
+	    res.push_back(mglob.gl_pathv[i]);
+	}
+    }
+    // Release on failure too: glob() may have allocated partial results
+    globfree(&mglob);
+    return res;
+}
+
+
 #else // TEST_PATHUT

 #include <iostream>
@ -108,7 +184,7 @@ using namespace std;
 const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
 			 "/dir1/dir2",
 			"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
-			"/dir/.c",
+			"/dir/.c", "/dir/toto.txt", "toto.txt1"
 };

 const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);

 int main(int argc, const char **argv)
 {
+    string s;
+    list<string>::const_iterator it;
 #if 0
-    for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
-	cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+	cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
    }
-    for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
-	cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+	cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
+    }
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+	cout << tstvec[i] << " Basename " << 
+	    path_basename(tstvec[i], ".txt") << endl;
    }
 #endif
-    string s;

+#if 0
    for (int i = 0; i < nttvec; i++) {
 	cout << "tildexp: '" << ttvec[i] << "' -> '" << 
 	    path_tildexpand(ttvec[i]) << "'" << endl;
    }
+#endif

-
+#if 0
+    const string canontst[] = {"/dir1/../../..", "/////", "", 
+			       "/dir1/../../.././/////dir2///////",
+			       "../../", 
+			       "../../../../../../../../../../"
+    };
+    unsigned int nttvec = sizeof(canontst) / sizeof(string);
+    for (unsigned int i = 0; i < nttvec; i++) {
+	cout << "canon: '" << canontst[i] << "' -> '" << 
+	    path_canon(canontst[i]) << "'" << endl;
+    }
+#endif    
+#if 1
+    if (argc != 3) {
+	fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
+	exit(1);
+    }
+    string dir=argv[1], pattern=argv[2];
+    list<string> matched = path_dirglob(dir, pattern);
+    for (it = matched.begin(); it != matched.end();it++) {
+	cout << *it << endl;
+    }
+#endif

    return 0;
 }
--- a/src/utils/pathut.h
+++ b/src/utils/pathut.h
@ -1,14 +1,19 @@
 #ifndef _PATHUT_H_INCLUDED_
 #define _PATHUT_H_INCLUDED_
-/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
+#include <list>

 extern void path_catslash(std::string &s);
 extern std::string path_cat(const std::string &s1, const std::string &s2);
 extern std::string path_getsimple(const std::string &s);
+extern std::string path_basename(const std::string &s, const std::string &suff="");
 extern std::string path_getfather(const std::string &s);
 extern std::string path_home();
 extern std::string path_tildexpand(const std::string &s);

+extern std::string path_canon(const std::string &s);
+extern std::list<std::string> path_dirglob(const std::string &dir, 
+					   const std::string pattern);
 #endif /* _PATHUT_H_INCLUDED_ */