From dac569ab51ccddef8d42162aacd139d686965157 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Mon, 9 Jan 2006 16:53:31 +0000
Subject: [PATCH] allow independent creation / deletion of stem dbs

---
 src/excludefile           |  20 ++--
 src/index/indexer.cpp     |  28 ++++-
 src/index/indexer.h       |  23 ++--
 src/index/recollindex.cpp |  40 +++++--
 src/rcldb/rcldb.cpp       | 236 +++++++++++++++++++++++++-------------
 src/rcldb/rcldb.h         |   7 +-
 src/utils/Makefile        |   4 +-
 src/utils/pathut.cpp      | 121 +++++++++++++++++--
 src/utils/pathut.h        |   7 +-
 9 files changed, 364 insertions(+), 122 deletions(-)
diff --git a/src/excludefile b/src/excludefile
index bd7984f0..3c40ff36 100644
--- a/src/excludefile
+++ b/src/excludefile
@@ -1,29 +1,29 @@
+#*
+*.cache
+*.core
 *.o
 *~
-*.core
-*.cache
-#*
+.#*
+.#*
 .moc
 .obj
 .ui
-.#*
 CVS
-alldeps
-.#*
-autom4*
 TAGS
+alldeps
+autom4*
 config.cache
 config.log
 config.status
 excludefile
+lib/librcl.a
 makesrcdist.sh
-recollinstall
 mk/localdefs
-sysconf
 qtgui/Makefile
 qtgui/preview/Makefile
 qtgui/preview/preview.pro
 qtgui/preview/pvmain.cpp
-lib/librcl.a
+recollinstall
 sampleconf/recoll.conf
+sysconf
 wxgui
diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp
index 43ad1ff8..829ba167 100644
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
 #include <iostream>
 #include <list>
 #include <map>
+#include <algorithm>
 
 #include "pathut.h"
 #include "conftree.h"
@@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
     // filesystem anymore.
     db.purge();
 
-    // Create stemming databases
+    // Create stemming databases. We also remove those which are not
+    // configured.
     string slangs;
     if (config->getConfParam("indexstemminglanguages", slangs)) {
 	list<string> langs;
 	stringToStrings(slangs, langs);
-	for (list<string>::const_iterator it = langs.begin(); 
-	     it != langs.end(); it++) {
+
+	// Get the list of existing stem dbs from the database (some may have
+	// been created manually): we only keep those listed in the config.
+	list<string> dblangs = db.getStemLangs();
+	list<string>::const_iterator it;
+	for (it = dblangs.begin(); it != dblangs.end(); it++) {
+	    if (find(langs.begin(), langs.end(), *it) == langs.end())
+		db.deleteStemDb(*it);
+	}
+	for (it = langs.begin(); it != langs.end(); it++) {
 	    db.createStemDb(*it);
 	}
     }
@@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
     return true;
 }
 
+bool DbIndexer::createStemDb(const string &lang)
+{
+    // Stop right away if init() (default argument) fails: && makes sure
+    // the stem db builder is never reached in that case.
+    return init() && db.createStemDb(lang);
+}
+
+/** 
+ Index individual files, out of a full tree run. No database purging
+*/
 bool DbIndexer::indexFiles(const list<string> &filenames)
 {
     if (!init())
diff --git a/src/index/indexer.h b/src/index/indexer.h
index 54eb7cc5..6fa9748f 100644
--- a/src/index/indexer.h
+++ b/src/index/indexer.h
@@ -1,6 +1,6 @@
 #ifndef _INDEXER_H_INCLUDED_
 #define _INDEXER_H_INCLUDED_
-/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 #include <list>
@@ -24,10 +24,12 @@ class DbIndexer;
 class ConfIndexer {
  public:
     enum runStatus {IndexerOk, IndexerError};
-    ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
-	virtual ~ConfIndexer();
-	/** Worker function: doe the actual indexing */
-	bool index(bool resetbefore = false);
+    ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) 
+	{
+	}
+    virtual ~ConfIndexer();
+    /** Worker function: does the actual indexing */
+    bool index(bool resetbefore = false);
  private:
 	RclConfig *config;
 	DbIndexer *dbindexer; // Object to process directories for a given db
@@ -36,10 +38,10 @@ class ConfIndexer {
 /** Index things into one database
  
 Tree indexing: we inherits FsTreeWalkerCB so that, the processone()
-method is called by the file-system tree walk code for each file and
-directory. We keep all state needed while indexing, and finally call
-the methods to purge the db of stale entries and create the stemming
-databases.
+  method is called by the file-system tree walk code for each file and
+  directory. We keep all state needed while indexing, and finally call
+  the methods to purge the db of stale entries and create the stemming
+  databases.
 
 Single file(s) indexing: no database purging or stem db updating.
 */
@@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
     /** Index a list of files. No db cleaning or stemdb updating */
     bool indexFiles(const std::list<std::string> &files);
 
+    /** Create stem database for given language */
+    bool createStemDb(const string &lang);
+
     /**  Tree walker callback method */
     FsTreeWalker::Status 
 	processone(const std::string &, const struct stat *, 
diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp
index a3f0e0bd..d33ba70e 100644
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #include <stdio.h>
@@ -19,10 +19,12 @@ using namespace std;
 #include "pathut.h"
 
 
+// Globals for exit cleanup
 ConfIndexer *confindexer;
 DbIndexer *dbindexer;
 
-bool indexfiles(RclConfig *config, const list<string> &filenames)
+// Index a list of files 
+static bool indexfiles(RclConfig *config, const list<string> &filenames)
 {
     if (filenames.empty())
 	return true;
@@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
     return dbindexer->indexFiles(filenames);
 }
 
+// Build the stem database for one language in the configured index
+static bool createstemdb(RclConfig *config, const string &lang)
+{
+    // Only the single "dbdir" database is handled: multiple databases
+    // are not a reality at this point.
+    string confdir;
+    if (!config->getConfParam("dbdir", confdir)) {
+	LOGERR(("createstemdb: no database directory in configuration\n"));
+	return false;
+    }
+    string dbdir = path_tildexpand(confdir);
+    dbindexer = new DbIndexer(config, dbdir);
+    return dbindexer->createStemDb(lang);
+}
+
 static void cleanup()
 {
     delete confindexer;
@@ -63,15 +80,19 @@ static int     op_flags;
 #define OPT_z     0x2 
 #define OPT_h     0x4 
 #define OPT_i     0x8
+#define OPT_s     0x10
 
 static const char usage [] =
-"  recollindex [-hz] \n"
-"  recollindex -i <filename [filename ...]>\n"
+"\n"
+"recollindex [-hz] \n"
+"    Normal index run\n"
+"recollindex -i <filename [filename ...]>\n"
+"    Index individual files. No db purge or stem database updates\n"
+"recollindex -s <lang>\n"
+"    Build stem database for language <lang>\n"
 "Options:\n"
 " -h : print this message\n"
 " -z : reset database before starting indexation\n\n"
-" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
-"           database updates in this case\n"
 ;
 
 static void
@@ -97,6 +118,7 @@ int main(int argc, const char **argv)
 	    case 'z': op_flags |= OPT_z; break;
 	    case 'h': op_flags |= OPT_h; break;
 	    case 'i': op_flags |= OPT_i; break;
+	    case 's': op_flags |= OPT_s; break;
 	    default: Usage(); break;
 	    }
     b1: argc--; argv++;
@@ -108,7 +130,6 @@ int main(int argc, const char **argv)
 
     string reason;
     RclConfig *config = recollinit(cleanup, sigcleanup, reason);
-
     if (config == 0 || !config->ok()) {
 	cerr << "Configuration problem: " << reason << endl;
 	exit(1);
@@ -130,6 +151,11 @@ int main(int argc, const char **argv)
 	    }
 	}
 	exit(!indexfiles(config, filenames));
+    } else if (op_flags & OPT_s) {
+	if (argc != 1) 
+	    Usage();
+	string lang = *argv++; argc--;
+	exit(!createstemdb(config, lang));
     } else {
 	confindexer = new ConfIndexer(config);
 	bool rezero(op_flags & OPT_z);
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 3c1e0646..cd15ef6a 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@@ -23,6 +23,7 @@ using namespace std;
 #include "smallut.h"
 #include "pathhash.h"
 #include "utf8iter.h"
+#include "wipedir.h"
 
 #include "xapian.h"
 #include <xapian/stem.h>
@@ -67,23 +68,24 @@ Rcl::Db::~Db()
 	    ndb->iswritable));
     if (ndb->isopen == false)
 	return;
-    string ermsg;
+    const char *ermsg = "Unknown error";
     try {
 	LOGDEB(("Rcl::Db::~Db: closing native database\n"));
-	if (ndb->iswritable == true)
+	if (ndb->iswritable == true) {
 	    ndb->wdb.flush();
+	}
 	delete ndb;
 	return;
     } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
     } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
     } catch (const char *s) {
 	ermsg = s;
     } catch (...) {
 	ermsg = "Caught unknown exception";
     }
-    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
+    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
 }
 
 bool Rcl::Db::open(const string& dir, OpenMode mode)
@@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 	LOGERR(("Rcl::Db::open: already open\n"));
 	return false;
     }
-    string ermsg;
+    const char *ermsg = "Unknown";
     try {
 	switch (mode) {
 	case DbUpd:
@@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 	ndb->basedir = dir;
 	return true;
     } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
     } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
     } catch (const char *s) {
 	ermsg = s;
     } catch (...) {
 	ermsg = "Caught unknown exception";
     }
     LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n", 
-	    dir.c_str(), ermsg.c_str()));
+	    dir.c_str(), ermsg));
     return false;
 }
 
@@ -148,7 +150,7 @@ bool Rcl::Db::close()
 	    ndb->iswritable));
     if (ndb->isopen == false)
 	return true;
-    string ermsg;
+    const char *ermsg = "Unknown";
     try {
 	if (ndb->iswritable == true) {
 	    ndb->wdb.flush();
@@ -159,16 +161,15 @@ bool Rcl::Db::close()
 	if (pdata)
 	    return true;
     } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
     } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
     } catch (const char *s) {
 	ermsg = s;
     } catch (...) {
 	ermsg = "Caught unknown exception";
     }
-    LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", 
-	    ermsg.c_str()));
+    LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
     return false;
 }
 
@@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
 // Callback for the document to word splitting class during indexation
 bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
 {
-    // cerr << "splitCb: term " << term << endl;
-    //string printable;
-    //transcode(term, printable, "UTF-8", "ISO-8859-1");
-    //cerr << "Adding " << printable << endl;
+#if 0
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
+    string printable;
+    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
+	LOGDEB(("                                [%s]\n", printable.c_str()));
+    }
+#endif
 
+    string ermsg; // Owned copy: what e holds is gone after the handler
     try {
-	// 1 is the value for wdfinc in index_text when called from omindex
-	// TOBEDONE: check what this is used for
+	// Note: 1 is the within document frequency increment. It would
+	// be possible to assign different weights to doc parts (ie title)
+	// by using a higher value
 	curpos = pos;
 	doc.add_posting(term, basepos + curpos, 1);
+	return true;
+    } catch (const Xapian::Error &e) {
+	ermsg = e.get_msg();
     } catch (...) {
-	LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
-	return false;
+	ermsg = "Unknown error";
     }
-    return true;
+    LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg.c_str()));
+    return false;
 }
 
 // Unaccent and lowercase data, replace \n\r with spaces
@@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
     return true;
 }
 
-/* omindex direct */
+/* From omindex direct */
 /* Truncate a string to a given maxlength, avoiding cutting off midword
  * if reasonably possible. */
 string
@@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
 
 	output += " ...";
     }
-
-    // replace newlines with spaces
-    size_t i = 0;    
-    while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
+    // No need to replace newlines with spaces, we do this in dumb_string()
     return output;
 }
 
-// Truncate longer path and uniquize with hash . The goad for this is
+// Truncate longer path and uniquize with hash . The goal for this is
 // to avoid xapian max term length limitations, not to gain space (we
 // gain very little even with very short maxlens like 30)
-#define HASHPATH
 #define PATHHASHLEN 150
 
 // Add document in internal form to the database: index the terms in
@@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 
     // Split and index file name. This supposes that it's either ascii
     // or utf-8. If this fails, we just go on. We need a config
-    // parameter for file name charset
+    // parameter for file name charset.
+    // Do we really want to fold case here ?
     if (dumb_string(fn, noacc)) {
 	splitter.text_to_words(noacc);
 	splitData.basepos += splitData.curpos + 100;
@@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
     splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
 
-    // Split body and index terms
+    // Split and index body
     if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
     splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
 
-    // Split keywords and index terms
+    // Split and index keywords
     if (!dumb_string(doc.keywords, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
     splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
 
-    // Split abstract and index terms
+    // Split and index abstract
     if (!dumb_string(doc.abstract, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 
     // Path name
     string hash;
-#ifdef HASHPATH
     pathHash(fn, hash, PATHHASHLEN);
-#else
-    hash = fn;
-#endif
     LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
-
     string pathterm  = "P" + hash;
     newdocument.add_term(pathterm);
 
-    // File path + internal path: document unique identifier for
-    // documents inside multidocument files.
+    // Internal path: with path, makes unique identifier for documents
+    // inside multidocument files.
     string uniterm;
     if (!doc.ipath.empty()) {
 	uniterm  = "Q" + hash + "|" + doc.ipath;
@@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
     string record = "url=file://" + fn;
     record += "\nmtype=" + doc.mimetype;
     record += "\nfmtime=" + doc.fmtime;
-    if (!doc.dmtime.empty())
+    if (!doc.dmtime.empty()) {
 	record += "\ndmtime=" + doc.dmtime;
+    }
     record += "\norigcharset=" + doc.origcharset;
     record += "\ncaption=" + doc.title;
     record += "\nkeywords=" + doc.keywords;
@@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	record += "\nipath=" + doc.ipath;
     }
     record += "\n";
-
     LOGDEB1(("Newdocument data: %s\n", record.c_str()));
     newdocument.set_data(record);
 
     const char *fnc = fn.c_str();
-   
     // Add db entry or update existing entry:
     try {
 	Xapian::docid did = 
@@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	}
     } catch (...) {
 	// FIXME: is this ever actually needed?
-	ndb->wdb.add_document(newdocument);
-	LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", 
-		fnc));
+	try {
+	    ndb->wdb.add_document(newdocument);
+	    LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", 
+		    fnc));
+	} catch (...) {
+	    LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
+	    return false;
+	}
     }
     return true;
 }
 
+// Test if given filename has changed since last indexed:
 bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 {
     if (pdata == 0)
@@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 
     // If no document exist with this path, we do need update
     string hash;
-#ifdef HASHPATH
     pathHash(filename, hash, PATHHASHLEN);
-#else
-    hash = filename;
-#endif
     string pathterm  = "P" + hash;
-    if (!ndb->wdb.term_exists(pathterm)) {
-	LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
-	return true;
-    }
+    const char *ermsg;
 
     // Look for all documents with this path. We need to look at all
     // to set their existence flag.  We check the update time on the
@@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
     // file changed)
     Xapian::PostingIterator doc;
     try {
+	if (!ndb->wdb.term_exists(pathterm)) {
+	    LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
+	    return true;
+	}
+
 	Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
 	for (Xapian::PostingIterator docid = docid0;
 	     docid != ndb->wdb.postlist_end(pathterm); docid++) {
@@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 	    if (*docid < ndb->updated.size())
 		ndb->updated[*docid] = true;
 	}
+	return false;
+    } catch (const Xapian::Error &e) {
+	ermsg = e.get_msg().c_str();
     } catch (...) {
-	return true;
+	ermsg= "Unknown error";
     }
-
-    return false;
+    LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
+    return true;
 }
 
+const static string stemdirstem = "stem_";
 /// Compute name of stem db for given base database and language
 static string stemdbname(const string& basename, string lang)
 {
-    string nm = path_cat(basename, string("stem_") + lang);
+    string nm = path_cat(basename, stemdirstem + lang);
     return nm;
 }
 
-// Is char non-lowercase ascii ?
+// Deciding if we try to stem the term. If it has numerals or capitals
+// we don't
 inline static bool
 p_notlowerorutf(unsigned int c)
 {
@@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
     return false;
 }
 
+/**
+ * Remove the on-disk stem database directory for language lang.
+ * Returns true only if wipedir() and then rmdir() both returned 0;
+ * false if there is no open database or if either call failed.
+ */
+bool Rcl::Db::deleteStemDb(const string& lang)
+{
+    LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
+    Native *ndb = (Native *)pdata;
+    if (ndb == 0 || ndb->isopen == false)
+	return false;
+
+    // rmdir() is only attempted once wipedir() has returned 0
+    string stemdir = stemdbname(ndb->basedir, lang);
+    const bool emptied = wipedir(stemdir) == 0;
+    return emptied && rmdir(stemdir.c_str()) == 0;
+}
+
 /**
  * Create database of stem to parents associations for a given language.
  * We walk the list of all terms, stem them, and create another Xapian db
@@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
     if (pdata == 0)
 	return false;
     Native *ndb = (Native *)pdata;
-    if (ndb->isopen == false || ndb->iswritable == false)
+    if (ndb->isopen == false)
 	return false;
 
     // First build the in-memory stem database:
@@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
 	    }
 	    assocs.insert(pair<string,string>(stem, *it));
 	}
+    } catch (const Xapian::Error &e) {
+	LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
+	return false;
     } catch (...) {
-	LOGERR(("Stem database build failed: no stemmer for %s ? \n", 
+	LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", 
 		lang.c_str()));
 	return false;
     }
 
+    // On destruction, calls wipedir() then rmdir() on dir while do_it is set
+    struct DirWiper {
+	string dir;
+	bool do_it;
+	DirWiper(string d) : dir(d), do_it(true) {}
+	~DirWiper() {
+	    if (!do_it)
+		return;
+	    wipedir(dir);
+	    rmdir(dir.c_str());
+	}
+    };
     // Create xapian database for stem relations
     string stemdbdir = stemdbname(ndb->basedir, lang);
-    string ermsg = "NOERROR";
+    // We want to get rid of the db dir in case of error. This gets disarmed
+    // just before success return.
+    DirWiper wiper(stemdbdir);
+    const char *ermsg = "NOERROR";
     Xapian::WritableDatabase sdb;
     try {
 	sdb = Xapian::WritableDatabase(stemdbdir, 
 				       Xapian::DB_CREATE_OR_OVERWRITE);
     } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
     } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
     } catch (const char *s) {
 	ermsg = s;
     } catch (...) {
@@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
     }
     if (ermsg != "NOERROR") {
 	LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n", 
-		stemdbdir.c_str(), ermsg.c_str()));
+		stemdbdir.c_str(), ermsg));
 	return false;
     }
 
@@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
     }
     LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", 
 	    assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
+    wiper.do_it = false;
     return true;
 }
 
+/** List the languages having a stem db directory under the base
+ *  directory of the open database (empty list if no db is open). */
+list<string> Rcl::Db::getStemLangs()
+{
+    list<string> dirs;
+    LOGDEB(("Rcl::Db::getStemLangs\n"));
+    Native *ndb = (Native *)pdata;
+    // basedir is set by open(): do not glob when the db is not open
+    if (ndb == 0 || ndb->isopen == false)
+	return dirs;
+    dirs = path_dirglob(ndb->basedir, stemdirstem + "*");
+    // Reduce each matched path to what follows "stem_" in its basename
+    for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++)
+	*it = path_basename(*it).substr(stemdirstem.length());
+    return dirs;
+}
+
 /**
  * This is called at the end of an indexing session, to delete the
  *  documents for files that are no longer there. We also build the
@@ -658,7 +721,11 @@ bool Rcl::Db::purge()
     // and does nothing). Maybe related to the exceptions below when
     // trying to delete an unexistant document ?
     // Flushing before trying the deletes seeems to work around the problem
-    ndb->wdb.flush();
+    try {
+	ndb->wdb.flush();
+    } catch (...) {
+	LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
+    }
     for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
 	if (!ndb->updated[docid]) {
 	    try {
@@ -669,7 +736,11 @@ bool Rcl::Db::purge()
 	    }
 	}
     }
-    ndb->wdb.flush();
+    try {
+	ndb->wdb.flush();
+    } catch (...) {
+	LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
+    }
     return true;
 }
 
@@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
 };
 
 
-//
 // Turn string into list of xapian queries. There is little
 // interpretation done on the string (no +term -term or filename:term
 // stuff). We just separate words and phrases, and interpret
@@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
     Native *ndb = (Native *)pdata;
 
     string hash;
-#ifdef HASHPATH
     pathHash(fn, hash, PATHHASHLEN);
-#else
-    hash = fn;
-#endif
     string pathterm  = "P" + hash;
-    if (!ndb->db.term_exists(pathterm)) {
-	LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", 
-		 pathterm.c_str(), pathterm.length()));
-	return false;
-    }
 
     // Look for all documents with this path, searching for the one
     // with the appropriate ipath. This is very inefficient.
+    const char *ermsg = "";
     try {
+	if (!ndb->db.term_exists(pathterm)) {
+	    LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", 
+		    pathterm.c_str(), pathterm.length()));
+	    return false;
+	}
 	for (Xapian::PostingIterator docid = 
 		 ndb->db.postlist_begin(pathterm);
 	     docid != ndb->db.postlist_end(pathterm); docid++) {
@@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
 	    if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
 		return true;
 	}
+    } catch (const Xapian::Error &e) {
+	ermsg = e.get_msg().c_str();
+    } catch (const string &s) {
+	ermsg = s.c_str();
+    } catch (const char *s) {
+	ermsg = s;
     } catch (...) {
-	return false;
+	ermsg = "Caught unknown exception";
+    }
+    if (*ermsg) {
+	LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
     }
     return false;
 }
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 5ac1c87f..6cce5c4b 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -1,6 +1,6 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 #include <list>
@@ -102,6 +102,7 @@ public:
     bool needUpdate(const string &filename, const struct stat *stp);
     bool purge();
     bool createStemDb(const string &lang);
+    bool deleteStemDb(const string &lang);
 
     // Query-related functions
 
@@ -127,6 +128,10 @@ public:
     /** Get results count for current query */
     int getResCnt();
 
+    /** Get a list of existing stemming databases */
+    std::list<std::string> getStemLangs();
+
+    /** Things we don't want to have here. */
     friend class Rcl::DbPops;
 
 private:
diff --git a/src/utils/Makefile b/src/utils/Makefile
index 8d6f8f7a..2e4e187b 100644
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@@ -15,8 +15,8 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
 	$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
 	       -DTEST_FSTREEWALK fstreewalk.cpp
 
-PATHUT_OBJS= trpathut.o pathut.o 
-trpathut : $(PATHUT_OBJS)
+PATHUT_OBJS= trpathut.o pathut.o  $(BIGLIB)
+trpathut : $(PATHUT_OBJS) 
 	$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
 trpathut.o : pathut.cpp pathut.h
 	$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
diff --git a/src/utils/pathut.cpp b/src/utils/pathut.cpp
index 51a2c265..7c93509e 100644
--- a/src/utils/pathut.cpp
+++ b/src/utils/pathut.cpp
@@ -1,15 +1,21 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #ifndef TEST_PATHUT
 #include <unistd.h>
+#include <sys/param.h>
 #include <pwd.h>
+
 #include <iostream>
+#include <list>
+#include <stack>
 
 #include "pathut.h"
 #ifndef NO_NAMESPACES
 using std::string;
+using std::list;
+using std::stack;
 #endif /* NO_NAMESPACES */
 
 void path_catslash(std::string &s) {
@@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
     return simple;
 }
 
+// Strip suff from the end of path_getsimple(s), unless it is all of it
+string path_basename(const string &s, const string &suff)
+{
+    string name = path_getsimple(s);
+    if (suff.empty() || name.length() <= suff.length())
+	return name;
+    const string::size_type cut = name.length() - suff.length();
+    if (name.compare(cut, suff.length(), suff) == 0)
+	name.erase(cut);
+    return name;
+}
+
 string path_home()
 {
     uid_t uid = getuid();
@@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
     return o;
 }
 
+#include <smallut.h>
+extern std::string path_canon(const std::string &is)
+{
+    // An empty input is handed back as is.
+    if (is.empty())
+	return is;
+    // Relative path: make it absolute by prefixing the current directory
+    string path = is;
+    if (path[0] != '/') {
+	char cwd[MAXPATHLEN];
+	if (getcwd(cwd, MAXPATHLEN) == 0)
+	    return "";
+	path = path_cat(string(cwd), path);
+    }
+
+    // Walk the elements, keeping a stack of the ones which remain:
+    // ".." drops the last kept one (if any), "." and "" are skipped
+    list<string> parts;
+    stringToTokens(path, parts, "/");
+    list<string> kept;
+    list<string>::const_iterator it;
+    for (it = parts.begin(); it != parts.end(); it++) {
+	if (*it == "..") {
+	    if (!kept.empty())
+		kept.pop_back();
+	} else if (!it->empty() && *it != ".") {
+	    kept.push_back(*it);
+	}
+    }
+
+    // Nothing left means the root directory
+    if (kept.empty())
+	return "/";
+    string canon;
+    for (it = kept.begin(); it != kept.end(); it++)
+	canon += "/" + *it;
+    return canon;
+}
+
+#include <glob.h>
+#include <sys/stat.h>
+// Paths matching glob(3) pattern in dir; empty if glob() returns non-zero
+list<std::string> path_dirglob(const std::string &dir, 
+				    const std::string pattern)
+{
+    list<string> res;
+    glob_t mglob;
+    const string mypat = path_cat(dir, pattern);
+    if (glob(mypat.c_str(), 0, 0, &mglob) != 0)
+	return res;
+    // gl_pathc is a size_t: use an unsigned index to compare against it
+    for (size_t i = 0; i < mglob.gl_pathc; i++)
+	res.push_back(mglob.gl_pathv[i]);
+    globfree(&mglob);
+    return res;
+}
+
+
 #else // TEST_PATHUT
 
 #include <iostream>
@@ -108,7 +184,7 @@ using namespace std;
 const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
 			 "/dir1/dir2",
 			"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
-			"/dir/.c",
+			"/dir/.c", "/dir/toto.txt", "toto.txt1"
 };
 
 const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
 
 int main(int argc, const char **argv)
 {
+    string s;
+    list<string>::const_iterator it;
 #if 0
-    for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
-	cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+	cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
     }
-    for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
-	cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+	cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
+    }
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+	cout << tstvec[i] << " Basename " << 
+	    path_basename(tstvec[i], ".txt") << endl;
     }
 #endif
-    string s;
 
+#if 0
     for (int i = 0; i < nttvec; i++) {
 	cout << "tildexp: '" << ttvec[i] << "' -> '" << 
 	    path_tildexpand(ttvec[i]) << "'" << endl;
     }
-    
+#endif
 
+#if 0
+    const string canontst[] = {"/dir1/../../..", "/////", "", 
+			       "/dir1/../../.././/////dir2///////",
+			       "../../", 
+			       "../../../../../../../../../../"
+    };
+    unsigned int nttvec = sizeof(canontst) / sizeof(string);
+    for (unsigned int i = 0; i < nttvec; i++) {
+	cout << "canon: '" << canontst[i] << "' -> '" << 
+	    path_canon(canontst[i]) << "'" << endl;
+    }
+#endif    
+#if 1
+    if (argc != 3) {
+	fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
+	exit(1);
+    }
+    string dir=argv[1], pattern=argv[2];
+    list<string> matched = path_dirglob(dir, pattern);
+    for (it = matched.begin(); it != matched.end();it++) {
+	cout << *it << endl;
+    }
+#endif
 
     return 0;
 }
diff --git a/src/utils/pathut.h b/src/utils/pathut.h
index 13425a05..8160238d 100644
--- a/src/utils/pathut.h
+++ b/src/utils/pathut.h
@@ -1,14 +1,19 @@
 #ifndef _PATHUT_H_INCLUDED_
 #define _PATHUT_H_INCLUDED_
-/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
+#include <list>
 
 extern void path_catslash(std::string &s);
 extern std::string path_cat(const std::string &s1, const std::string &s2);
 extern std::string path_getsimple(const std::string &s);
+extern std::string path_basename(const std::string &s, const std::string &suff="");
 extern std::string path_getfather(const std::string &s);
 extern std::string path_home();
 extern std::string path_tildexpand(const std::string &s);
 
+extern std::string path_canon(const std::string &s);
+extern std::list<std::string> path_dirglob(const std::string &dir, 
+					   const std::string pattern);
 #endif /* _PATHUT_H_INCLUDED_ */