allow independent creation / deletion of stem dbs

2006-01-09 16:53:31 +00:00 · 2006-01-09 16:53:31 +00:00 · dac569ab51
commit dac569ab51
parent c4ce5cf691
9 changed files with 364 additions and 122 deletions
--- a/src/excludefile
+++ b/src/excludefile
@ -1,29 +1,29 @@
 #*
 *.cache
 *.core
 *.o
 *~
-*.core
+.#*
-*.cache
+.#*
 #*
 .moc
 .obj
 .ui
 .#*
 CVS
 alldeps
 .#*
 autom4*
 TAGS
 alldeps
 autom4*
 config.cache
 config.log
 config.status
 excludefile
 lib/librcl.a
 makesrcdist.sh
 recollinstall
 mk/localdefs
 sysconf
 qtgui/Makefile
 qtgui/preview/Makefile
 qtgui/preview/preview.pro
 qtgui/preview/pvmain.cpp
-lib/librcl.a
+recollinstall
 sampleconf/recoll.conf
 sysconf
 wxgui
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
 #include <iostream>
 #include <list>
 #include <map>
 #include <algorithm>
 #include "pathut.h"
 #include "conftree.h"
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
    // filesystem anymore.
    db.purge();
-    // Create stemming databases
+    // Create stemming databases. We also remove those which are not
    // configured.
    string slangs;
    if (config->getConfParam("indexstemminglanguages", slangs)) {
 	list<string> langs;
 	stringToStrings(slangs, langs);
-	for (list<string>::const_iterator it = langs.begin(); 
+
-	     it != langs.end(); it++) {
+	// Get the list of existing stem dbs from the database (some may have 
 	// been manually created, we just keep those from the config
 	list<string> dblangs = db.getStemLangs();
 	list<string>::const_iterator it;
 	for (it = dblangs.begin(); it != dblangs.end(); it++) {
 	    if (find(langs.begin(), langs.end(), *it) == langs.end())
 		db.deleteStemDb(*it);
 	}
 	for (it = langs.begin(); it != langs.end(); it++) {
 	    db.createStemDb(*it);
 	}
    }
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
    return true;
 }
 // Build the stem database for one language. The indexer is initialized
 // first; if that fails nothing else is attempted and false is returned.
 bool DbIndexer::createStemDb(const string &lang)
 {
     return init() && db.createStemDb(lang);
 }
 /** 
 Index individual files, out of a full tree run. No database purging
 */
 bool DbIndexer::indexFiles(const list<string> &filenames)
 {
    if (!init())
--- a/src/index/indexer.h
+++ b/src/index/indexer.h
@ -1,6 +1,6 @@
 #ifndef _INDEXER_H_INCLUDED_
 #define _INDEXER_H_INCLUDED_
-/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
@ -24,7 +24,9 @@ class DbIndexer;
 class ConfIndexer {
 public:
    enum runStatus {IndexerOk, IndexerError};
-    ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
+    ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) 
 	{
 	}
    virtual ~ConfIndexer();
    /** Worker function: do the actual indexing */
    bool index(bool resetbefore = false);
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
    /** Index a list of files. No db cleaning or stemdb updating */
    bool indexFiles(const std::list<std::string> &files);
    /** Create stem database for given language */
    bool createStemDb(const string &lang);
    /**  Tree walker callback method */
    FsTreeWalker::Status 
 	processone(const std::string &, const struct stat *, 
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
@ -19,10 +19,12 @@ using namespace std;
 #include "pathut.h"
 // Globals for exit cleanup
 ConfIndexer *confindexer;
 DbIndexer *dbindexer;
-bool indexfiles(RclConfig *config, const list<string> &filenames)
+// Index a list of files 
 static bool indexfiles(RclConfig *config, const list<string> &filenames)
 {
    if (filenames.empty())
 	return true;
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
    return dbindexer->indexFiles(filenames);
 }
 // Build the stem database for a single language, outside of a normal
 // indexing run. Returns false if no "dbdir" is configured or if the
 // indexer fails to create the stem database.
 static bool createstemdb(RclConfig *config, const string &lang)
 {
     // Multiple databases are currently a fiction anyway, so only the
     // single configured database directory is considered.
     string dbdir;
     const bool havedir = config->getConfParam("dbdir", dbdir);
     if (havedir) {
         dbdir = path_tildexpand(dbdir);
         // Stored in the global so that exit cleanup can see it
         dbindexer = new DbIndexer(config, dbdir);
         return dbindexer->createStemDb(lang);
     }
     LOGERR(("createstemdb: no database directory in configuration\n"));
     return false;
 }
 static void cleanup()
 {
    delete confindexer;
@ -63,15 +80,19 @@ static int     op_flags;
 #define OPT_z     0x2 
 #define OPT_h     0x4 
 #define OPT_i     0x8
 #define OPT_s     0x10
 // Command line help text. It lists one synopsis per invocation mode
 // (normal index run, -i <file list>, -s <lang>), followed by the
 // option descriptions.
 // NOTE(review): -s appears in the synopsis but has no entry under
 // "Options:", and the -z entry ends with "\n\n", which leaves a blank
 // line in the middle of the option list -- confirm whether intended.
 static const char usage [] =
 "\n"
 "recollindex [-hz] \n"
 "    Normal index run\n"
 "recollindex -i <filename [filename ...]>\n"
 "    Index individual files. No db purge or stem database updates\n"
 "recollindex -s <lang>\n"
 "    Build stem database for language <lang>\n"
 "Options:\n"
 " -h : print this message\n"
 " -z : reset database before starting indexation\n\n"
 " -i <filename [filename ...]> : index individual files. No db purge or stem\n"
 "           database updates in this case\n"
 ;
 static void
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
 	    case 'z': op_flags |= OPT_z; break;
 	    case 'h': op_flags |= OPT_h; break;
 	    case 'i': op_flags |= OPT_i; break;
 	    case 's': op_flags |= OPT_s; break;
 	    default: Usage(); break;
 	    }
    b1: argc--; argv++;
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
    string reason;
    RclConfig *config = recollinit(cleanup, sigcleanup, reason);
    if (config == 0 || !config->ok()) {
 	cerr << "Configuration problem: " << reason << endl;
 	exit(1);
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
 	    }
 	}
 	exit(!indexfiles(config, filenames));
    } else if (op_flags & OPT_s) {
 	if (argc != 1) 
 	    Usage();
 	string lang = *argv++; argc--;
 	exit(!createstemdb(config, lang));
    } else {
 	confindexer = new ConfIndexer(config);
 	bool rezero(op_flags & OPT_z);
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -23,6 +23,7 @@ using namespace std;
 #include "smallut.h"
 #include "pathhash.h"
 #include "utf8iter.h"
 #include "wipedir.h"
 #include "xapian.h"
 #include <xapian/stem.h>
@ -67,23 +68,24 @@ Rcl::Db::~Db()
 	    ndb->iswritable));
    if (ndb->isopen == false)
 	return;
-    string ermsg;
+    const char *ermsg = "Unknown error";
    try {
 	LOGDEB(("Rcl::Db::~Db: closing native database\n"));
-	if (ndb->iswritable == true)
+	if (ndb->iswritable == true) {
 	    ndb->wdb.flush();
 	}
 	delete ndb;
 	return;
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
 	ermsg = "Caught unknown exception";
    }
-    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
+    LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
 }
 bool Rcl::Db::open(const string& dir, OpenMode mode)
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 	LOGERR(("Rcl::Db::open: already open\n"));
 	return false;
    }
-    string ermsg;
+    const char *ermsg = "Unknown";
    try {
 	switch (mode) {
 	case DbUpd:
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 	ndb->basedir = dir;
 	return true;
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
 	ermsg = "Caught unknown exception";
    }
    LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n", 
-	    dir.c_str(), ermsg.c_str()));
+	    dir.c_str(), ermsg));
    return false;
 }
@ -148,7 +150,7 @@ bool Rcl::Db::close()
 	    ndb->iswritable));
    if (ndb->isopen == false)
 	return true;
-    string ermsg;
+    const char *ermsg = "Unknown";
    try {
 	if (ndb->iswritable == true) {
 	    ndb->wdb.flush();
@ -159,16 +161,15 @@ bool Rcl::Db::close()
 	if (pdata)
 	    return true;
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
 	ermsg = "Caught unknown exception";
    }
-    LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", 
+    LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
 	    ermsg.c_str()));
    return false;
 }
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
 // Callback for the document to word splitting class during indexation
 bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
 {
-    // cerr << "splitCb: term " << term << endl;
+#if 0
-    //string printable;
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
-    //transcode(term, printable, "UTF-8", "ISO-8859-1");
+    string printable;
-    //cerr << "Adding " << printable << endl;
+    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
 	LOGDEB(("                                [%s]\n", printable.c_str()));
    }
 #endif
    const char *ermsg;
    try {
-	// 1 is the value for wdfinc in index_text when called from omindex
+	// Note: 1 is the within document frequency increment. It would 
-	// TOBEDONE: check what this is used for
+	// be possible to assign different weigths to doc parts (ie title)
 	// by using a higher value
 	curpos = pos;
 	doc.add_posting(term, basepos + curpos, 1);
    } catch (...) {
 	LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
 	return false;
    }
 	return true;
    } catch (const Xapian::Error &e) {
 	ermsg = e.get_msg().c_str();
    } catch (...) {
 	ermsg= "Unknown error";
    }
    LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
    return false;
 }
 // Unaccent and lowercase data, replace \n\r with spaces
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
    return true;
 }
-/* omindex direct */
+/* From omindex direct */
 /* Truncate a string to a given maxlength, avoiding cutting off midword
 * if reasonably possible. */
 string
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
 	output += " ...";
    }
-
+    // No need to replace newlines with spaces, we do this in dumb_string()
    // replace newlines with spaces
    size_t i = 0;    
    while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
    return output;
 }
-// Truncate longer path and uniquize with hash . The goad for this is
+// Truncate longer path and uniquize with hash . The goal for this is
 // to avoid xapian max term length limitations, not to gain space (we
 // gain very little even with very short maxlens like 30)
 #define HASHPATH
 #define PATHHASHLEN 150
 // Add document in internal form to the database: index the terms in
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    // Split and index file name. This supposes that it's either ascii
    // or utf-8. If this fails, we just go on. We need a config
-    // parameter for file name charset
+    // parameter for file name charset.
    // Do we really want to fold case here ?
    if (dumb_string(fn, noacc)) {
 	splitter.text_to_words(noacc);
 	splitData.basepos += splitData.curpos + 100;
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;
-    // Split body and index terms
+    // Split and index body
    if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;
-    // Split keywords and index terms
+    // Split and index keywords
    if (!dumb_string(doc.keywords, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;
-    // Split abstract and index terms
+    // Split and index abstract
    if (!dumb_string(doc.abstract, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
 	return false;
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    // Path name
    string hash;
 #ifdef HASHPATH
    pathHash(fn, hash, PATHHASHLEN);
 #else
    hash = fn;
 #endif
    LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
    string pathterm  = "P" + hash;
    newdocument.add_term(pathterm);
-    // File path + internal path: document unique identifier for
+    // Internal path: with path, makes unique identifier for documents
-    // documents inside multidocument files.
+    // inside multidocument files.
    string uniterm;
    if (!doc.ipath.empty()) {
 	uniterm  = "Q" + hash + "|" + doc.ipath;
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    string record = "url=file://" + fn;
    record += "\nmtype=" + doc.mimetype;
    record += "\nfmtime=" + doc.fmtime;
-    if (!doc.dmtime.empty())
+    if (!doc.dmtime.empty()) {
 	record += "\ndmtime=" + doc.dmtime;
    }
    record += "\norigcharset=" + doc.origcharset;
    record += "\ncaption=" + doc.title;
    record += "\nkeywords=" + doc.keywords;
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	record += "\nipath=" + doc.ipath;
    }
    record += "\n";
    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
    newdocument.set_data(record);
    const char *fnc = fn.c_str();
    // Add db entry or update existing entry:
    try {
 	Xapian::docid did = 
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	}
    } catch (...) {
 	// FIXME: is this ever actually needed?
 	try {
 	    ndb->wdb.add_document(newdocument);
 	    LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", 
 		    fnc));
 	} catch (...) {
 	    LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
 	    return false;
 	}
    }
    return true;
 }
 // Test if given filename has changed since last indexed:
 bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 {
    if (pdata == 0)
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
    // If no document exist with this path, we do need update
    string hash;
 #ifdef HASHPATH
    pathHash(filename, hash, PATHHASHLEN);
 #else
    hash = filename;
 #endif
    string pathterm  = "P" + hash;
-    if (!ndb->wdb.term_exists(pathterm)) {
+    const char *ermsg;
 	LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
 	return true;
    }
    // Look for all documents with this path. We need to look at all
    // to set their existence flag.  We check the update time on the
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
    // file changed)
    Xapian::PostingIterator doc;
    try {
 	if (!ndb->wdb.term_exists(pathterm)) {
 	    LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
 	    return true;
 	}
 	Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
 	for (Xapian::PostingIterator docid = docid0;
 	     docid != ndb->wdb.postlist_end(pathterm); docid++) {
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 	    if (*docid < ndb->updated.size())
 		ndb->updated[*docid] = true;
 	}
 	return false;
    } catch (const Xapian::Error &e) {
 	ermsg = e.get_msg().c_str();
    } catch (...) {
 	ermsg= "Unknown error";
    }
    LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
    return true;
 }
-    return false;
+const static string stemdirstem = "stem_";
 }
 /// Compute name of stem db for given base database and language
 static string stemdbname(const string& basename, string lang)
 {
-    string nm = path_cat(basename, string("stem_") + lang);
+    string nm = path_cat(basename, stemdirstem + lang);
    return nm;
 }
-// Is char non-lowercase ascii ?
+// Deciding if we try to stem the term. If it has numerals or capitals
 // we don't
 inline static bool
 p_notlowerorutf(unsigned int c)
 {
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
    return false;
 }
 /**
 * Delete stem db for given language
 */
 bool Rcl::Db::deleteStemDb(const string& lang)
 {
    LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
    if (ndb->isopen == false)
 	return false;
    string dir = stemdbname(ndb->basedir, lang);
    if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
 	return true;
    return false;
 }
 /**
 * Create database of stem to parents associations for a given language.
 * We walk the list of all terms, stem them, and create another Xapian db
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
-    if (ndb->isopen == false || ndb->iswritable == false)
+    if (ndb->isopen == false)
 	return false;
    // First build the in-memory stem database:
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
 	    }
 	    assocs.insert(pair<string,string>(stem, *it));
 	}
    } catch (const Xapian::Error &e) {
 	LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
 	return false;
    } catch (...) {
-	LOGERR(("Stem database build failed: no stemmer for %s ? \n", 
+	LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", 
 		lang.c_str()));
 	return false;
    }
    // Cleanup helper: when an instance goes out of scope, the directory
    // is emptied and removed unless do_it has been set to false first.
    class DirWiper {
    public:
        string dir;
        bool do_it;
        DirWiper(string d) : dir(d), do_it(true) {}
        ~DirWiper() {
            if (!do_it)
                return;
            wipedir(dir);
            rmdir(dir.c_str());
        }
    };
    // Create xapian database for stem relations
    string stemdbdir = stemdbname(ndb->basedir, lang);
-    string ermsg = "NOERROR";
+    // We want to get rid of the db dir in case of error. This gets disarmed
    // just before success return.
    DirWiper wiper(stemdbdir);
    const char *ermsg = "NOERROR";
    Xapian::WritableDatabase sdb;
    try {
 	sdb = Xapian::WritableDatabase(stemdbdir, 
 				       Xapian::DB_CREATE_OR_OVERWRITE);
    } catch (const Xapian::Error &e) {
-	ermsg = e.get_msg();
+	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
-	ermsg = s;
+	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
    }
    if (ermsg != "NOERROR") {
 	LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n", 
-		stemdbdir.c_str(), ermsg.c_str()));
+		stemdbdir.c_str(), ermsg));
 	return false;
    }
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
    }
    LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", 
 	    assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
    wiper.do_it = false;
    return true;
 }
 // List the languages for which a stem database entry exists under the
 // base database directory: each path matching "<stemdirstem>*" is
 // reduced to its last element, and the stemdirstem prefix is stripped.
 // Returns an empty list if there is no native database data.
 list<string> Rcl::Db::getStemLangs()
 {
     LOGDEB(("Rcl::Db::getStemLang\n"));
     list<string> langs;
     if (pdata == 0)
         return langs;
     Native *ndb = (Native *)pdata;
     const list<string> found = path_dirglob(ndb->basedir, stemdirstem + "*");
     for (list<string>::const_iterator entry = found.begin();
          entry != found.end(); ++entry) {
         langs.push_back(path_basename(*entry).substr(stemdirstem.length()));
     }
     return langs;
 }
 /**
 * This is called at the end of an indexing session, to delete the
 *  documents for files that are no longer there. We also build the
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
    // and does nothing). Maybe related to the exceptions below when
    // trying to delete a nonexistent document ?
    // Flushing before trying the deletes seems to work around the problem
    try {
 	ndb->wdb.flush();
    } catch (...) {
 	LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
    }
    for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
 	if (!ndb->updated[docid]) {
 	    try {
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
 	    }
 	}
    }
    try {
 	ndb->wdb.flush();
    } catch (...) {
 	LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
    }
    return true;
 }
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
 };
 //
 // Turn string into list of xapian queries. There is little
 // interpretation done on the string (no +term -term or filename:term
 // stuff). We just separate words and phrases, and interpret
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
    Native *ndb = (Native *)pdata;
    string hash;
 #ifdef HASHPATH
    pathHash(fn, hash, PATHHASHLEN);
 #else
    hash = fn;
 #endif
    string pathterm  = "P" + hash;
    // Look for all documents with this path, searching for the one
    // with the appropriate ipath. This is very inefficient.
    const char *ermsg = "";
    try {
 	if (!ndb->db.term_exists(pathterm)) {
 	    LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", 
 		    pathterm.c_str(), pathterm.length()));
 	    return false;
 	}
    // Look for all documents with this path, searching for the one
    // with the appropriate ipath. This is very inefficient.
    try {
 	for (Xapian::PostingIterator docid = 
 		 ndb->db.postlist_begin(pathterm);
 	     docid != ndb->db.postlist_end(pathterm); docid++) {
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
 	    if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
 		return true;
 	}
    } catch (const Xapian::Error &e) {
 	ermsg = e.get_msg().c_str();
    } catch (const string &s) {
 	ermsg = s.c_str();
    } catch (const char *s) {
 	ermsg = s;
    } catch (...) {
-	return false;
+	ermsg = "Caught unknown exception";
    }
    if (*ermsg) {
 	LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
    }
    return false;
 }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -1,6 +1,6 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
@ -102,6 +102,7 @@ public:
    bool needUpdate(const string &filename, const struct stat *stp);
    bool purge();
    bool createStemDb(const string &lang);
    bool deleteStemDb(const string &lang);
    // Query-related functions
@ -127,6 +128,10 @@ public:
    /** Get results count for current query */
    int getResCnt();
    /** Get a list of existing stemming databases */
    std::list<std::string> getStemLangs();
    /** Things we don't want to have here. */
    friend class Rcl::DbPops;
 private:
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@ -15,7 +15,7 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
 	$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
 	       -DTEST_FSTREEWALK fstreewalk.cpp
-PATHUT_OBJS= trpathut.o pathut.o 
+PATHUT_OBJS= trpathut.o pathut.o  $(BIGLIB)
 trpathut : $(PATHUT_OBJS) 
 	$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
 trpathut.o : pathut.cpp pathut.h
--- a/src/utils/pathut.cpp
+++ b/src/utils/pathut.cpp
@ -1,15 +1,21 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_PATHUT
 #include <unistd.h>
 #include <sys/param.h>
 #include <pwd.h>
 #include <iostream>
 #include <list>
 #include <stack>
 #include "pathut.h"
 #ifndef NO_NAMESPACES
 using std::string;
 using std::list;
 using std::stack;
 #endif /* NO_NAMESPACES */
 void path_catslash(std::string &s) {
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
    return simple;
 }
 // Return the last element of path s, with the trailing suff removed
 // when suff is non-empty, the element ends with it, and the element is
 // strictly longer than suff.
 string path_basename(const string &s, const string &suff)
 {
     string simple = path_getsimple(s);
     if (suff.empty() || simple.length() <= suff.length())
         return simple;
     // Offset at which the suffix has to start to be a true suffix
     const string::size_type cut = simple.length() - suff.length();
     if (simple.compare(cut, suff.length(), suff) == 0)
         return simple.substr(0, cut);
     return simple;
 }
 string path_home()
 {
    uid_t uid = getuid();
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
    return o;
 }
 #include <smallut.h>
 extern std::string path_canon(const std::string &is)
 {
    if (is.length() == 0)
 	return is;
    string s = is;
    if (s[0] != '/') {
 	char buf[MAXPATHLEN];
 	if (!getcwd(buf, MAXPATHLEN)) {
 	    return "";
 	}
 	s = path_cat(string(buf), s); 
    }
    list<string>elems;
    stringToTokens(s, elems, "/");
    list<string> cleaned;
    for (list<string>::const_iterator it = elems.begin(); 
 	 it != elems.end(); it++){
 	if (*it == "..") {
 	    if (!cleaned.empty())
 		cleaned.pop_back();
 	} else if (it->empty() || *it == ".") {
 	} else {
 	    cleaned.push_back(*it);
 	}
    }
    string ret;
    if (!cleaned.empty()) {
 	for (list<string>::const_iterator it = cleaned.begin(); 
 	     it != cleaned.end(); it++) {
 	    ret += "/";
 	    ret += *it;
 	}
    } else {
 	ret = "/";
    }
    return ret;
 }
 #include <glob.h>
 #include <sys/stat.h>
 /**
 * Expand a glob pattern inside a directory.
 * @param dir directory under which to match
 * @param pattern glob(3) pattern; it is joined to dir with path_cat()
 * @return the paths returned by glob() for dir/pattern, or an empty
 *   list if nothing matched or glob() reported an error.
 * Note: despite the name, no check is made that matches are directories.
 */
 list<std::string> path_dirglob(const std::string &dir, 
 				    const std::string pattern)
 {
     list<string> res;
     // Value-initialized so that globfree() is safe whatever glob() did
     glob_t mglob = glob_t();
     string mypat = path_cat(dir, pattern);
     if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
         // gl_pathc is unsigned: use a matching index type
         for (size_t i = 0; i < mglob.gl_pathc; i++) {
             res.push_back(mglob.gl_pathv[i]);
         }
     }
     // Always release: glob() may have allocated partial results even
     // when it returns an error.
     globfree(&mglob);
     return res;
 }
 #else // TEST_PATHUT
 #include <iostream>
@ -108,7 +184,7 @@ using namespace std;
 const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
 			 "/dir1/dir2",
 			"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
-			"/dir/.c",
+			"/dir/.c", "/dir/toto.txt", "toto.txt1"
 };
 const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
 int main(int argc, const char **argv)
 {
    string s;
    list<string>::const_iterator it;
 #if 0
-    for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
-	cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
+	cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
    }
-    for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
+    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
-	cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
+	cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
    }
    for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
 	cout << tstvec[i] << " Basename " << 
 	    path_basename(tstvec[i], ".txt") << endl;
    }
 #endif
    string s;
 #if 0
    for (int i = 0; i < nttvec; i++) {
 	cout << "tildexp: '" << ttvec[i] << "' -> '" << 
 	    path_tildexpand(ttvec[i]) << "'" << endl;
    }
 #endif
-
+#if 0
    const string canontst[] = {"/dir1/../../..", "/////", "", 
 			       "/dir1/../../.././/////dir2///////",
 			       "../../", 
 			       "../../../../../../../../../../"
    };
    unsigned int nttvec = sizeof(canontst) / sizeof(string);
    for (unsigned int i = 0; i < nttvec; i++) {
 	cout << "canon: '" << canontst[i] << "' -> '" << 
 	    path_canon(canontst[i]) << "'" << endl;
    }
 #endif    
 #if 1
    if (argc != 3) {
 	fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
 	exit(1);
    }
    string dir=argv[1], pattern=argv[2];
    list<string> matched = path_dirglob(dir, pattern);
    for (it = matched.begin(); it != matched.end();it++) {
 	cout << *it << endl;
    }
 #endif
    return 0;
 }
--- a/src/utils/pathut.h
+++ b/src/utils/pathut.h
@ -1,14 +1,19 @@
 #ifndef _PATHUT_H_INCLUDED_
 #define _PATHUT_H_INCLUDED_
-/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
 extern void path_catslash(std::string &s);
 extern std::string path_cat(const std::string &s1, const std::string &s2);
 extern std::string path_getsimple(const std::string &s);
 extern std::string path_basename(const std::string &s, const std::string &suff="");
 extern std::string path_getfather(const std::string &s);
 extern std::string path_home();
 extern std::string path_tildexpand(const std::string &s);
 extern std::string path_canon(const std::string &s);
 extern std::list<std::string> path_dirglob(const std::string &dir, 
 					   const std::string pattern);
 #endif /* _PATHUT_H_INCLUDED_ */