begin i/f cleanup: opacify doc uptodate sig (size+mtime)
parent 23163f1b4f
commit ca4a4e65b0
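
The change makes the up-to-date test opaque to the index layer: the indexer now builds a signature string from the file's size and mtime, and Db::needUpdate() simply compares it with the signature stored at indexing time, instead of receiving a struct stat and interpreting dates and sizes itself. The following is a minimal standalone sketch of that idea under the same size+mtime scheme; the helper names makeSig and isUpToDate are illustrative, not Recoll APIs.

// Standalone sketch (not Recoll code) of the opaque signature scheme:
// size and mtime are printed back to back; only equality matters, so the
// format does not need to be parseable back into its parts.
#include <stdio.h>
#include <sys/stat.h>
#include <string>

static std::string makeSig(const struct stat &st)
{
    char cbuf[100];
    snprintf(cbuf, sizeof(cbuf), "%ld%ld",
             (long)st.st_size, (long)st.st_mtime);
    return std::string(cbuf);
}

// The index-side check reduces to comparing the caller-provided signature
// with whatever string was stored with the document.
static bool isUpToDate(const std::string &storedSig, const struct stat &st)
{
    return !storedSig.empty() && storedSig == makeSig(st);
}

int main(int argc, char **argv)
{
    if (argc < 2)
        return 1;
    struct stat st;
    if (stat(argv[1], &st) != 0)
        return 1;
    std::string stored;            // nothing stored yet: needs update
    printf("needs update: %s\n", isUpToDate(stored, st) ? "no" : "yes");
    stored = makeSig(st);          // what indexing would store
    printf("needs update: %s\n", isUpToDate(stored, st) ? "no" : "yes");
    return 0;
}
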
indexer.cpp
@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.65 2007-12-20 09:08:04 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.66 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 * This program is free software; you can redistribute it and/or modify
@@ -386,7 +386,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// from on to off it may happen that some files which are now
// without mime type will not be purged from the db, resulting
// in possible 'cannot intern file' messages at query time...
if (!m_db.needUpdate(fn, stp)) {
char cbuf[100];
// Document signature
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
string sig = cbuf;
if (!m_db.needUpdate(fn, sig)) {
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
if (m_updater) {
m_updater->status.fn = fn;
@@ -465,8 +469,17 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
if (doc.utf8fn.empty())
doc.utf8fn = utf8fn;

char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
doc.fbytes = cbuf;
// Document signature for up to date checks: concatenate mtime and
// size. Note: looking for changes only, no need to parseback so no
// need for reversible formatting
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
doc.sig = cbuf;

// Add document to database
if (!m_db.add(fn, doc, stp))
if (!m_db.add(fn, doc))
return FsTreeWalker::FtwError;

// Tell what we are doing and check for interrupt request
@@ -491,7 +504,15 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
fileDoc.fmtime = ascdate;
fileDoc.utf8fn = utf8fn;
fileDoc.mimetype = interner.getMimetype();
if (!m_db.add(fn, fileDoc, stp))

char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
fileDoc.fbytes = cbuf;
// Document signature for up to date checks.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
fileDoc.sig = cbuf;
if (!m_db.add(fn, fileDoc))
return FsTreeWalker::FtwError;
}

uiprefs_w.cpp
@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.24 2008-05-05 20:24:55 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.25 2008-07-28 08:42:52 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
 * This program is free software; you can redistribute it and/or modify
@@ -56,6 +56,7 @@ static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.24 2008-05-05 20:24:55 dockes E
#include "recoll.h"
#include "guiutils.h"
#include "rcldb.h"
#include "rclconfig.h"
#include "pathut.h"
#include "uiprefs_w.h"
#include "viewaction_w.h"
@@ -363,9 +364,7 @@ void UIPrefsDialog::addExtraDbPB_clicked()
}
struct stat st1, st2;
stat(dbdir.c_str(), &st1);
string rcldbdir;
if (rcldb)
rcldbdir = rcldb->getDbDir();
string rcldbdir = RclConfig::getMainConfig()->getDbDir();
stat(rcldbdir.c_str(), &st2);
path_catslash(rcldbdir);
fprintf(stderr, "rcldbdir: [%s]\n", rcldbdir.c_str());

rcldb.cpp
@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.134 2008-07-01 11:51:51 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.135 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 * This program is free software; you can redistribute it and/or modify
@@ -20,7 +20,6 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.134 2008-07-01 11:51:51 dockes Exp
#include <stdio.h>
#include <cstring>
#include <unistd.h>
#include <sys/stat.h>
#include <fnmatch.h>
#include <regex.h>
#include <math.h>
@@ -59,76 +58,82 @@ using namespace std;
#define MIN(A,B) (A<B?A:B)
#endif

// Omega compatible values. We leave a hole for future omega values. Not sure
// it makes any sense to keep any level of omega compat given that the index
// is incompatible anyway.
enum value_slot {
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size
};

// This is the word position offset at which we index the body text
// (abstract, keywords, etc.. are stored before this)
static const unsigned int baseTextPosition = 100000;

#undef MTIME_IN_VALUE
#ifdef MTIME_IN_VALUE
// Omega compatible values
#define enum value_slot {
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1 // 16 byte MD5 checksum of original document.
};
#endif

#ifndef NO_NAMESPACES
namespace Rcl {
#endif

// Max length for path terms stored for each document. Truncate
// longer path and uniquize with hash. The goal for this is to avoid
// xapian max term length limitations, not to gain space (we gain very
// little even with very short maxlens like 30)
// Note that Q terms add the ipath to this, and that the xapian max
// key length seems to be around 250

// Synthetic abstract marker (to discriminate from abstract actually
// found in document)
const static string rclSyntAbs("?!#@");

// Maximum length for path terms stored for each document. We truncate
// longer paths and uniquize them by appending a hashed value. This
// is done to avoid xapian max term length limitations, not
// to gain space (we gain very little even with very short maxlens
// like 30) Note that Q terms add the ipath to this, and that the
// xapian max key length seems to be around 250.
// The value for PATHHASHLEN includes the length of the hash part.
#define PATHHASHLEN 150

// Synthetic abstract marker (to discriminate from abstract actually
// found in doc)
const static string rclSyntAbs = "?!#@";
const static string emptystring;
// Compute the unique term used to link documents to their file-system source:
// Hashed path + possible internal path
static inline string make_uniterm(const string& fn, const string& ipath)
{
string hash;
pathHash(fn, hash, PATHHASHLEN);
string s("Q");
s.append(hash);
s.append("|");
s.append(ipath);
return s;
}

/* See comment in class declaration */
bool Db::Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
/* See comment in class declaration: return all subdocuments of a
 * document given by its unique path id */
bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
{
docids.clear();
string qterm = "Q"+ hash + "|";
string ermsg;

string ermsg;
for (int tries = 0; tries < 2; tries++) {
try {
Xapian::TermIterator it = db.allterms_begin();
it.skip_to(qterm);
for (;it != db.allterms_end(); it++) {
// If current term does not begin with qterm or has
it.skip_to(uniterm);
// Don't return the doc itself:
it++;
for (; it != db.allterms_end(); it++) {
LOGDEB2(("Testing [%s]\n", (*it).c_str()));
// If current term does not begin with uniterm or has
// another |, not the same file
if ((*it).find(qterm) != 0 ||
(*it).find_last_of("|") != qterm.length() -1)
if ((*it).find(uniterm) != 0 ||
(*it).find_last_of("|") != uniterm.length() - 1)
break;
docids.push_back(*(db.postlist_begin(*it)));
}
LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size()));
return true;
} catch (const Xapian::DatabaseModifiedError &e) {
LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
// Can't use reOpen here, it would delete *me*
// Can't use reOpen() here, I'm a Native:: method, this
// would delete my own object
db = Xapian::Database(m_db->m_basedir);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} XCATCHERROR(ermsg);
if (!ermsg.empty())
break;
} catch (const string &s) {
ermsg = s;
if (ermsg.empty())
ermsg = "Empty error message";
} catch (const char *s) {
ermsg = s ? s : string();
if (ermsg.empty())
ermsg = "Empty error message";
} catch (...) {
ermsg= "Unknown xapian error (not Xapian::Error or string)";
break;
}
}
LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
return false;
@@ -159,6 +164,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc
parms.get(string("ipath"), doc.ipath);
parms.get(string("fbytes"), doc.fbytes);
parms.get(string("dbytes"), doc.dbytes);
parms.get(string("sig"), doc.sig);
doc.xdocid = docid;
return true;
}
@@ -544,11 +550,6 @@ bool Db::open(const string& dir, const string &stops, OpenMode mode,
return false;
}

string Db::getDbDir()
{
return m_basedir;
}

// Note: xapian has no close call, we delete and recreate the db
bool Db::close()
{
@@ -811,7 +812,7 @@ static const int MB = 1024 * 1024;
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
// metadata), and update database
bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
bool Db::add(const string &fn, const Doc &idoc)
{
LOGDEB1(("Db::add: fn %s\n", fn.c_str()));
if (m_ndb == 0)
@@ -899,7 +900,7 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
}
splitData.setprefix(pfx); // Subject
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.setprefix(string());
splitData.basepos += splitData.curpos + 100;
}
}
@@ -934,31 +935,9 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
newdocument.add_term(noacc);
}

// Pathname/ipath terms. This is used for file existence/uptodate
// checks, and unique id for the replace_document() call

// Truncate the filepath part to a reasonable length and
// replace the truncated part with a hopefully unique hash
string hash;
pathHash(fn, hash, PATHHASHLEN);
LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));

// Unique term: makes unique identifier for documents
// either path or path+ipath inside multidocument files.
// We only add a path term if ipath is empty. Else there will be a qterm
// (path+ipath), and a pseudo-doc will be created to stand for the file
// itself (for up to date checks). This is handled by
// DbIndexer::processone()
string uniterm;
if (doc.ipath.empty()) {
uniterm = "P" + hash;
#ifdef MTIME_IN_VALUE
#error need to fix fmtime to be stored as omega does it (bin net order str)
newdocument.add_value(VALUE_LASTMOD, doc.fmtime);
#endif
} else {
uniterm = "Q" + hash + "|" + doc.ipath;
}
// Pathname/ipath unique term: this is used for file existence/uptodate
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(fn, doc.ipath);
newdocument.add_term(uniterm);

// Dates etc...
@@ -985,14 +964,18 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
record += "\ndmtime=" + doc.dmtime;
}
record += "\norigcharset=" + doc.origcharset;
char sizebuf[20];
sizebuf[0] = 0;
if (stp)
sprintf(sizebuf, "%ld", (long)stp->st_size);
if (sizebuf[0])
record += string("\nfbytes=") + sizebuf;

if (!doc.fbytes.empty())
record += string("\nfbytes=") + doc.fbytes;
// Note that we add the signature both as a value and in the data record
if (!doc.sig.empty())
record += string("\nsig=") + doc.sig;
newdocument.add_value(VALUE_SIG, doc.sig);

char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
record += string("\ndbytes=") + sizebuf;

if (!doc.ipath.empty()) {
record += "\nipath=" + doc.ipath;
}
@@ -1062,71 +1045,58 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
}

// Test if given filename has changed since last indexed:
bool Db::needUpdate(const string &filename, const struct stat *stp)
bool Db::needUpdate(const string &filename, const string& sig)
{
// Chrono chron;
if (m_ndb == 0)
return false;

string hash;
pathHash(filename, hash, PATHHASHLEN);
string pterm = "P" + hash;
string uniterm = make_uniterm(filename, string());
string ermsg;

// We look up the document indexed by the Pterm. This is either
// We look up the document indexed by the uniterm. This is either
// the actual document file, or, for a multi-document file, the
// pseudo-doc we create to stand for the file itself.

// We try twice in case database needs to be reopened.
for (int tries = 0; tries < 2; tries++) {
try {
// Get the Pterm doc or pseudo-doc
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pterm);
if (docid == m_ndb->db.postlist_end(pterm)) {
// Get the doc or pseudo-doc
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(uniterm);
if (docid == m_ndb->db.postlist_end(uniterm)) {
// If no document exist with this path, we do need update
LOGDEB2(("Db::needUpdate: no path: [%s]\n", pterm.c_str()));
LOGDEB(("Db::needUpdate: no path: [%s]\n", uniterm.c_str()));
return true;
}
Xapian::Document doc = m_ndb->db.get_document(*docid);

// Retrieve file modification time from db stored value
#ifdef MTIME_IN_VALUE
// This is slightly faster, but we'd need to setup a conversion
// for old dbs, and it's not really worth it
string value = doc.get_value(VALUE_LASTMOD);
#error fixme make storage format compatible with omega
const char *cp = value.c_str();
#else
// Retrieve old file/doc signature from value
string osig = doc.get_value(VALUE_SIG);
#if 0
// Get old sig from data record
string data = doc.get_data();
const char *cp = strstr(data.c_str(), "fmtime=");
if (cp) {
cp += 7;
} else {
cp = strstr(data.c_str(), "mtime=");
if (cp)
cp+= 6;
}
string::size_type i1, i2;
i1 = data.find("sig=");
if (i1 == string::npos)
return true;
i1 += 4;
if (i1 >= data.length())
return true;
i2 = data.find_first_of("\n\r", i1);
if (i2 == string::npos)
return true;
string osig = data.substr(i1, i2-i1);
#endif
// If the time string begins with a "+", force an update. Happens
// after a filter error, see indexer.cpp, processone()
time_t mtime = (!cp || *cp == '+') ? 0 : atoll(cp);

// Retrieve file size as stored in db data
cp = strstr(data.c_str(), "fbytes=");
if (cp)
cp += 7;
off_t fbytes = cp ? atoll(cp) : 0;

// Compare db time and size data to filesystem's
if (mtime != stp->st_mtime || fbytes != stp->st_size) {
LOGDEB2(("Db::needUpdate:yes: mtime: D %ld F %ld."
"sz D %ld F %ld\n", long(mtime), long(stp->st_mtime),
long(fbytes), long(stp->st_size)));
LOGDEB(("Db::needUpdate: oldsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
// Compare new/old sig
if (sig != osig) {
LOGDEB(("Db::needUpdate:yes: olsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
// Db is not up to date. Let's index the file
return true;
}

LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str()));
LOGDEB(("Db::needUpdate: uptodate: [%s]\n", uniterm.c_str()));

// Up to date.

@@ -1135,7 +1105,7 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)

// Set the existence flag for all the subdocs (if any)
vector<Xapian::docid> docids;
if (!m_ndb->subDocs(hash, docids)) {
if (!m_ndb->subDocs(uniterm, docids)) {
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
return true;
}
@@ -1146,12 +1116,13 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
updated[*it] = true;
}
}
// LOGDEB(("Db::needUpdate: used %d mS\n", chron.millis()));
return false;
} catch (const Xapian::DatabaseModifiedError &e) {
LOGDEB(("Db::needUpdate: got modified error. reopen/retry\n"));
reOpen();
} XCATCHERROR(ermsg);
if (!ermsg.empty())
break;
}
LOGERR(("Db::needUpdate: error while checking existence: %s\n",
ermsg.c_str()));
@@ -1258,22 +1229,20 @@ bool Db::purgeFile(const string &fn)
if (m_ndb == 0)
return false;
Xapian::WritableDatabase db = m_ndb->wdb;
string hash;
pathHash(fn, hash, PATHHASHLEN);
string pterm = "P" + hash;
string uniterm = make_uniterm(fn, string());
string ermsg;
try {
Xapian::PostingIterator docid = db.postlist_begin(pterm);
if (docid == db.postlist_end(pterm))
Xapian::PostingIterator docid = db.postlist_begin(uniterm);
if (docid == db.postlist_end(uniterm))
return true;
LOGDEB(("purgeFile: delete docid %d\n", *docid));
db.delete_document(*docid);
vector<Xapian::docid> docids;
m_ndb->subDocs(hash, docids);
m_ndb->subDocs(uniterm, docids);
LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
for (vector<Xapian::docid>::iterator it = docids.begin();
it != docids.end(); it++) {
LOGDEB2(("Db::purgeFile: delete subdoc %d\n", *it));
LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it));
db.delete_document(*it);
}
return true;
@@ -1573,22 +1542,20 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
if (*pc)
*pc = 100;

string hash;
pathHash(fn, hash, PATHHASHLEN);
string pqterm = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath;
string uniterm = make_uniterm(fn, ipath);
string ermsg;
try {
if (!m_ndb->db.term_exists(pqterm)) {
if (!m_ndb->db.term_exists(uniterm)) {
// Document found in history no longer in the database.
// We return true (because their might be other ok docs further)
// but indicate the error with pc = -1
if (*pc)
*pc = -1;
LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n",
pqterm.c_str(), pqterm.length()));
uniterm.c_str(), uniterm.length()));
return true;
}
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm);
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(uniterm);
Xapian::Document xdoc = m_ndb->db.get_document(*docid);
string data = xdoc.get_data();
list<string> terms;

rcldb.h
@@ -16,7 +16,7 @@
 */
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.56 2008-07-01 08:28:45 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.57 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes */

#include <string>
#include <list>
@@ -86,42 +86,45 @@ class Db {
bool close();
bool isopen();

/** Retrieve main database directory */
string getDbDir();

/** Get explanation about last error */
string getReason() const {return m_reason;}

/** Return list of configured stop words */
const StopList& getStopList() const {return m_stops;}

/** Field name to prefix translation (ie: author -> 'A') */
bool fieldToPrefix(const string& fldname, string &pfx);

/** List possible stemmer names */
static list<string> getStemmerNames();

/* Update-related methods ******************************************/
/** List existing stemming databases */
std::list<std::string> getStemLangs();

/** Add document. The Doc class should have been filled as much as
possible depending on the document type */
bool add(const string &filename, const Doc &doc, const struct stat *stp);
/* The next two, only for searchdata, should be somehow hidden */
/* Return list of configured stop words */
const StopList& getStopList() const {return m_stops;}
/* Field name to prefix translation (ie: author -> 'A') */
bool fieldToPrefix(const string& fldname, string &pfx);

/* Update-related methods ******************************************/

/** Test if the db entry for the given filename/stat is up to date. This
 * has the side-effect of setting the existence flag for the file document
 * and all subdocs if any (for later use by 'purge()') */
bool needUpdate(const string &filename, const struct stat *stp);
bool needUpdate(const string &udi, const string& sig);

/** Add document. The Doc class should have been filled as much as
 * possible depending on the document type */
bool add(const string &udi, const Doc &doc);

/** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &fn);

/** Remove documents that no longer exist in the file system. This
depends on the update map, which is built during
indexation. This should only be called after a full walk of
the file system, else the update map will not be complete, and
many documents will be deleted that shouldn't */
 * depends on the update map, which is built during
 * indexation. This should only be called after a full walk of
 * the file system, else the update map will not be complete, and
 * many documents will be deleted that shouldn't, which is why this
 * has to be called externally, we can't know if the indexing
 * pass was complete or partial.
 */
bool purge();

/** Delete document(s) for given filename */
bool purgeFile(const string &filename);

/** Create stem expansion database for given language. */
bool createStemDb(const string &lang);
/** Delete stem expansion database for given language. */
@@ -146,6 +149,9 @@ class Db {
bool termMatch(MatchType typ, const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1);

/** Specific filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names);

/** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);

@@ -153,12 +159,11 @@ class Db {
 * the input query. This uses index data only (no access to the file) */
bool makeDocAbstract(Doc &doc, Query *query, string& abstract);

/** Get document for given filename and ipath */
/** Get document for given filename and ipath. Used by the 'history'
 * feature (and nothing else?) */
bool getDoc(const string &fn, const string &ipath, Doc &doc, int *percent);

/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();

/* The following are mainly for the aspell module */
/** Whole term list walking. */
TermIter *termWalkOpen();
bool termWalkNext(TermIter *, string &term);
@@ -169,9 +174,6 @@ class Db {
bool stemDiffers(const string& lang, const string& term,
const string& base);

/** Filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names);

/* This has to be public for access by embedded Query::Native */
Native *m_ndb;

rcldb_p.h
@@ -4,7 +4,7 @@
#include "xapian.h"

namespace Rcl {
/* @(#$Id: rcldb_p.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes */
/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $ (C) 2007 J.F.Dockes */

// Generic Xapian exception catching code. We do this quite often,
// and I have no idea how to do this except for a macro
@@ -60,7 +60,7 @@ class Db::Native {
 * unique term for replace_document, and for retrieving by
 * path/ipath (history)
 */
bool subDocs(const string &hash, vector<Xapian::docid>& docids);
bool subDocs(const string &uniterm, vector<Xapian::docid>& docids);

};
}

rcldoc.h
@@ -16,7 +16,7 @@
 */
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: rcldoc.h,v 1.4 2008-07-28 08:42:52 dockes Exp $ (C) 2006 J.F.Dockes */

#include <string>
#include <map>
@@ -58,9 +58,20 @@ class Doc {
// Attribute for the "abstract" entry. true if it is just the top
// of doc, not a native document attribute
bool syntabs;

string fbytes; // File size. Set by Db::Add
string dbytes; // Doc size. Set by Db::Add from text length

// File size. Index: Set by caller prior to Db::Add. Query: set by
// rcldb from index doc data. Historically this always has
// represented the whole file size (as from stat()), but there
// would be a need for a 3rd value for multidoc files (file
// size/doc size/ doc text size)
string fbytes;
// Doc text size. Index: from text.length(). Query: set by rcldb from
// index doc data.
string dbytes;
// Doc signature. Used for up to date checks. This is opaque, and
// could just as well be ctime, size, ctime+size, md5, whatever.
// Index: set by Db::Add caller. Query: set from doc data.
string sig;

// The following fields don't go to the db record

@@ -82,6 +93,7 @@ class Doc {
syntabs = false;
fbytes.erase();
dbytes.erase();
sig.erase();

text.erase();
pc = 0;

base64.h
@@ -16,10 +16,10 @@
 */
#ifndef _BASE64_H_INCLUDED_
#define _BASE64_H_INCLUDED_
/* @(#$Id: base64.h,v 1.2 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: base64.h,v 1.3 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>

void base64_encode(const std::string &in, std::string &out);
void base64_encode(const std::string& in, std::string& out);
bool base64_decode(const std::string& in, std::string& out);

#endif /* _BASE64_H_INCLUDED_ */