From 56a56500c1d8f46d213f0a04e8ad631ab5081a83 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Fri, 4 Oct 2013 10:57:11 +0200
Subject: [PATCH] Handle partial indexing of document restricted to metadata
 from extended attributes

---
 src/VERSION              |   2 +-
 src/index/fsindexer.cpp  | 250 ++++++++------
 src/index/fsindexer.h    |   4 +
 src/index/rclmonrcv.cpp  |  30 +-
 src/rcldb/rcldb.cpp      | 722 ++++++++++++++++++++++++---------------
 src/rcldb/rcldb.h        |   4 +
 src/rcldb/rcldb_p.h      |  12 +
 src/rcldb/rcldoc.h       |   7 +-
 src/sampleconf/fields    |   2 +-
 tests/config/recoll.conf |   1 +
 tests/empty/empty.txt    |   2 +-
 tests/runtests.sh        |  31 ++
 tests/xattr/fields       |   4 +
 tests/xattr/xattr.sh     |  85 +++++
 tests/xattr/xattr.txt    |  57 ++++
 15 files changed, 811 insertions(+), 402 deletions(-)
 create mode 100644 tests/xattr/fields
 create mode 100755 tests/xattr/xattr.sh
 create mode 100644 tests/xattr/xattr.txt

diff --git a/src/VERSION b/src/VERSION
index 83d5e73f..39893559 100644
--- a/src/VERSION
+++ b/src/VERSION
@@ -1 +1 @@
-1.19.5
+1.20.0
diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp
index 0a91349d..30f9b169 100644
--- a/src/index/fsindexer.cpp
+++ b/src/index/fsindexer.cpp
@@ -45,7 +45,7 @@
 #include "cancelcheck.h"
 #include "rclinit.h"
 #include "execmd.h"
-
+#include "extrameta.h"
 
 // When using extended attributes, we have to use the ctime, because
 // this is all that gets set when the attributes are modified. 
@@ -104,7 +104,7 @@ public:
 
 FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) 
     : m_config(cnf), m_db(db), m_updater(updfunc), 
-      m_missing(new FSIFIMissingStore)
+      m_missing(new FSIFIMissingStore), m_detectxattronly(false)
 #ifdef IDX_THREADS
     , m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first), 
       m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
@@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
 {
     LOGDEB1(("FsIndexer::FsIndexer\n"));
     m_havelocalfields = m_config->hasNameAnywhere("localfields");
+    m_config->getConfParam("detectxattronly", &m_detectxattronly);
 
 #ifdef IDX_THREADS
     m_stableconfig = new RclConfig(*m_config);
@@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config,
     bool existingDoc;
     bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
 
+    // If ctime (which we use for the sig) is later than mtime, then at most
+    // the extended attributes were changed, so no need to index content.
+    // This unfortunately leaves open the case where the data was
+    // modified, then the extended attributes, in which case we will
+    // miss the data update. We would have to store both the mtime and
+    // the ctime to avoid this.
+    bool xattronly = m_detectxattronly && !m_db->inFullReset() && 
+	existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
+    
     if (!needupdate) {
 	LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
 	if (m_updater) {
@@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config,
     LOGDEB0(("processone: processing: [%s] %s\n", 
              displayableBytes(stp->st_size).c_str(), fn.c_str()));
 
-    FileInterner interner(fn, stp, config, FileInterner::FIF_none);
-    if (!interner.ok()) {
-        // no indexing whatsoever in this case. This typically means that
-        // indexallfilenames is not set
-        return FsTreeWalker::FtwOk;
-    }
-    interner.setMissingStore(m_missing);
-
     string utf8fn = compute_utf8fn(config, fn);
 
     // parent_udi is initially the same as udi, it will be used if there 
@@ -662,128 +664,152 @@ FsIndexer::processonefile(RclConfig *config,
     char ascdate[30];
     sprintf(ascdate, "%ld", long(stp->st_mtime));
 
-    FileInterner::Status fis = FileInterner::FIAgain;
     bool hadNullIpath = false;
-    bool hadNonNullIpath = false;
-    while (fis == FileInterner::FIAgain) {
-	doc.erase();
-        try {
-            fis = interner.internfile(doc);
-        } catch (CancelExcept) {
-            LOGERR(("fsIndexer::processone: interrupted\n"));
-            return FsTreeWalker::FtwStop;
-        }
+    string mimetype;
 
-        // We index at least the file name even if there was an error.
-        // We'll change the signature to ensure that the indexing will
-        // be retried every time.
+    if (!xattronly) {
+	FileInterner interner(fn, stp, config, FileInterner::FIF_none);
+	if (!interner.ok()) {
+	    // no indexing whatsoever in this case. This typically means that
+	    // indexallfilenames is not set
+	    return FsTreeWalker::FtwOk;
+	}
+	mimetype = interner.getMimetype();
 
-	// Internal access path for multi-document files. If empty, this is
-	// for the main file.
-	if (doc.ipath.empty()) {
-	    hadNullIpath = true;
-	    if (hadNonNullIpath) {
-		// Note that only the filters can reliably compute
-		// this. What we do is dependant of the doc order (if
-		// we see the top doc first, we won't set the flag)
-		doc.haschildren = true;
+	interner.setMissingStore(m_missing);
+	FileInterner::Status fis = FileInterner::FIAgain;
+	bool hadNonNullIpath = false;
+	while (fis == FileInterner::FIAgain) {
+	    doc.erase();
+	    try {
+		fis = interner.internfile(doc);
+	    } catch (CancelExcept) {
+		LOGERR(("fsIndexer::processone: interrupted\n"));
+		return FsTreeWalker::FtwStop;
 	    }
-	} else {
-	    hadNonNullIpath = true;
-	    make_udi(fn, doc.ipath, udi);
-	}
 
-	// Set file name, mod time and url if not done by filter
-	if (doc.fmtime.empty())
-	    doc.fmtime = ascdate;
-        if (doc.url.empty())
-            doc.url = cstr_fileu + fn;
-	const string *fnp = 0;
-	if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
-	    doc.meta[Rcl::Doc::keyfn] = utf8fn;
+	    // We index at least the file name even if there was an error.
+	    // We'll change the signature to ensure that the indexing will
+	    // be retried every time.
 
-	char cbuf[100]; 
-	sprintf(cbuf, "%lld", (long long)stp->st_size);
-	doc.pcbytes = cbuf;
-	// Document signature for up to date checks. All subdocs inherit the
-	// file's.
-	doc.sig = sig;
-
-	// If there was an error, ensure indexing will be
-	// retried. This is for the once missing, later installed
-	// filter case. It can make indexing much slower (if there are
-	// myriads of such files, the ext script is executed for them
-	// and fails every time)
-	if (fis == FileInterner::FIError) {
-	    doc.sig += cstr_plus;
-	}
-
-        // Possibly add fields from local config
-        if (m_havelocalfields) 
-            setlocalfields(localfields, doc);
-
-	// Add document to database. If there is an ipath, add it as a children
-	// of the file document.
-#ifdef IDX_THREADS
-	if (m_haveSplitQ) {
-	    DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ? 
-					  cstr_null : parent_udi, doc);
-	    if (!m_dwqueue.put(tp)) {
-		LOGERR(("processonefile: wqueue.put failed\n"));
-		return FsTreeWalker::FtwError;
-	    } 
-	} else {
-#endif
-	    if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? 
-				   cstr_null : parent_udi, doc)) {
-		return FsTreeWalker::FtwError;
+	    // Internal access path for multi-document files. If empty, this is
+	    // for the main file.
+	    if (doc.ipath.empty()) {
+		hadNullIpath = true;
+		if (hadNonNullIpath) {
+		    // Note that only the filters can reliably compute
+		    // this. What we do is dependent on the doc order (if
+		    // we see the top doc first, we won't set the flag)
+		    doc.haschildren = true;
+		}
+	    } else {
+		hadNonNullIpath = true;
+		make_udi(fn, doc.ipath, udi);
 	    }
+
+	    // Set file name, mod time and url if not done by filter
+	    if (doc.fmtime.empty())
+		doc.fmtime = ascdate;
+	    if (doc.url.empty())
+		doc.url = cstr_fileu + fn;
+	    const string *fnp = 0;
+	    if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
+		doc.meta[Rcl::Doc::keyfn] = utf8fn;
+
+	    char cbuf[100]; 
+	    sprintf(cbuf, "%lld", (long long)stp->st_size);
+	    doc.pcbytes = cbuf;
+	    // Document signature for up to date checks. All subdocs inherit the
+	    // file's.
+	    doc.sig = sig;
+
+	    // If there was an error, ensure indexing will be
+	    // retried. This is for the once missing, later installed
+	    // filter case. It can make indexing much slower (if there are
+	    // myriads of such files, the ext script is executed for them
+	    // and fails every time)
+	    if (fis == FileInterner::FIError) {
+		doc.sig += cstr_plus;
+	    }
+
+	    // Possibly add fields from local config
+	    if (m_havelocalfields) 
+		setlocalfields(localfields, doc);
+
+	    // Add document to database. If there is an ipath, add it
+	    // as a child of the file document.
 #ifdef IDX_THREADS
-	}
+	    if (m_haveSplitQ) {
+		DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ? 
+					      cstr_null : parent_udi, doc);
+		if (!m_dwqueue.put(tp)) {
+		    LOGERR(("processonefile: wqueue.put failed\n"));
+		    return FsTreeWalker::FtwError;
+		} 
+	    } else {
+#endif
+		if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? 
+				       cstr_null : parent_udi, doc)) {
+		    return FsTreeWalker::FtwError;
+		}
+#ifdef IDX_THREADS
+	    }
 #endif
 
-	// Tell what we are doing and check for interrupt request
-	if (m_updater) {
+	    // Tell what we are doing and check for interrupt request
+	    if (m_updater) {
 #ifdef IDX_THREADS
-	    PTMutexLocker locker(m_updater->m_mutex);
+		PTMutexLocker locker(m_updater->m_mutex);
 #endif
-	    ++(m_updater->status.docsdone);
-            if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
-                m_updater->status.dbtotdocs = m_updater->status.docsdone;
-            m_updater->status.fn = fn;
-            if (!doc.ipath.empty())
-                m_updater->status.fn += "|" + doc.ipath;
-            if (!m_updater->update()) {
-                return FsTreeWalker::FtwStop;
-            }
+		++(m_updater->status.docsdone);
+		if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
+		    m_updater->status.dbtotdocs = m_updater->status.docsdone;
+		m_updater->status.fn = fn;
+		if (!doc.ipath.empty())
+		    m_updater->status.fn += "|" + doc.ipath;
+		if (!m_updater->update()) {
+		    return FsTreeWalker::FtwStop;
+		}
+	    }
 	}
-    }
 
-    // If this doc existed and it's a container, recording for
-    // possible subdoc purge (this will be used only if we don't do a
-    // db-wide purge, e.g. if we're called from indexfiles()).
-    LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
-	     existingDoc, hadNonNullIpath));
-    if (existingDoc && hadNonNullIpath) {
-	m_purgeCandidates.record(parent_udi);
+	// If this doc existed and it's a container, recording for
+	// possible subdoc purge (this will be used only if we don't do a
+	// db-wide purge, e.g. if we're called from indexfiles()).
+	LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
+		 existingDoc, hadNonNullIpath));
+	if (existingDoc && hadNonNullIpath) {
+	    m_purgeCandidates.record(parent_udi);
+	}
     }
 
     // If we had no instance with a null ipath, we create an empty
     // document to stand for the file itself, to be used mainly for up
     // to date checks. Typically this happens for an mbox file.
-    if (hadNullIpath == false) {
-	LOGDEB1(("Creating empty doc for file\n"));
+    //
+    // If xattronly is set, ONLY the extattr metadata is valid and will be used
+    // by the following step.
+    if (xattronly || hadNullIpath == false) {
+	LOGDEB(("Creating empty doc for file or pure xattr update\n"));
 	Rcl::Doc fileDoc;
-	fileDoc.fmtime = ascdate;
-	fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
-	fileDoc.haschildren = true;
-	fileDoc.mimetype = interner.getMimetype();
-	fileDoc.url = cstr_fileu + fn;
-        if (m_havelocalfields) 
-            setlocalfields(localfields, fileDoc);
-	char cbuf[100]; 
-	sprintf(cbuf, "%lld", (long long)stp->st_size);
-	fileDoc.pcbytes = cbuf;
+	if (xattronly) {
+	    map<string, string> xfields;
+	    reapXAttrs(config, fn, xfields);
+	    docFieldsFromXattrs(config, xfields, fileDoc);
+	    fileDoc.onlyxattr = true;
+	} else {
+	    fileDoc.fmtime = ascdate;
+	    fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
+	    fileDoc.haschildren = true;
+	    fileDoc.mimetype = mimetype;
+	    fileDoc.url = cstr_fileu + fn;
+	    if (m_havelocalfields) 
+		setlocalfields(localfields, fileDoc);
+	    char cbuf[100]; 
+	    sprintf(cbuf, "%lld", (long long)stp->st_size);
+	    fileDoc.pcbytes = cbuf;
+	}
+
 	fileDoc.sig = sig;
 
 #ifdef IDX_THREADS
diff --git a/src/index/fsindexer.h b/src/index/fsindexer.h
index 4f3a176c..f7ce0cd9 100644
--- a/src/index/fsindexer.h
+++ b/src/index/fsindexer.h
@@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB {
     string       m_slocalfields;
     map<string, string>  m_localfields;
 
+    // Activate detection of xattr-only document updates. Experimental, so
+    // needs a config option
+    bool         m_detectxattronly;
+
 #ifdef IDX_THREADS
     friend void *FsIndexerDbUpdWorker(void*);
     friend void *FsIndexerInternfileWorker(void*);
diff --git a/src/index/rclmonrcv.cpp b/src/index/rclmonrcv.cpp
index d961b600..72305757 100644
--- a/src/index/rclmonrcv.cpp
+++ b/src/index/rclmonrcv.cpp
@@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
     code &= ~(IN_ISDIR|IN_ONESHOT);
     switch (code) {
     case IN_ACCESS: return "IN_ACCESS";
+    case IN_MODIFY: return "IN_MODIFY";
     case IN_ATTRIB: return "IN_ATTRIB";
-    case IN_CLOSE: return "IN_CLOSE";
-    case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
     case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
+    case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
+    case IN_CLOSE: return "IN_CLOSE";
+    case IN_OPEN: return "IN_OPEN";
+    case IN_MOVED_FROM: return "IN_MOVED_FROM";
+    case IN_MOVED_TO: return "IN_MOVED_TO";
+    case IN_MOVE: return "IN_MOVE";
     case IN_CREATE: return "IN_CREATE";
     case IN_DELETE: return "IN_DELETE";
     case IN_DELETE_SELF: return "IN_DELETE_SELF";
-    case IN_IGNORED: return "IN_IGNORED";
-    case IN_MODIFY: return "IN_MODIFY";
-    case IN_MOVE: return "IN_MOVE";
-    case IN_MOVED_FROM: return "IN_MOVED_FROM";
-    case IN_MOVED_TO: return "IN_MOVED_TO";
     case IN_MOVE_SELF: return "IN_MOVE_SELF";
-    case IN_OPEN: return "IN_OPEN";
-    case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
     case IN_UNMOUNT: return "IN_UNMOUNT";
+    case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
+    case IN_IGNORED: return "IN_IGNORED";
     default: {
 	static char msg[50];
 	sprintf(msg, "Unknown event 0x%x", code);
@@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool)
     uint32_t mask = IN_MODIFY | IN_CREATE
         | IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
 #ifdef RCL_USE_XATTR
-	// It seems that IN_ATTRIB is not needed to receive extattr
-	// modification events, which is a bit weird because only ctime is
-	// set. 
-	// | IN_ATTRIB
+	// IN_ATTRIB formerly was not needed to receive extattr
+	// modification events (which was a bit weird because only ctime is
+	// set), but it is needed now.
+	| IN_ATTRIB
 #endif
 #ifdef IN_DONT_FOLLOW
 	| IN_DONT_FOLLOW
@@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
 	eraseWatchSubTree(m_idtopath, ev.m_path);
     }
 
-    // IN_ATTRIB apparently not needed, see comment above
-    if (evp->mask & (IN_MODIFY)) {
+    // IN_ATTRIB used to be not needed, but now it is
+    if (evp->mask & (IN_MODIFY|IN_ATTRIB)) {
 	ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
     } else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
 	ev.m_etyp = RclMonEvent::RCLEVT_DELETE;
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 123c7dd4..737a9cc7 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi)
     return false;
 }
 
+// Clear term from document if its frequency is 0. This should
+// probably be done by Xapian when the freq goes to 0 when removing a
+// posting, but we have to do it ourselves
+bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
+{
+    LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
+
+    // Find the term
+    Xapian::TermIterator xit;
+    XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
+	   xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+	LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n", 
+		term.c_str(), m_rcldb->m_reason.c_str()));
+	return false;
+    }
+    if (xit == xdoc.termlist_end() || term.compare(*xit)) {
+	LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n", 
+		 term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str()));
+	return false;
+    }
+
+    // Clear the term if its frequency is 0
+    if (xit.get_wdf() == 0) {
+	LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
+	XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
+	if (!m_rcldb->m_reason.empty()) {
+	    LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n", 
+		     term.c_str(), m_rcldb->m_reason.c_str()));
+	}
+    }
+    return true;
+}
+
+// Holder for term + pos
+struct DocPosting {
+    DocPosting(string t, Xapian::termpos ps)
+	: term(t), pos(ps) {}
+    string term;
+    Xapian::termpos pos;
+};
+
+// Clear all terms for given field for given document.
+// The terms to be cleared are all those with the appropriate
+// prefix. We also remove the postings for the unprefixed terms (that
+// is, we undo what we did when indexing).
+bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
+			    Xapian::termcount wdfdec)
+{
+    LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
+	     pfx.c_str(), unsigned(xdoc.get_docid())));
+
+    vector<DocPosting> eraselist;
+
+    string wrapd = wrap_prefix(pfx);
+
+    m_rcldb->m_reason.clear();
+    for (int tries = 0; tries < 2; tries++) {
+	try {
+	    Xapian::TermIterator xit;
+	    xit = xdoc.termlist_begin();
+	    xit.skip_to(wrapd);
+	    while (xit != xdoc.termlist_end() && 
+		!(*xit).compare(0, wrapd.size(), wrapd)) {
+		LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str()));
+		Xapian::PositionIterator posit;
+		for (posit = xit.positionlist_begin();
+		     posit != xit.positionlist_end(); posit++) {
+		    eraselist.push_back(DocPosting(*xit, *posit));
+		    eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
+		}
+		xit++;
+	    }
+	} catch (const Xapian::DatabaseModifiedError &e) {
+	    m_rcldb->m_reason = e.get_msg();
+	    xrdb.reopen();
+	    continue;
+	} XCATCHERROR(m_rcldb->m_reason);
+	break;
+    }
+    if (!m_rcldb->m_reason.empty()) {
+	LOGERR(("Db::clearField: failed building erase list: %s\n", 
+		m_rcldb->m_reason.c_str()));
+	return false;
+    }
+
+    // Now remove the found positions, and the terms if the wdf is 0
+    for (vector<DocPosting>::const_iterator it = eraselist.begin();
+	 it != eraselist.end(); it++) {
+	LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n", 
+		 it->term.c_str(), int(it->pos)));
+	XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, 
+	       xwdb,m_rcldb->m_reason);
+	if (!m_rcldb->m_reason.empty()) {
+	    // Note that this normally fails for non-prefixed XXST and
+	    // ND, so don't make a fuss
+	    LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n",
+		     it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str()));
+	}
+	clearDocTermIfWdf0(xdoc, it->term);
+    }
+    return true;
+}
+
 // Check if doc given by udi is indexed by term
 bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
 {
@@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
 {
 #ifdef IDX_THREADS
     Chrono chron;
-    // In the case where there is a separate (single) db update
-    // thread, we only need to protect the update map update below
-    // (against interaction with threads calling needUpdate()). Else,
-    // all threads from above need to synchronize here
-    PTMutexLocker lock(m_mutex, m_havewriteq);
+    PTMutexLocker lock(m_mutex);
 #endif
 
     // Check file system full every mbyte of indexed text. It's a bit wasteful
@@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
     try {
 	Xapian::docid did = 
 	    xwdb.replace_document(uniterm, newdocument);
-#ifdef IDX_THREADS
-	// Need to protect against interaction with the up-to-date checks
-	// which also update the existence map
-	PTMutexLocker lock(m_mutex, !m_havewriteq);
-#endif
 	if (did < m_rcldb->updated.size()) {
 	    m_rcldb->updated[did] = true;
 	    LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
@@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
     return false;
 }
 
-
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
@@ -1151,7 +1245,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 	return false;
 
     Xapian::Document newdocument;
-
+    
     // The term processing pipeline:
     TermProcIdx tpidx;
     TermProc *nxt = &tpidx;
@@ -1165,276 +1259,287 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
     TextSplitDb splitter(newdocument, nxt);
     tpidx.setTSD(&splitter);
 
-    // If the ipath is like a path, index the last element. This is
-    // for compound documents like zip and chm for which the filter
-    // uses the file path as ipath. 
-    if (!doc.ipath.empty() && 
-	doc.ipath.find_first_not_of("0123456789") != string::npos) {
-	string utf8ipathlast;
-	// There is no way in hell we could have an idea of the
-	// charset here, so let's hope it's ascii or utf-8. We call
-	// transcode to strip the bad chars and pray
-	if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
-		      "UTF-8", "UTF-8")) {
-	    splitter.text_to_words(utf8ipathlast);
-	}
-    }
-
-    // Split and index the path from the url for path-based filtering
-    {
-	string path = url_gpath(doc.url);
-	vector<string> vpath;
-	stringToTokens(path, vpath, "/");
-	// If vpath is not /, the last elt is the file/dir name, not a
-	// part of the path.
-	if (vpath.size())
-	    vpath.resize(vpath.size()-1);
-	splitter.curpos = 0;
-	newdocument.add_posting(wrap_prefix(pathelt_prefix),
-				splitter.basepos + splitter.curpos++);
-	for (vector<string>::iterator it = vpath.begin(); 
-	     it != vpath.end(); it++){
-	    if (it->length() > 230) {
-		// Just truncate it. May still be useful because of wildcards
-		*it = it->substr(0, 230);
-	    }
-	    newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
-				    splitter.basepos + splitter.curpos++);
-	}
-    }
-
-    // Index textual metadata.  These are all indexed as text with
-    // positions, as we may want to do phrase searches with them (this
-    // makes no sense for keywords by the way).
-    //
-    // The order has no importance, and we set a position gap of 100
-    // between fields to avoid false proximity matches.
-    map<string, string>::iterator meta_it;
-    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
-	if (!meta_it->second.empty()) {
-	    const FieldTraits *ftp;
-	    // We don't test for an empty prefix here. Some fields are part
-	    // of the internal conf with an empty prefix (ie: abstract).
-	    if (!fieldToTraits(meta_it->first, &ftp)) {
-		LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
-			 meta_it->first.c_str()));
-		continue;
-	    }
-	    LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
-		     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
-		     meta_it->second.c_str()));
-	    splitter.setprefix(ftp->pfx);
-	    splitter.setwdfinc(ftp->wdfinc);
-	    if (!splitter.text_to_words(meta_it->second))
-                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
-                        meta_it->first.c_str()));
-	}
-    }
-    splitter.setprefix(string());
-    splitter.setwdfinc(1);
-
-    if (splitter.curpos < baseTextPosition)
-	splitter.basepos = baseTextPosition;
-
-    // Split and index body text
-    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
-
-#ifdef TEXTSPLIT_STATS
-    splitter.resetStats();
-#endif
-    if (!splitter.text_to_words(doc.text))
-        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
-
-#ifdef TEXTSPLIT_STATS
-    // Reject bad data. unrecognized base64 text is characterized by
-    // high avg word length and high variation (because there are
-    // word-splitters like +/ inside the data).
-    TextSplit::Stats::Values v = splitter.getStats();
-    // v.avglen > 15 && v.sigma > 12 
-    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
-	LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
-	 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
-		 v.count, v.avglen, v.sigma, doc.url.c_str(), 
-		 doc.ipath.c_str(), doc.text.c_str()));
-	return true;
-    }
-#endif
-
-    ////// Special terms for other metadata. No positions for these.
-    // Mime type
-    newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
-
-    // Simple file name indexed unsplit for specific "file name"
-    // searches. This is not the same as a filename: clause inside the
-    // query language.
-    // We also add a term for the filename extension if any.
-    string utf8fn;
-    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
-	string fn;
-	if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
-	    // We should truncate after extracting the extension, but this is
-	    // a pathological case anyway
-	    if (fn.size() > 230)
-		utf8truncate(fn, 230);
-	    string::size_type pos = fn.rfind('.');
-	    if (pos != string::npos && pos != fn.length() - 1) {
-		newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
-					     fn.substr(pos + 1));
-	    }
-	    newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
-	}
-    }
-
     // Udi unique term: this is used for file existence/uptodate
     // checks, and unique id for the replace_document() call.
     string uniterm = make_uniterm(udi);
-    newdocument.add_boolean_term(uniterm);
-    // Parent term. This is used to find all descendents, mostly to delete them 
-    // when the parent goes away
-    if (!parent_udi.empty()) {
-	newdocument.add_boolean_term(make_parentterm(parent_udi));
-    }
-    // Dates etc.
-    time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
-			 doc.dmtime.c_str());
-    struct tm *tm = localtime(&mtime);
-    char buf[9];
-    snprintf(buf, 9, "%04d%02d%02d",
-	    tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
-    // Date (YYYYMMDD)
-    newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
-    // Month (YYYYMM)
-    buf[6] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
-    // Year (YYYY)
-    buf[4] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
 
-
-    //////////////////////////////////////////////////////////////////
-    // Document data record. omindex has the following nl separated fields:
-    // - url
-    // - sample
-    // - caption (title limited to 100 chars)
-    // - mime type 
-    //
-    // The title, author, abstract and keywords fields are special,
-    // they always get stored in the document data
-    // record. Configurable other fields can be, too.
-    //
-    // We truncate stored fields abstract, title and keywords to
-    // reasonable lengths and suppress newlines (so that the data
-    // record can keep a simple syntax)
-
-    string record;
-    RECORD_APPEND(record, Doc::keyurl, doc.url);
-    RECORD_APPEND(record, Doc::keytp, doc.mimetype);
-    // We left-zero-pad the times so that they are lexico-sortable
-    leftzeropad(doc.fmtime, 11);
-    RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
-    if (!doc.dmtime.empty()) {
-	leftzeropad(doc.dmtime, 11);
-	RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
-    }
-    RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
-
-    if (doc.fbytes.empty())
-	doc.fbytes = doc.pcbytes;
-
-    if (!doc.fbytes.empty()) {
-	RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
-	leftzeropad(doc.fbytes, 12);
-	newdocument.add_value(VALUE_SIZE, doc.fbytes);
-    }
-    if (doc.haschildren) {
-	newdocument.add_boolean_term(has_children_term);
-    }	
-    if (!doc.pcbytes.empty())
-	RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
-    char sizebuf[30]; 
-    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
-    RECORD_APPEND(record, Doc::keyds, sizebuf);
-
-    // Note that we add the signature both as a value and in the data record
-    if (!doc.sig.empty()) {
-	RECORD_APPEND(record, Doc::keysig, doc.sig);
-	newdocument.add_value(VALUE_SIG, doc.sig);
-    }
-
-    if (!doc.ipath.empty())
-	RECORD_APPEND(record, Doc::keyipt, doc.ipath);
-
-    doc.meta[Doc::keytt] = 
-	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
-    if (!doc.meta[Doc::keytt].empty())
-	RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
-
-    trimstring(doc.meta[Doc::keykw], " \t\r\n");
-    doc.meta[Doc::keykw] = 
-	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
-    // No need to explicitly append the keywords, this will be done by 
-    // the "stored" loop
-
-    // If abstract is empty, we make up one with the beginning of the
-    // document. This is then not indexed, but part of the doc data so
-    // that we can return it to a query without having to decode the
-    // original file.
-    bool syntabs = false;
-    // Note that the map accesses by operator[] create empty entries if they
-    // don't exist yet.
-    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
-    if (doc.meta[Doc::keyabs].empty()) {
-	syntabs = true;
-	if (!doc.text.empty())
-	    doc.meta[Doc::keyabs] = cstr_syntAbs + 
-		neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+    if (doc.onlyxattr) {
+	// Only updating an existing doc with new extended attributes
+	// data.  Need to read the old doc and its data record
+	// first. This is so different from the normal processing that
+	// it uses a fully separate code path (with some duplication
+	// unfortunately)
+	if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
+	    return false;
     } else {
-	doc.meta[Doc::keyabs] = 
-	    neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
-		      cstr_nc);
-    }
 
-    const set<string>& stored = m_config->getStoredFields();
-    for (set<string>::const_iterator it = stored.begin();
-	 it != stored.end(); it++) {
-	string nm = m_config->fieldCanon(*it);
-	if (!doc.meta[nm].empty()) {
-	    string value = 
-		neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
-	    RECORD_APPEND(record, nm, value);
+	// If the ipath is like a path, index the last element. This is
+	// for compound documents like zip and chm for which the filter
+	// uses the file path as ipath. 
+	if (!doc.ipath.empty() && 
+	    doc.ipath.find_first_not_of("0123456789") != string::npos) {
+	    string utf8ipathlast;
+	    // There is no way in hell we could have an idea of the
+	    // charset here, so let's hope it's ascii or utf-8. We call
+	    // transcode to strip the bad chars and pray
+	    if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
+			  "UTF-8", "UTF-8")) {
+		splitter.text_to_words(utf8ipathlast);
+	    }
 	}
-    }
 
-    // If empty pages (multiple break at same pos) were recorded, save
-    // them (this is because we have no way to record them in the
-    // Xapian list
-    if (!tpidx.m_pageincrvec.empty()) {
-	ostringstream multibreaks;
-	for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
-	    if (i != 0)
-		multibreaks << ",";
-	    multibreaks << tpidx.m_pageincrvec[i].first << "," << 
-		tpidx.m_pageincrvec[i].second;
+	// Split and index the path from the url for path-based filtering
+	{
+	    string path = url_gpath(doc.url);
+	    vector<string> vpath;
+	    stringToTokens(path, vpath, "/");
+	    // If vpath is not /, the last elt is the file/dir name, not a
+	    // part of the path.
+	    if (vpath.size())
+		vpath.resize(vpath.size()-1);
+	    splitter.curpos = 0;
+	    newdocument.add_posting(wrap_prefix(pathelt_prefix),
+				    splitter.basepos + splitter.curpos++);
+	    for (vector<string>::iterator it = vpath.begin(); 
+		 it != vpath.end(); it++){
+		if (it->length() > 230) {
+		    // Just truncate it. May still be useful because of wildcards
+		    *it = it->substr(0, 230);
+		}
+		newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
+					splitter.basepos + splitter.curpos++);
+	    }
+	}
+
+	// Index textual metadata.  These are all indexed as text with
+	// positions, as we may want to do phrase searches with them (this
+	// makes no sense for keywords by the way).
+	//
+	// The order has no importance, and we set a position gap of 100
+	// between fields to avoid false proximity matches.
+	map<string, string>::iterator meta_it;
+	for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+	    if (!meta_it->second.empty()) {
+		const FieldTraits *ftp;
+		// We don't test for an empty prefix here. Some fields are part
+		// of the internal conf with an empty prefix (ie: abstract).
+		if (!fieldToTraits(meta_it->first, &ftp)) {
+		    LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
+			     meta_it->first.c_str()));
+		    continue;
+		}
+		LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
+			 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+			 meta_it->second.c_str()));
+		splitter.setprefix(ftp->pfx);
+		splitter.setwdfinc(ftp->wdfinc);
+		if (!splitter.text_to_words(meta_it->second))
+		    LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
+			    meta_it->first.c_str()));
+	    }
+	}
+	splitter.setprefix(string());
+	splitter.setwdfinc(1);
+
+	if (splitter.curpos < baseTextPosition)
+	    splitter.basepos = baseTextPosition;
+
+	// Split and index body text
+	LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
+
+#ifdef TEXTSPLIT_STATS
+	splitter.resetStats();
+#endif
+	if (!splitter.text_to_words(doc.text))
+	    LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
+
+#ifdef TEXTSPLIT_STATS
+	// Reject bad data. unrecognized base64 text is characterized by
+	// high avg word length and high variation (because there are
+	// word-splitters like +/ inside the data).
+	TextSplit::Stats::Values v = splitter.getStats();
+	// v.avglen > 15 && v.sigma > 12 
+	if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
+	    LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
+		     "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
+		     v.count, v.avglen, v.sigma, doc.url.c_str(), 
+		     doc.ipath.c_str(), doc.text.c_str()));
+	    return true;
+	}
+#endif
+
+	////// Special terms for other metadata. No positions for these.
+	// Mime type
+	newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
+
+	// Simple file name indexed unsplit for specific "file name"
+	// searches. This is not the same as a filename: clause inside the
+	// query language.
+	// We also add a term for the filename extension if any.
+	string utf8fn;
+	if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
+	    string fn;
+	    if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
+		// We should truncate after extracting the extension, but this is
+		// a pathological case anyway
+		if (fn.size() > 230)
+		    utf8truncate(fn, 230);
+		string::size_type pos = fn.rfind('.');
+		if (pos != string::npos && pos != fn.length() - 1) {
+		    newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
+						 fn.substr(pos + 1));
+		}
+		newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
+	    }
+	}
+
+	newdocument.add_boolean_term(uniterm);
+	// Parent term. This is used to find all descendants, mostly
+	// to delete them when the parent goes away
+	if (!parent_udi.empty()) {
+	    newdocument.add_boolean_term(make_parentterm(parent_udi));
+	}
+	// Dates etc.
+	time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
+			     doc.dmtime.c_str());
+	struct tm *tm = localtime(&mtime);
+	char buf[9];
+	snprintf(buf, 9, "%04d%02d%02d",
+		 tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
+	// Date (YYYYMMDD)
+	newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
+	// Month (YYYYMM)
+	buf[6] = '\0';
+	newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
+	// Year (YYYY)
+	buf[4] = '\0';
+	newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
+
+
+	//////////////////////////////////////////////////////////////////
+	// Document data record. omindex has the following nl separated fields:
+	// - url
+	// - sample
+	// - caption (title limited to 100 chars)
+	// - mime type 
+	//
+	// The title, author, abstract and keywords fields are special,
+	// they always get stored in the document data
+	// record. Configurable other fields can be, too.
+	//
+	// We truncate stored fields abstract, title and keywords to
+	// reasonable lengths and suppress newlines (so that the data
+	// record can keep a simple syntax)
+
+	string record;
+	RECORD_APPEND(record, Doc::keyurl, doc.url);
+	RECORD_APPEND(record, Doc::keytp, doc.mimetype);
+	// We left-zero-pad the times so that they are lexico-sortable
+	leftzeropad(doc.fmtime, 11);
+	RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
+	if (!doc.dmtime.empty()) {
+	    leftzeropad(doc.dmtime, 11);
+	    RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
+	}
+	RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
+
+	if (doc.fbytes.empty())
+	    doc.fbytes = doc.pcbytes;
+
+	if (!doc.fbytes.empty()) {
+	    RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
+	    leftzeropad(doc.fbytes, 12);
+	    newdocument.add_value(VALUE_SIZE, doc.fbytes);
+	}
+	if (doc.haschildren) {
+	    newdocument.add_boolean_term(has_children_term);
+	}	
+	if (!doc.pcbytes.empty())
+	    RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
+	char sizebuf[30]; 
+	sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
+	RECORD_APPEND(record, Doc::keyds, sizebuf);
+
+	// Note that we add the signature both as a value and in the data record
+	if (!doc.sig.empty()) {
+	    RECORD_APPEND(record, Doc::keysig, doc.sig);
+	    newdocument.add_value(VALUE_SIG, doc.sig);
+	}
+
+	if (!doc.ipath.empty())
+	    RECORD_APPEND(record, Doc::keyipt, doc.ipath);
+
+	doc.meta[Doc::keytt] = 
+	    neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
+	if (!doc.meta[Doc::keytt].empty())
+	    RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
+
+	trimstring(doc.meta[Doc::keykw], " \t\r\n");
+	doc.meta[Doc::keykw] = 
+	    neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
+	// No need to explicitly append the keywords, this will be done by 
+	// the "stored" loop
+
+	// If abstract is empty, we make up one with the beginning of the
+	// document. This is then not indexed, but part of the doc data so
+	// that we can return it to a query without having to decode the
+	// original file.
+	bool syntabs = false;
+	// Note that the map accesses by operator[] create empty entries if they
+	// don't exist yet.
+	trimstring(doc.meta[Doc::keyabs], " \t\r\n");
+	if (doc.meta[Doc::keyabs].empty()) {
+	    syntabs = true;
+	    if (!doc.text.empty())
+		doc.meta[Doc::keyabs] = cstr_syntAbs + 
+		    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+	} else {
+	    doc.meta[Doc::keyabs] = 
+		neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
+			  cstr_nc);
+	}
+
+	const set<string>& stored = m_config->getStoredFields();
+	for (set<string>::const_iterator it = stored.begin();
+	     it != stored.end(); it++) {
+	    string nm = m_config->fieldCanon(*it);
+	    if (!doc.meta[nm].empty()) {
+		string value = 
+		    neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+		RECORD_APPEND(record, nm, value);
+	    }
+	}
+
+	// If empty pages (multiple break at same pos) were recorded, save
+	// them (this is because we have no way to record them in the
+	// Xapian list)
+	if (!tpidx.m_pageincrvec.empty()) {
+	    ostringstream multibreaks;
+	    for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
+		if (i != 0)
+		    multibreaks << ",";
+		multibreaks << tpidx.m_pageincrvec[i].first << "," << 
+		    tpidx.m_pageincrvec[i].second;
+	    }
+	    RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
 	}
-	RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
-    }
     
-    // If the file's md5 was computed, add value and term. 
-    // The value is optionally used for query result duplicate elimination, 
-    // and the term to find the duplicates.
-    // We don't do this for empty docs.
-    const string *md5;
-    if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
-	md5->compare(cstr_md5empty)) {
-	string digest;
-	MD5HexScan(*md5, digest);
-	newdocument.add_value(VALUE_MD5, digest);
-	newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
+	// If the file's md5 was computed, add value and term. 
+	// The value is optionally used for query result duplicate elimination, 
+	// and the term to find the duplicates.
+	// We don't do this for empty docs.
+	const string *md5;
+	if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
+	    md5->compare(cstr_md5empty)) {
+	    string digest;
+	    MD5HexScan(*md5, digest);
+	    newdocument.add_value(VALUE_MD5, digest);
+	    newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
+	}
+
+	LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
+	newdocument.set_data(record);
     }
-
-    LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
-    newdocument.set_data(record);
-
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
 	DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, 
@@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 				   doc.text.length());
 }
 
+// Update xdoc for an xattr-only change: reload the existing document for udi,
+// reindex the prefixed fields from doc.meta, update sig and stored fields.
+bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
+				    Doc &doc, Xapian::Document& xdoc)
+{
+    LOGDEB0(("Db::docToXdocXattrOnly\n"));
+    PTMutexLocker lock(m_mutex);
+    // Read existing document and its data record
+    if (getDoc(udi, 0, xdoc) == 0) {
+	LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
+	return false;
+    }
+    string data;
+    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
+        return false;
+    }
+
+    // Clear the term lists for the incoming fields and index the new values
+    map<string, string>::iterator meta_it;
+    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+	const FieldTraits *ftp;
+	if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
+	    LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
+		     meta_it->first.c_str()));
+	    continue;
+	}
+	clearField(xdoc, ftp->pfx, ftp->wdfinc);
+	LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n", 
+		 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+		 meta_it->second.c_str()));
+	splitter->setprefix(ftp->pfx);
+	splitter->setwdfinc(ftp->wdfinc);
+	if (!splitter->text_to_words(meta_it->second))
+	    LOGDEB(("Db::xattrOnly: split failed for %s\n", 
+		    meta_it->first.c_str()));
+    }
+    xdoc.add_value(VALUE_SIG, doc.sig);
+
+    // Parse current data record into a dict for ease of processing
+    ConfSimple datadic(data);
+    if (!datadic.ok()) {
+	LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
+	return false;
+    }
+
+    // Update the "stored" fields which are set in the doc metadata
+    const set<string>& stored = m_rcldb->m_config->getStoredFields();
+    for (set<string>::const_iterator it = stored.begin();
+	 it != stored.end(); it++) {
+	string nm = m_rcldb->m_config->fieldCanon(*it);
+	if (doc.getmeta(nm, 0)) {
+	    string value = 
+		neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+	    datadic.set(nm, value, "");
+	}
+    }
+    // Update sig in the dict: appending it later would duplicate the old entry
+    datadic.set(Doc::keysig, doc.sig, "");
+
+    // Recreate the record. We want to do this with the local RECORD_APPEND
+    // method for consistency in format, instead of using ConfSimple print
+    vector<string> names = datadic.getNames("");
+    data.clear();
+    for (vector<string>::const_iterator it = names.begin(); 
+	 it != names.end(); it++) {
+	string value;
+	datadic.get(*it, value, "");
+	RECORD_APPEND(data, *it, value);
+    }
+    xdoc.set_data(data);
+    return true;
+}
+
 #ifdef IDX_THREADS
 void Db::waitUpdIdle()
 {
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 1962c3b4..477f7cbe 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -237,6 +237,10 @@ class Db {
      */
     bool needUpdate(const string &udi, const string& sig, bool *existed=0);
 
+    /** Tell if a systematic reindex is in progress (o_inPlaceReset set or
+	db mode is DbTrunc). This complements the needUpdate() return */
+    bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
+
     /** Add or update document identified by unique identifier.
      * @param config Config object to use. Can be the same as the member config
      *   or a clone, to avoid sharing when called in multithread context.
diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h
index 65dceee8..fda1b018 100644
--- a/src/rcldb/rcldb_p.h
+++ b/src/rcldb/rcldb_p.h
@@ -66,6 +66,8 @@ public:
 };
 #endif // IDX_THREADS
 
+class TextSplitDb;
+
 // A class for data and methods that would have to expose
 // Xapian-specific stuff if they were in Rcl::Db. There could actually be
 // 2 different ones for indexing or query as there is not much in
@@ -141,6 +143,16 @@ class Db::Native {
     /** Check if doc is indexed by term */
     bool hasTerm(const string& udi, int idxi, const string& term);
 
+    /** Update existing Xapian document for pure extended attrs change */
+    bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
+			    Doc &doc, Xapian::Document& xdoc);
+    /** Remove all terms currently indexed for field defined by idx prefix */
+    bool clearField(Xapian::Document& xdoc, const string& pfx, 
+		    Xapian::termcount wdfdec);
+
+    /** Check if term wdf is 0 and remove term if so */
+    bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
+
     /** Compute list of subdocuments for a given udi. We look for documents 
      * indexed by a parent term matching the udi, the posting list for the 
      * parentterm(udi)  (As suggested by James Aylett)
diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h
index e37f0045..2ba5b4ca 100644
--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@@ -131,6 +131,10 @@ class Doc {
     // ipath descendants.
     bool haschildren;
 
+    // During indexing: only fields from extended attributes were set, no
+    // doc content. Allows for faster reindexing of existing doc
+    bool onlyxattr;
+
     ///////////////////////////////////////////////////////////////////
 
     void erase() {
@@ -154,10 +158,11 @@ class Doc {
 	idxi = 0;
 	haspages = false;
 	haschildren = false;
+	onlyxattr = false;
     }
     Doc()
 	: idxi(0), syntabs(false), pc(0), xdocid(0),
-	  haspages(false), haschildren(false)
+	  haspages(false), haschildren(false), onlyxattr(false)
     {
     }
     /** Get value for named field. If value pointer is 0, just test existence */
diff --git a/src/sampleconf/fields b/src/sampleconf/fields
index 0ec4d846..b38030e5 100644
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@@ -13,7 +13,7 @@
 #####################################################
 # This section defines what prefix the terms inside named fields will be
 # indexed with (in addition to prefix-less indexing for general search)
-# ALL prefixes MUST be all UPPERCASE. 
+# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS)
 # 
 # The field names should be the canonic ones, not the aliases defined in
 # the following section. Don't change those which are predefined here, 
diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf
index d2885db6..c6626ac7 100644
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@@ -5,6 +5,7 @@ daemloglevel = 6
 daemlogfilename = /tmp/rclmontrace
 
 indexStripChars = 1
+detectxattronly = 1
 
 topdirs = /home/dockes/projets/fulltext/testrecoll/
 
diff --git a/tests/empty/empty.txt b/tests/empty/empty.txt
index dbc3778c..47eb4c03 100644
--- a/tests/empty/empty.txt
+++ b/tests/empty/empty.txt
@@ -1,2 +1,2 @@
 1 results
-application/x-fsdirectory	[file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm]	[emptyUniqueTerm]	4096	bytes	
+inode/directory	[file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm]	[emptyUniqueTerm]	4096	bytes	
diff --git a/tests/runtests.sh b/tests/runtests.sh
index a0076b16..0e0442d2 100644
--- a/tests/runtests.sh
+++ b/tests/runtests.sh
@@ -11,6 +11,37 @@ if test ! x$reroot = x ; then
     rerootResults
 fi
 
+# Return 0 and set iscmdresult to its path if $1 is an executable command
+iscmd() {
+    cmd=$1
+    case $cmd in
+    */*)
+	test -x "$cmd" -a ! -d "$cmd" || return 1; iscmdresult=$cmd; return 0 ;;
+    *)
+      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
+      for d in "$@";do test -x "${d:-.}/$cmd" -a ! -d "${d:-.}/$cmd" && \
+          iscmdresult=${d:-.}/$cmd && return 0;done
+      return 1 ;;
+    esac
+}
+
+# Report where each command named in the arguments was found (using iscmd).
+# Return 1 if at least one of them is missing, else 0
+checkcmds() {
+    result=0
+    for cmd in $*;do
+      if ! iscmd $cmd ; then
+        echo $cmd not found
+        result=1
+      else
+        echo $cmd is $iscmdresult
+      fi
+    done
+    return $result
+}
+
+checkcmds recollq recollindex pxattr xadump || exit 1
+
 makeindex() {
   echo "Zeroing Index" 
   rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws
diff --git a/tests/xattr/fields b/tests/xattr/fields
new file mode 100644
index 00000000..91995081
--- /dev/null
+++ b/tests/xattr/fields
@@ -0,0 +1,4 @@
+[prefixes]
+myattr = XYXATA
+[stored]
+myattr =
diff --git a/tests/xattr/xattr.sh b/tests/xattr/xattr.sh
new file mode 100755
index 00000000..6e46e613
--- /dev/null
+++ b/tests/xattr/xattr.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+
+# Test extended attributes indexing. This should work both with
+# "detectxattronly" set or unset in the config, but should be run with
+# the variable set, because we test its function by exploiting a bug
+# (see comments further)
+#
+# We use the RECOLL_CONFTOP variable to add our own fields configuration
+
+thisdir=`dirname $0`
+topdir=$thisdir/..
+. $topdir/shared.sh
+
+initvariables $0
+
+RECOLL_CONFTOP=$thisdir
+export RECOLL_CONFTOP
+
+# Print the command line given as arguments to stdout, then execute it
+xrun() {
+    echo $*
+    $*
+}
+
+tstfile=${tstdata}/xattrs/tstxattrs.txt
+rm -f $tstfile
+
+(
+    # Create the file with an extended attribute, index, and query it
+    # by content and field
+    echo xattruniqueinfile > $tstfile
+    xrun pxattr -n myattr -v xattrunique1 $tstfile
+    xrun recollindex -Zi $tstfile
+    echo "1 result expected"
+    xrun recollq xattruniqueinfile
+    echo "1 result expected"
+    xrun recollq myattr:xattrunique1 
+
+    sleep 1
+
+    # Change the value for the field, check that the old value is gone
+    # and the new works
+    xrun pxattr -n myattr -v xattrunique2 $tstfile
+    xrun recollindex -i $tstfile
+    echo "1 result expected"
+    xrun recollq xattruniqueinfile
+    echo "0 result expected:"
+    xrun recollq myattr:xattrunique1 
+    echo "1 result expected:"
+    xrun recollq myattr:xattrunique2
+
+    # Change the contents then the xattr. With xattronly set, recoll
+    # should miss the contents change and index only the xattr. That's
+    # a bug but we use it to check that pure xattr update indexing
+    # works
+    echo xattruniqueinfile1 > $tstfile
+    sleep 2
+    xrun pxattr -n myattr -v xattrunique3 $tstfile
+    xrun recollindex -i $tstfile
+    echo "1 result expected"
+    xrun recollq xattruniqueinfile
+    echo "0 result expected"
+    xrun recollq xattruniqueinfile1
+    echo "0 result expected:"
+    xrun recollq myattr:xattrunique1 
+    echo "0 result expected:"
+    xrun recollq myattr:xattrunique2
+    echo "1 result expected:"
+    xrun recollq myattr:xattrunique3
+
+    # Reset the index and check that the contents were seen all right
+    xrun recollindex -Zi $tstfile
+    echo "0 result expected"
+    xrun recollq xattruniqueinfile
+    echo "1 result expected"
+    xrun recollq xattruniqueinfile1
+    echo "0 result expected:"
+    xrun recollq myattr:xattrunique2
+    echo "1 result expected:"
+    xrun recollq myattr:xattrunique3
+
+) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
+
+diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
+checkresult
diff --git a/tests/xattr/xattr.txt b/tests/xattr/xattr.txt
new file mode 100644
index 00000000..2e01ef28
--- /dev/null
+++ b/tests/xattr/xattr.txt
@@ -0,0 +1,57 @@
+pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+1 result expected
+recollq xattruniqueinfile
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
+1 result expected
+recollq myattr:xattrunique1
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
+pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+1 result expected
+recollq xattruniqueinfile
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
+0 result expected:
+recollq myattr:xattrunique1
+0 results
+1 result expected:
+recollq myattr:xattrunique2
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
+pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+1 result expected
+recollq xattruniqueinfile
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
+0 result expected
+recollq xattruniqueinfile1
+0 results
+0 result expected:
+recollq myattr:xattrunique1
+0 results
+0 result expected:
+recollq myattr:xattrunique2
+0 results
+1 result expected:
+recollq myattr:xattrunique3
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
+recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
+0 result expected
+recollq xattruniqueinfile
+0 results
+1 result expected
+recollq xattruniqueinfile1
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	19	bytes	
+0 result expected:
+recollq myattr:xattrunique2
+0 results
+1 result expected:
+recollq myattr:xattrunique3
+1 results
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	19	bytes