Handle partial indexing of document restricted to metadata from extended attributes

2013-10-04 10:57:11 +02:00 · 2013-10-04 10:57:11 +02:00 · 56a56500c1
commit 56a56500c1
parent b2eeec067b
15 changed files with 811 additions and 402 deletions
--- a/src/VERSION
+++ b/src/VERSION
@ -1 +1 @@
-1.19.5
+1.20.0
--- a/src/index/fsindexer.cpp
+++ b/src/index/fsindexer.cpp
@ -45,7 +45,7 @@
 #include "cancelcheck.h"
 #include "rclinit.h"
 #include "execmd.h"
-
+#include "extrameta.h"
 // When using extended attributes, we have to use the ctime, because
 // this is all that gets set when the attributes are modified. 
@ -104,7 +104,7 @@ public:
 FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) 
    : m_config(cnf), m_db(db), m_updater(updfunc), 
-      m_missing(new FSIFIMissingStore)
+      m_missing(new FSIFIMissingStore), m_detectxattronly(false)
 #ifdef IDX_THREADS
    , m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first), 
      m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
 {
    LOGDEB1(("FsIndexer::FsIndexer\n"));
    m_havelocalfields = m_config->hasNameAnywhere("localfields");
    m_config->getConfParam("detectxattronly", &m_detectxattronly);
 #ifdef IDX_THREADS
    m_stableconfig = new RclConfig(*m_config);
@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config,
    bool existingDoc;
    bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
    // If ctime (which we use for the sig) differs from mtime, then at most
    // the extended attributes were changed, no need to index content.
    // This unfortunately leaves open the case where the data was
    // modified, then the extended attributes, in which case we will
    // miss the data update. We would have to store both the mtime and
    // the ctime to avoid this
    bool xattronly = m_detectxattronly && !m_db->inFullReset() && 
 	existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
    if (!needupdate) {
 	LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
 	if (m_updater) {
@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config,
    LOGDEB0(("processone: processing: [%s] %s\n", 
             displayableBytes(stp->st_size).c_str(), fn.c_str()));
    FileInterner interner(fn, stp, config, FileInterner::FIF_none);
    if (!interner.ok()) {
        // no indexing whatsoever in this case. This typically means that
        // indexallfilenames is not set
        return FsTreeWalker::FtwOk;
    }
    interner.setMissingStore(m_missing);
    string utf8fn = compute_utf8fn(config, fn);
    // parent_udi is initially the same as udi, it will be used if there 
@ -662,128 +664,152 @@ FsIndexer::processonefile(RclConfig *config,
    char ascdate[30];
    sprintf(ascdate, "%ld", long(stp->st_mtime));
    FileInterner::Status fis = FileInterner::FIAgain;
    bool hadNullIpath = false;
-    bool hadNonNullIpath = false;
+    string mimetype;
    while (fis == FileInterner::FIAgain) {
 	doc.erase();
        try {
            fis = interner.internfile(doc);
        } catch (CancelExcept) {
            LOGERR(("fsIndexer::processone: interrupted\n"));
            return FsTreeWalker::FtwStop;
        }
-        // We index at least the file name even if there was an error.
+    if (!xattronly) {
-        // We'll change the signature to ensure that the indexing will
+	FileInterner interner(fn, stp, config, FileInterner::FIF_none);
-        // be retried every time.
+	if (!interner.ok()) {
 	    // no indexing whatsoever in this case. This typically means that
 	    // indexallfilenames is not set
 	    return FsTreeWalker::FtwOk;
 	}
 	mimetype = interner.getMimetype();
-	// Internal access path for multi-document files. If empty, this is
+	interner.setMissingStore(m_missing);
-	// for the main file.
+	FileInterner::Status fis = FileInterner::FIAgain;
-	if (doc.ipath.empty()) {
+	bool hadNonNullIpath = false;
-	    hadNullIpath = true;
+	while (fis == FileInterner::FIAgain) {
-	    if (hadNonNullIpath) {
+	    doc.erase();
-		// Note that only the filters can reliably compute
+	    try {
-		// this. What we do is dependant of the doc order (if
+		fis = interner.internfile(doc);
-		// we see the top doc first, we won't set the flag)
+	    } catch (CancelExcept) {
-		doc.haschildren = true;
+		LOGERR(("fsIndexer::processone: interrupted\n"));
 		return FsTreeWalker::FtwStop;
 	    }
 	} else {
 	    hadNonNullIpath = true;
 	    make_udi(fn, doc.ipath, udi);
 	}
-	// Set file name, mod time and url if not done by filter
+	    // We index at least the file name even if there was an error.
-	if (doc.fmtime.empty())
+	    // We'll change the signature to ensure that the indexing will
-	    doc.fmtime = ascdate;
+	    // be retried every time.
        if (doc.url.empty())
            doc.url = cstr_fileu + fn;
 	const string *fnp = 0;
 	if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
 	    doc.meta[Rcl::Doc::keyfn] = utf8fn;
-	char cbuf[100]; 
+	    // Internal access path for multi-document files. If empty, this is
-	sprintf(cbuf, "%lld", (long long)stp->st_size);
+	    // for the main file.
-	doc.pcbytes = cbuf;
+	    if (doc.ipath.empty()) {
-	// Document signature for up to date checks. All subdocs inherit the
+		hadNullIpath = true;
-	// file's.
+		if (hadNonNullIpath) {
-	doc.sig = sig;
+		    // Note that only the filters can reliably compute
 		    // this. What we do is dependant of the doc order (if
 		    // we see the top doc first, we won't set the flag)
 		    doc.haschildren = true;
 		}
 	    } else {
 		hadNonNullIpath = true;
 		make_udi(fn, doc.ipath, udi);
 	    }
-	// If there was an error, ensure indexing will be
+	    // Set file name, mod time and url if not done by filter
-	// retried. This is for the once missing, later installed
+	    if (doc.fmtime.empty())
-	// filter case. It can make indexing much slower (if there are
+		doc.fmtime = ascdate;
-	// myriads of such files, the ext script is executed for them
+	    if (doc.url.empty())
-	// and fails every time)
+		doc.url = cstr_fileu + fn;
-	if (fis == FileInterner::FIError) {
+	    const string *fnp = 0;
-	    doc.sig += cstr_plus;
+	    if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
-	}
+		doc.meta[Rcl::Doc::keyfn] = utf8fn;
-        // Possibly add fields from local config
+	    char cbuf[100]; 
-        if (m_havelocalfields) 
+	    sprintf(cbuf, "%lld", (long long)stp->st_size);
-            setlocalfields(localfields, doc);
+	    doc.pcbytes = cbuf;
 	    // Document signature for up to date checks. All subdocs inherit the
 	    // file's.
 	    doc.sig = sig;
-	// Add document to database. If there is an ipath, add it as a children
+	    // If there was an error, ensure indexing will be
-	// of the file document.
+	    // retried. This is for the once missing, later installed
 	    // filter case. It can make indexing much slower (if there are
 	    // myriads of such files, the ext script is executed for them
 	    // and fails every time)
 	    if (fis == FileInterner::FIError) {
 		doc.sig += cstr_plus;
 	    }
 	    // Possibly add fields from local config
 	    if (m_havelocalfields) 
 		setlocalfields(localfields, doc);
 	    // Add document to database. If there is an ipath, add it
 	    // as a child of the file document.
 #ifdef IDX_THREADS
-	if (m_haveSplitQ) {
+	    if (m_haveSplitQ) {
-	    DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ? 
+		DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ? 
-					  cstr_null : parent_udi, doc);
+					      cstr_null : parent_udi, doc);
-	    if (!m_dwqueue.put(tp)) {
+		if (!m_dwqueue.put(tp)) {
-		LOGERR(("processonefile: wqueue.put failed\n"));
+		    LOGERR(("processonefile: wqueue.put failed\n"));
-		return FsTreeWalker::FtwError;
+		    return FsTreeWalker::FtwError;
-	    } 
+		} 
-	} else {
+	    } else {
 #endif
-	    if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? 
+		if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? 
-				   cstr_null : parent_udi, doc)) {
+				       cstr_null : parent_udi, doc)) {
-		return FsTreeWalker::FtwError;
+		    return FsTreeWalker::FtwError;
-	    }
+		}
 #ifdef IDX_THREADS
-	}
+	    }
 #endif
-	// Tell what we are doing and check for interrupt request
+	    // Tell what we are doing and check for interrupt request
-	if (m_updater) {
+	    if (m_updater) {
 #ifdef IDX_THREADS
-	    PTMutexLocker locker(m_updater->m_mutex);
+		PTMutexLocker locker(m_updater->m_mutex);
 #endif
-	    ++(m_updater->status.docsdone);
+		++(m_updater->status.docsdone);
-            if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
+		if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
-                m_updater->status.dbtotdocs = m_updater->status.docsdone;
+		    m_updater->status.dbtotdocs = m_updater->status.docsdone;
-            m_updater->status.fn = fn;
+		m_updater->status.fn = fn;
-            if (!doc.ipath.empty())
+		if (!doc.ipath.empty())
-                m_updater->status.fn += "|" + doc.ipath;
+		    m_updater->status.fn += "|" + doc.ipath;
-            if (!m_updater->update()) {
+		if (!m_updater->update()) {
-                return FsTreeWalker::FtwStop;
+		    return FsTreeWalker::FtwStop;
-            }
+		}
 	    }
 	}
    }
-    // If this doc existed and it's a container, recording for
+	// If this doc existed and it's a container, recording for
-    // possible subdoc purge (this will be used only if we don't do a
+	// possible subdoc purge (this will be used only if we don't do a
-    // db-wide purge, e.g. if we're called from indexfiles()).
+	// db-wide purge, e.g. if we're called from indexfiles()).
-    LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
+	LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
-	     existingDoc, hadNonNullIpath));
+		 existingDoc, hadNonNullIpath));
-    if (existingDoc && hadNonNullIpath) {
+	if (existingDoc && hadNonNullIpath) {
-	m_purgeCandidates.record(parent_udi);
+	    m_purgeCandidates.record(parent_udi);
 	}
    }
    // If we had no instance with a null ipath, we create an empty
    // document to stand for the file itself, to be used mainly for up
    // to date checks. Typically this happens for an mbox file.
-    if (hadNullIpath == false) {
+    //
-	LOGDEB1(("Creating empty doc for file\n"));
+    // If xattronly is set, ONLY the extattr metadata is valid and will be used
    // by the following step.
    if (xattronly || hadNullIpath == false) {
 	LOGDEB(("Creating empty doc for file or pure xattr update\n"));
 	Rcl::Doc fileDoc;
-	fileDoc.fmtime = ascdate;
+	if (xattronly) {
-	fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
+	    map<string, string> xfields;
-	fileDoc.haschildren = true;
+	    reapXAttrs(config, fn, xfields);
-	fileDoc.mimetype = interner.getMimetype();
+	    docFieldsFromXattrs(config, xfields, fileDoc);
-	fileDoc.url = cstr_fileu + fn;
+	    fileDoc.onlyxattr = true;
-        if (m_havelocalfields) 
+	} else {
-            setlocalfields(localfields, fileDoc);
+	    fileDoc.fmtime = ascdate;
-	char cbuf[100]; 
+	    fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
-	sprintf(cbuf, "%lld", (long long)stp->st_size);
+	    fileDoc.haschildren = true;
-	fileDoc.pcbytes = cbuf;
+	    fileDoc.mimetype = mimetype;
 	    fileDoc.url = cstr_fileu + fn;
 	    if (m_havelocalfields) 
 		setlocalfields(localfields, fileDoc);
 	    char cbuf[100]; 
 	    sprintf(cbuf, "%lld", (long long)stp->st_size);
 	    fileDoc.pcbytes = cbuf;
 	}
 	fileDoc.sig = sig;
 #ifdef IDX_THREADS
--- a/src/index/fsindexer.h
+++ b/src/index/fsindexer.h
@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB {
    string       m_slocalfields;
    map<string, string>  m_localfields;
    // Activate detection of xattr-only document updates. Experimental, so
    // needs a config option
    bool         m_detectxattronly;
 #ifdef IDX_THREADS
    friend void *FsIndexerDbUpdWorker(void*);
    friend void *FsIndexerInternfileWorker(void*);
--- a/src/index/rclmonrcv.cpp
+++ b/src/index/rclmonrcv.cpp
@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
    code &= ~(IN_ISDIR|IN_ONESHOT);
    switch (code) {
    case IN_ACCESS: return "IN_ACCESS";
    case IN_MODIFY: return "IN_MODIFY";
    case IN_ATTRIB: return "IN_ATTRIB";
    case IN_CLOSE: return "IN_CLOSE";
    case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
    case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
    case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
    case IN_CLOSE: return "IN_CLOSE";
    case IN_OPEN: return "IN_OPEN";
    case IN_MOVED_FROM: return "IN_MOVED_FROM";
    case IN_MOVED_TO: return "IN_MOVED_TO";
    case IN_MOVE: return "IN_MOVE";
    case IN_CREATE: return "IN_CREATE";
    case IN_DELETE: return "IN_DELETE";
    case IN_DELETE_SELF: return "IN_DELETE_SELF";
    case IN_IGNORED: return "IN_IGNORED";
    case IN_MODIFY: return "IN_MODIFY";
    case IN_MOVE: return "IN_MOVE";
    case IN_MOVED_FROM: return "IN_MOVED_FROM";
    case IN_MOVED_TO: return "IN_MOVED_TO";
    case IN_MOVE_SELF: return "IN_MOVE_SELF";
    case IN_OPEN: return "IN_OPEN";
    case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
    case IN_UNMOUNT: return "IN_UNMOUNT";
    case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
    case IN_IGNORED: return "IN_IGNORED";
    default: {
 	static char msg[50];
 	sprintf(msg, "Unknown event 0x%x", code);
@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool)
    uint32_t mask = IN_MODIFY | IN_CREATE
        | IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
 #ifdef RCL_USE_XATTR
-	// It seems that IN_ATTRIB is not needed to receive extattr
+	// IN_ATTRIB used to be not needed to receive extattr
-	// modification events, which is a bit weird because only ctime is
+	// modification events, which was a bit weird because only ctime is
-	// set. 
+	// set, and now it is...
-	// | IN_ATTRIB
+	| IN_ATTRIB
 #endif
 #ifdef IN_DONT_FOLLOW
 	| IN_DONT_FOLLOW
@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
 	eraseWatchSubTree(m_idtopath, ev.m_path);
    }
-    // IN_ATTRIB apparently not needed, see comment above
+    // IN_ATTRIB used to be not needed, but now it is
-    if (evp->mask & (IN_MODIFY)) {
+    if (evp->mask & (IN_MODIFY|IN_ATTRIB)) {
 	ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
    } else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
 	ev.m_etyp = RclMonEvent::RCLEVT_DELETE;
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi)
    return false;
 }
 // Drop a term from a document once its within-document frequency has
 // reached 0. Xapian does not do this by itself when the last posting
 // for a term is removed, so it has to be done explicitly.
 //
 // @param xdoc the document to clean up
 // @param term the term to check and possibly remove
 // @return false if the term list could not be searched or if the term
 //   is not part of the document. true in all other cases, including
 //   when the term still has a non-zero wdf and when the removal itself
 //   fails (the failure is only logged).
 bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
 {
    LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));

    // Position an iterator on the term inside the document term list
    Xapian::TermIterator xit;
    XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
 	   xrdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
 	LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n", 
 		term.c_str(), m_rcldb->m_reason.c_str()));
 	return false;
    }

    // The iterator may have stopped on another term or reached the end
    // of the list: both mean that the document does not hold the term.
    const bool atend = (xit == xdoc.termlist_end());
    if (atend || term.compare(*xit) != 0) {
 	LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n", 
 		 term.c_str(), atend ? "EOL" : (*xit).c_str()));
 	return false;
    }

    // Term still in use: leave it alone
    if (xit.get_wdf() != 0)
 	return true;

    // Frequency is 0: remove the term. A failure here is not fatal.
    LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
    XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
 	LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n", 
 		 term.c_str(), m_rcldb->m_reason.c_str()));
    }
    return true;
 }
 // Holder for term + pos
 struct DocPosting {
    DocPosting(string t, Xapian::termpos ps)
 	: term(t), pos(ps) {}
    string term;
    Xapian::termpos pos;
 };
 // Clear all terms for given field for given document.
 // The terms to be cleared are all those with the appropriate
 // prefix. We also remove the postings for the unprefixed terms (that
 // is, we undo what we did when indexing).
 //
 // @param xdoc the document to modify
 // @param pfx the field prefix (unwrapped), selecting the terms to clear
 // @param wdfdec wdf decrement passed to remove_posting() for each
 //   removed posting
 // @return false if the list of postings to erase could not be built,
 //   else true. Failures to remove individual postings are only
 //   logged and do not change the return value.
 bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
 			    Xapian::termcount wdfdec)
 {
    LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
 	     pfx.c_str(), unsigned(xdoc.get_docid())));

    vector<DocPosting> eraselist;

    string wrapd = wrap_prefix(pfx);

    for (int tries = 0; tries < 2; tries++) {
 	// Start every attempt from a clean state. Without this, a
 	// successful retry after a DatabaseModifiedError would still
 	// be reported as a failure because of the stale error message,
 	// and postings collected by the aborted attempt would be
 	// listed twice.
 	m_rcldb->m_reason.clear();
 	eraselist.clear();
 	try {
 	    Xapian::TermIterator xit;
 	    xit = xdoc.termlist_begin();
 	    xit.skip_to(wrapd);
 	    // Walk all the terms which begin with the wrapped prefix
 	    while (xit != xdoc.termlist_end() && 
 		!(*xit).compare(0, wrapd.size(), wrapd)) {
 		LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str()));
 		Xapian::PositionIterator posit;
 		for (posit = xit.positionlist_begin();
 		     posit != xit.positionlist_end(); ++posit) {
 		    // Record both the prefixed and the unprefixed posting
 		    eraselist.push_back(DocPosting(*xit, *posit));
 		    eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
 		}
 		++xit;
 	    }
 	} catch (const Xapian::DatabaseModifiedError &e) {
 	    // The database changed under us: reopen and try once more
 	    m_rcldb->m_reason = e.get_msg();
 	    xrdb.reopen();
 	    continue;
 	} XCATCHERROR(m_rcldb->m_reason);
 	break;
    }
    if (!m_rcldb->m_reason.empty()) {
 	LOGERR(("Db::clearField: failed building erase list: %s\n", 
 		m_rcldb->m_reason.c_str()));
 	return false;
    }

    // Now remove the found positions, and the terms if the wdf is 0
    for (vector<DocPosting>::const_iterator it = eraselist.begin();
 	 it != eraselist.end(); ++it) {
 	LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n", 
 		 it->term.c_str(), int(it->pos)));
 	XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, 
 	       xwdb,m_rcldb->m_reason);
 	if (!m_rcldb->m_reason.empty()) {
 	    // Note that this normally fails for non-prefixed XXST and
 	    // ND, don't make a fuss
 	    LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n",
 		     it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str()));
 	}
 	clearDocTermIfWdf0(xdoc, it->term);
    }
    return true;
 }
 // Check if doc given by udi is indexed by term
 bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
 {
@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
 {
 #ifdef IDX_THREADS
    Chrono chron;
-    // In the case where there is a separate (single) db update
+    PTMutexLocker lock(m_mutex);
    // thread, we only need to protect the update map update below
    // (against interaction with threads calling needUpdate()). Else,
    // all threads from above need to synchronize here
    PTMutexLocker lock(m_mutex, m_havewriteq);
 #endif
    // Check file system full every mbyte of indexed text. It's a bit wasteful
@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
    try {
 	Xapian::docid did = 
 	    xwdb.replace_document(uniterm, newdocument);
 #ifdef IDX_THREADS
 	// Need to protect against interaction with the up-to-date checks
 	// which also update the existence map
 	PTMutexLocker lock(m_mutex, !m_havewriteq);
 #endif
 	if (did < m_rcldb->updated.size()) {
 	    m_rcldb->updated[did] = true;
 	    LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
    return false;
 }
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
@ -1165,276 +1259,287 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
    TextSplitDb splitter(newdocument, nxt);
    tpidx.setTSD(&splitter);
    // If the ipath is like a path, index the last element. This is
    // for compound documents like zip and chm for which the filter
    // uses the file path as ipath. 
    if (!doc.ipath.empty() && 
 	doc.ipath.find_first_not_of("0123456789") != string::npos) {
 	string utf8ipathlast;
 	// There is no way in hell we could have an idea of the
 	// charset here, so let's hope it's ascii or utf-8. We call
 	// transcode to strip the bad chars and pray
 	if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
 		      "UTF-8", "UTF-8")) {
 	    splitter.text_to_words(utf8ipathlast);
 	}
    }
    // Split and index the path from the url for path-based filtering
    {
 	string path = url_gpath(doc.url);
 	vector<string> vpath;
 	stringToTokens(path, vpath, "/");
 	// If vpath is not /, the last elt is the file/dir name, not a
 	// part of the path.
 	if (vpath.size())
 	    vpath.resize(vpath.size()-1);
 	splitter.curpos = 0;
 	newdocument.add_posting(wrap_prefix(pathelt_prefix),
 				splitter.basepos + splitter.curpos++);
 	for (vector<string>::iterator it = vpath.begin(); 
 	     it != vpath.end(); it++){
 	    if (it->length() > 230) {
 		// Just truncate it. May still be useful because of wildcards
 		*it = it->substr(0, 230);
 	    }
 	    newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
 				    splitter.basepos + splitter.curpos++);
 	}
    }
    // Index textual metadata.  These are all indexed as text with
    // positions, as we may want to do phrase searches with them (this
    // makes no sense for keywords by the way).
    //
    // The order has no importance, and we set a position gap of 100
    // between fields to avoid false proximity matches.
    map<string, string>::iterator meta_it;
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
 	if (!meta_it->second.empty()) {
 	    const FieldTraits *ftp;
 	    // We don't test for an empty prefix here. Some fields are part
 	    // of the internal conf with an empty prefix (ie: abstract).
 	    if (!fieldToTraits(meta_it->first, &ftp)) {
 		LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
 			 meta_it->first.c_str()));
 		continue;
 	    }
 	    LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
 		     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
 		     meta_it->second.c_str()));
 	    splitter.setprefix(ftp->pfx);
 	    splitter.setwdfinc(ftp->wdfinc);
 	    if (!splitter.text_to_words(meta_it->second))
                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
                        meta_it->first.c_str()));
 	}
    }
    splitter.setprefix(string());
    splitter.setwdfinc(1);
    if (splitter.curpos < baseTextPosition)
 	splitter.basepos = baseTextPosition;
    // Split and index body text
    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
 #ifdef TEXTSPLIT_STATS
    splitter.resetStats();
 #endif
    if (!splitter.text_to_words(doc.text))
        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
 #ifdef TEXTSPLIT_STATS
    // Reject bad data. unrecognized base64 text is characterized by
    // high avg word length and high variation (because there are
    // word-splitters like +/ inside the data).
    TextSplit::Stats::Values v = splitter.getStats();
    // v.avglen > 15 && v.sigma > 12 
    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
 	LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
 	 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
 		 v.count, v.avglen, v.sigma, doc.url.c_str(), 
 		 doc.ipath.c_str(), doc.text.c_str()));
 	return true;
    }
 #endif
    ////// Special terms for other metadata. No positions for these.
    // Mime type
    newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
    // Simple file name indexed unsplit for specific "file name"
    // searches. This is not the same as a filename: clause inside the
    // query language.
    // We also add a term for the filename extension if any.
    string utf8fn;
    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
 	string fn;
 	if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
 	    // We should truncate after extracting the extension, but this is
 	    // a pathological case anyway
 	    if (fn.size() > 230)
 		utf8truncate(fn, 230);
 	    string::size_type pos = fn.rfind('.');
 	    if (pos != string::npos && pos != fn.length() - 1) {
 		newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
 					     fn.substr(pos + 1));
 	    }
 	    newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
 	}
    }
    // Udi unique term: this is used for file existence/uptodate
    // checks, and unique id for the replace_document() call.
    string uniterm = make_uniterm(udi);
    newdocument.add_boolean_term(uniterm);
    // Parent term. This is used to find all descendents, mostly to delete them 
    // when the parent goes away
    if (!parent_udi.empty()) {
 	newdocument.add_boolean_term(make_parentterm(parent_udi));
    }
    // Dates etc.
    time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
 			 doc.dmtime.c_str());
    struct tm *tm = localtime(&mtime);
    char buf[9];
    snprintf(buf, 9, "%04d%02d%02d",
 	    tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
    // Date (YYYYMMDD)
    newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
    // Month (YYYYMM)
    buf[6] = '\0';
    newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
    // Year (YYYY)
    buf[4] = '\0';
    newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
-
+    if (doc.onlyxattr) {
-    //////////////////////////////////////////////////////////////////
+	// Only updating an existing doc with new extended attributes
-    // Document data record. omindex has the following nl separated fields:
+	// data.  Need to read the old doc and its data record
-    // - url
+	// first. This is so different from the normal processing that
-    // - sample
+	// it uses a fully separate code path (with some duplication
-    // - caption (title limited to 100 chars)
+	// unfortunately)
-    // - mime type 
+	if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
-    //
+	    return false;
    // The title, author, abstract and keywords fields are special,
    // they always get stored in the document data
    // record. Configurable other fields can be, too.
    //
    // We truncate stored fields abstract, title and keywords to
    // reasonable lengths and suppress newlines (so that the data
    // record can keep a simple syntax)
    string record;
    RECORD_APPEND(record, Doc::keyurl, doc.url);
    RECORD_APPEND(record, Doc::keytp, doc.mimetype);
    // We left-zero-pad the times so that they are lexico-sortable
    leftzeropad(doc.fmtime, 11);
    RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
    if (!doc.dmtime.empty()) {
 	leftzeropad(doc.dmtime, 11);
 	RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
    }
    RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
    if (doc.fbytes.empty())
 	doc.fbytes = doc.pcbytes;
    if (!doc.fbytes.empty()) {
 	RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
 	leftzeropad(doc.fbytes, 12);
 	newdocument.add_value(VALUE_SIZE, doc.fbytes);
    }
    if (doc.haschildren) {
 	newdocument.add_boolean_term(has_children_term);
    }	
    if (!doc.pcbytes.empty())
 	RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
    char sizebuf[30]; 
    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
    RECORD_APPEND(record, Doc::keyds, sizebuf);
    // Note that we add the signature both as a value and in the data record
    if (!doc.sig.empty()) {
 	RECORD_APPEND(record, Doc::keysig, doc.sig);
 	newdocument.add_value(VALUE_SIG, doc.sig);
    }
    if (!doc.ipath.empty())
 	RECORD_APPEND(record, Doc::keyipt, doc.ipath);
    doc.meta[Doc::keytt] = 
 	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
    if (!doc.meta[Doc::keytt].empty())
 	RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
    trimstring(doc.meta[Doc::keykw], " \t\r\n");
    doc.meta[Doc::keykw] = 
 	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
    // No need to explicitly append the keywords, this will be done by 
    // the "stored" loop
    // If abstract is empty, we make up one with the beginning of the
    // document. This is then not indexed, but part of the doc data so
    // that we can return it to a query without having to decode the
    // original file.
    bool syntabs = false;
    // Note that the map accesses by operator[] create empty entries if they
    // don't exist yet.
    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
    if (doc.meta[Doc::keyabs].empty()) {
 	syntabs = true;
 	if (!doc.text.empty())
 	    doc.meta[Doc::keyabs] = cstr_syntAbs + 
 		neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
    } else {
 	doc.meta[Doc::keyabs] = 
 	    neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
 		      cstr_nc);
    }
-    const set<string>& stored = m_config->getStoredFields();
+	// If the ipath is like a path, index the last element. This is
-    for (set<string>::const_iterator it = stored.begin();
+	// for compound documents like zip and chm for which the filter
-	 it != stored.end(); it++) {
+	// uses the file path as ipath. 
-	string nm = m_config->fieldCanon(*it);
+	if (!doc.ipath.empty() && 
-	if (!doc.meta[nm].empty()) {
+	    doc.ipath.find_first_not_of("0123456789") != string::npos) {
-	    string value = 
+	    string utf8ipathlast;
-		neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+	    // There is no way in hell we could have an idea of the
-	    RECORD_APPEND(record, nm, value);
+	    // charset here, so let's hope it's ascii or utf-8. We call
 	    // transcode to strip the bad chars and pray
 	    if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
 			  "UTF-8", "UTF-8")) {
 		splitter.text_to_words(utf8ipathlast);
 	    }
 	}
    }
-    // If empty pages (multiple break at same pos) were recorded, save
+	// Split and index the path from the url for path-based filtering
-    // them (this is because we have no way to record them in the
+	{
-    // Xapian list
+	    string path = url_gpath(doc.url);
-    if (!tpidx.m_pageincrvec.empty()) {
+	    vector<string> vpath;
-	ostringstream multibreaks;
+	    stringToTokens(path, vpath, "/");
-	for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
+	    // If vpath is not /, the last elt is the file/dir name, not a
-	    if (i != 0)
+	    // part of the path.
-		multibreaks << ",";
+	    if (vpath.size())
-	    multibreaks << tpidx.m_pageincrvec[i].first << "," << 
+		vpath.resize(vpath.size()-1);
-		tpidx.m_pageincrvec[i].second;
+	    splitter.curpos = 0;
 	    newdocument.add_posting(wrap_prefix(pathelt_prefix),
 				    splitter.basepos + splitter.curpos++);
 	    for (vector<string>::iterator it = vpath.begin(); 
 		 it != vpath.end(); it++){
 		if (it->length() > 230) {
 		    // Just truncate it. May still be useful because of wildcards
 		    *it = it->substr(0, 230);
 		}
 		newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
 					splitter.basepos + splitter.curpos++);
 	    }
 	}
-	RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
+
 	// Index textual metadata.  These are all indexed as text with
 	// positions, as we may want to do phrase searches with them (this
 	// makes no sense for keywords by the way).
 	//
 	// The order has no importance, and we set a position gap of 100
 	// between fields to avoid false proximity matches.
 	map<string, string>::iterator meta_it;
 	for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
 	    if (!meta_it->second.empty()) {
 		const FieldTraits *ftp;
 		// We don't test for an empty prefix here. Some fields are part
 		// of the internal conf with an empty prefix (ie: abstract).
 		if (!fieldToTraits(meta_it->first, &ftp)) {
 		    LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
 			     meta_it->first.c_str()));
 		    continue;
 		}
 		LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
 			 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
 			 meta_it->second.c_str()));
 		splitter.setprefix(ftp->pfx);
 		splitter.setwdfinc(ftp->wdfinc);
 		if (!splitter.text_to_words(meta_it->second))
 		    LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
 			    meta_it->first.c_str()));
 	    }
 	}
 	splitter.setprefix(string());
 	splitter.setwdfinc(1);
 	if (splitter.curpos < baseTextPosition)
 	    splitter.basepos = baseTextPosition;
 	// Split and index body text
 	LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
 #ifdef TEXTSPLIT_STATS
 	splitter.resetStats();
 #endif
 	if (!splitter.text_to_words(doc.text))
 	    LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
 #ifdef TEXTSPLIT_STATS
 	// Reject bad data. unrecognized base64 text is characterized by
 	// high avg word length and high variation (because there are
 	// word-splitters like +/ inside the data).
 	TextSplit::Stats::Values v = splitter.getStats();
 	// v.avglen > 15 && v.sigma > 12 
 	if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
 	    LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
 		     "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
 		     v.count, v.avglen, v.sigma, doc.url.c_str(), 
 		     doc.ipath.c_str(), doc.text.c_str()));
 	    return true;
 	}
 #endif
 	////// Special terms for other metadata. No positions for these.
 	// Mime type
 	newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
 	// Simple file name indexed unsplit for specific "file name"
 	// searches. This is not the same as a filename: clause inside the
 	// query language.
 	// We also add a term for the filename extension if any.
 	string utf8fn;
 	if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
 	    string fn;
 	    if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
 		// We should truncate after extracting the extension, but this is
 		// a pathological case anyway
 		if (fn.size() > 230)
 		    utf8truncate(fn, 230);
 		string::size_type pos = fn.rfind('.');
 		if (pos != string::npos && pos != fn.length() - 1) {
 		    newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
 						 fn.substr(pos + 1));
 		}
 		newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
 	    }
 	}
 	newdocument.add_boolean_term(uniterm);
 	// Parent term. This is used to find all descendents, mostly
 	// to delete them when the parent goes away
 	if (!parent_udi.empty()) {
 	    newdocument.add_boolean_term(make_parentterm(parent_udi));
 	}
 	// Dates etc.
 	time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
 			     doc.dmtime.c_str());
 	struct tm *tm = localtime(&mtime);
 	char buf[9];
 	snprintf(buf, 9, "%04d%02d%02d",
 		 tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
 	// Date (YYYYMMDD)
 	newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
 	// Month (YYYYMM)
 	buf[6] = '\0';
 	newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
 	// Year (YYYY)
 	buf[4] = '\0';
 	newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
 	//////////////////////////////////////////////////////////////////
 	// Document data record. omindex has the following nl separated fields:
 	// - url
 	// - sample
 	// - caption (title limited to 100 chars)
 	// - mime type 
 	//
 	// The title, author, abstract and keywords fields are special,
 	// they always get stored in the document data
 	// record. Configurable other fields can be, too.
 	//
 	// We truncate stored fields abstract, title and keywords to
 	// reasonable lengths and suppress newlines (so that the data
 	// record can keep a simple syntax)
 	string record;
 	RECORD_APPEND(record, Doc::keyurl, doc.url);
 	RECORD_APPEND(record, Doc::keytp, doc.mimetype);
 	// We left-zero-pad the times so that they are lexico-sortable
 	leftzeropad(doc.fmtime, 11);
 	RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
 	if (!doc.dmtime.empty()) {
 	    leftzeropad(doc.dmtime, 11);
 	    RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
 	}
 	RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
 	if (doc.fbytes.empty())
 	    doc.fbytes = doc.pcbytes;
 	if (!doc.fbytes.empty()) {
 	    RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
 	    leftzeropad(doc.fbytes, 12);
 	    newdocument.add_value(VALUE_SIZE, doc.fbytes);
 	}
 	if (doc.haschildren) {
 	    newdocument.add_boolean_term(has_children_term);
 	}	
 	if (!doc.pcbytes.empty())
 	    RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
 	char sizebuf[30]; 
 	sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
 	RECORD_APPEND(record, Doc::keyds, sizebuf);
 	// Note that we add the signature both as a value and in the data record
 	if (!doc.sig.empty()) {
 	    RECORD_APPEND(record, Doc::keysig, doc.sig);
 	    newdocument.add_value(VALUE_SIG, doc.sig);
 	}
 	if (!doc.ipath.empty())
 	    RECORD_APPEND(record, Doc::keyipt, doc.ipath);
 	doc.meta[Doc::keytt] = 
 	    neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
 	if (!doc.meta[Doc::keytt].empty())
 	    RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
 	trimstring(doc.meta[Doc::keykw], " \t\r\n");
 	doc.meta[Doc::keykw] = 
 	    neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
 	// No need to explicitly append the keywords, this will be done by 
 	// the "stored" loop
 	// If abstract is empty, we make up one with the beginning of the
 	// document. This is then not indexed, but part of the doc data so
 	// that we can return it to a query without having to decode the
 	// original file.
 	bool syntabs = false;
 	// Note that the map accesses by operator[] create empty entries if they
 	// don't exist yet.
 	trimstring(doc.meta[Doc::keyabs], " \t\r\n");
 	if (doc.meta[Doc::keyabs].empty()) {
 	    syntabs = true;
 	    if (!doc.text.empty())
 		doc.meta[Doc::keyabs] = cstr_syntAbs + 
 		    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
 	} else {
 	    doc.meta[Doc::keyabs] = 
 		neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
 			  cstr_nc);
 	}
 	const set<string>& stored = m_config->getStoredFields();
 	for (set<string>::const_iterator it = stored.begin();
 	     it != stored.end(); it++) {
 	    string nm = m_config->fieldCanon(*it);
 	    if (!doc.meta[nm].empty()) {
 		string value = 
 		    neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
 		RECORD_APPEND(record, nm, value);
 	    }
 	}
 	// If empty pages (multiple break at same pos) were recorded, save
 	// them (this is because we have no way to record them in the
 	// Xapian list
 	if (!tpidx.m_pageincrvec.empty()) {
 	    ostringstream multibreaks;
 	    for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
 		if (i != 0)
 		    multibreaks << ",";
 		multibreaks << tpidx.m_pageincrvec[i].first << "," << 
 		    tpidx.m_pageincrvec[i].second;
 	    }
 	    RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
 	}
 	// If the file's md5 was computed, add value and term. 
 	// The value is optionally used for query result duplicate elimination, 
 	// and the term to find the duplicates.
 	// We don't do this for empty docs.
 	const string *md5;
 	if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
 	    md5->compare(cstr_md5empty)) {
 	    string digest;
 	    MD5HexScan(*md5, digest);
 	    newdocument.add_value(VALUE_MD5, digest);
 	    newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
 	}
 	LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
 	newdocument.set_data(record);
    }
    // If the file's md5 was computed, add value and term. 
    // The value is optionally used for query result duplicate elimination, 
    // and the term to find the duplicates.
    // We don't do this for empty docs.
    const string *md5;
    if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
 	md5->compare(cstr_md5empty)) {
 	string digest;
 	MD5HexScan(*md5, digest);
 	newdocument.add_value(VALUE_MD5, digest);
 	newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
    }
    LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
    newdocument.set_data(record);
 #ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
 	DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, 
@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 				   doc.text.length());
 }
 /**
  * Update an already indexed Xapian document for a change limited to
  * metadata fields (pure extended attributes change), without touching
  * the terms produced from the document text.
  *
  * The existing document for udi is read into xdoc. For each entry of
  * doc.meta which maps to a field with a non-empty index prefix, the
  * terms previously indexed for the field are cleared and the new value
  * is split again. The signature value is then replaced, and the stored
  * fields and signature inside the data record are refreshed.
  *
  * @param splitter text splitter used for the new field values
  *   (presumably set up by the caller to emit terms into xdoc - to be
  *   confirmed against the caller).
  * @param udi unique identifier of the document to update.
  * @param doc input document. Only its meta map and sig are used here.
  * @param xdoc receives the existing Xapian document, then modified in place.
  * @return false if the existing document can't be found, or if its data
  *   record can't be read or parsed. true otherwise.
  */
 bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
                                     Doc &doc, Xapian::Document& xdoc)
 {
     LOGDEB0(("Db::docToXdocXattrOnly\n"));
     PTMutexLocker lock(m_mutex);

     // Read existing document and its data record
     if (getDoc(udi, 0, xdoc) == 0) {
         LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
         return false;
     }
     string data;
     XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
     if (!m_rcldb->m_reason.empty()) {
         LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
         return false;
     }

     // Clear the term lists for the incoming fields and index the new values
     map<string, string>::iterator meta_it;
     for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
         const FieldTraits *ftp;
         // Fields without a prefix are skipped: there would be no way to
         // tell their terms apart when clearing the previous values.
         if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
             LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
                      meta_it->first.c_str()));
             continue;
         }
         // Clear the previous terms for the field
         clearField(xdoc, ftp->pfx, ftp->wdfinc);
         LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n", 
                  meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
                  meta_it->second.c_str()));
         splitter->setprefix(ftp->pfx);
         splitter->setwdfinc(ftp->wdfinc);
         if (!splitter->text_to_words(meta_it->second))
             LOGDEB(("Db::xattrOnly: split failed for %s\n", 
                     meta_it->first.c_str()));
     }
     xdoc.add_value(VALUE_SIG, doc.sig);

     // Parse current data record into a dict for ease of processing
     ConfSimple datadic(data);
     if (!datadic.ok()) {
         LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
         return false;
     }

     // For each "stored" field, check if set in doc metadata and
     // update the value if it is
     const set<string>& stored = m_rcldb->m_config->getStoredFields();
     for (set<string>::const_iterator it = stored.begin();
          it != stored.end(); it++) {
         string nm = m_rcldb->m_config->fieldCanon(*it);
         if (doc.getmeta(nm, 0)) {
             string value = 
                 neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
             datadic.set(nm, value, "");
         }
     }

     // Replace the signature inside the dict. The existing record may
     // already hold a signature entry: appending the new one after
     // regenerating the record would leave both the old and the new
     // values in the data.
     datadic.set(Doc::keysig, doc.sig, "");

     // Recreate the record. We want to do this with the local RECORD_APPEND
     // method for consistency in format, instead of using ConfSimple print
     vector<string> names = datadic.getNames("");
     data.clear();
     for (vector<string>::const_iterator it = names.begin(); 
          it != names.end(); it++) {
         string value;
         datadic.get(*it, value, "");
         RECORD_APPEND(data, *it, value);
     }
     xdoc.set_data(data);
     return true;
 }
 #ifdef IDX_THREADS
 void Db::waitUpdIdle()
 {
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -237,6 +237,10 @@ class Db {
     */
    bool needUpdate(const string &udi, const string& sig, bool *existed=0);
    /** Tell if a systematic reindex is in progress: true when the
     *  o_inPlaceReset flag is set or when the db mode is DbTrunc.
     *  To be used as a complement to the needUpdate() result. */
    bool inFullReset()
    {
	if (o_inPlaceReset)
	    return true;
	return m_mode == DbTrunc;
    }
    /** Add or update document identified by unique identifier.
     * @param config Config object to use. Can be the same as the member config
     *   or a clone, to avoid sharing when called in multithread context.
--- a/src/rcldb/rcldb_p.h
+++ b/src/rcldb/rcldb_p.h
@ -66,6 +66,8 @@ public:
 };
 #endif // IDX_THREADS
 class TextSplitDb;
 // A class for data and methods that would have to expose
 // Xapian-specific stuff if they were in Rcl::Db. There could actually be
 // 2 different ones for indexing or query as there is not much in
@ -141,6 +143,16 @@ class Db::Native {
    /** Check if doc is indexed by term */
    bool hasTerm(const string& udi, int idxi, const string& term);
    /** Update existing Xapian document for pure extended attrs change */
    bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
 			    Doc &doc, Xapian::Document& xdoc);
    /** Remove all terms currently indexed for field defined by idx prefix */
    bool clearField(Xapian::Document& xdoc, const string& pfx, 
 		    Xapian::termcount wdfdec);
    /** Check if term wdf is 0 and remove term if so */
    bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
    /** Compute list of subdocuments for a given udi. We look for documents 
     * indexed by a parent term matching the udi, the posting list for the 
     * parentterm(udi)  (As suggested by James Aylett)
--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@ -131,6 +131,10 @@ class Doc {
    // ipath descendants.
    bool haschildren;
    // During indexing: only fields from extended attributes were set, no
    // doc content. Allows for faster reindexing of existing doc
    bool onlyxattr;
    ///////////////////////////////////////////////////////////////////
    void erase() {
@ -154,10 +158,11 @@ class Doc {
 	idxi = 0;
 	haspages = false;
 	haschildren = false;
 	onlyxattr = false;
    }
    Doc()
 	: idxi(0), syntabs(false), pc(0), xdocid(0),
-	  haspages(false), haschildren(false)
+	  haspages(false), haschildren(false), onlyxattr(false)
    {
    }
    /** Get value for named field. If value pointer is 0, just test existence */
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@ -13,7 +13,7 @@
 #####################################################
 # This section defines what prefix the terms inside named fields will be
 # indexed with (in addition to prefix-less indexing for general search)
-# ALL prefixes MUST be all UPPERCASE. 
+# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS)
 # 
 # The field names should be the canonic ones, not the aliases defined in
 # the following section. Don't change those which are predefined here, 
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@ -5,6 +5,7 @@ daemloglevel = 6
 daemlogfilename = /tmp/rclmontrace
 indexStripChars = 1
 detectxattronly = 1
 topdirs = /home/dockes/projets/fulltext/testrecoll/
--- a/tests/empty/empty.txt
+++ b/tests/empty/empty.txt
@ -1,2 +1,2 @@
 1 results
-application/x-fsdirectory	[file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm]	[emptyUniqueTerm]	4096	bytes	
+inode/directory	[file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm]	[emptyUniqueTerm]	4096	bytes	
--- a/tests/runtests.sh
+++ b/tests/runtests.sh
@ -11,6 +11,37 @@ if test ! x$reroot = x ; then
    rerootResults
 fi
 # Check that $1 designates an executable command which is not a directory.
 # A name containing a slash is tested as is. Any other name is looked up
 # in each directory listed in $PATH.
 # Returns 0 and sets the global iscmdresult to the path which was found,
 # else returns 1.
 iscmd()
 {
     cmd=$1
     case $cmd in
     */*)
         if test -x "$cmd" && test ! -d "$cmd" ; then
             # Also report the path in this case: checkcmds prints it
             iscmdresult=$cmd
             return 0
         fi
         return 1 ;;
     *)
         # Split PATH on colons into the positional parameters
         oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
         for d in "$@"; do
             # An empty PATH element designates the current directory
             test -n "$d" || d=.
             if test -x "$d/$cmd" && test ! -d "$d/$cmd" ; then
                 iscmdresult=$d/$cmd
                 return 0
             fi
         done
         return 1 ;;
     esac
 }
 # Print the location of each command named in the arguments, or a
 # "not found" message. Returns 1 if at least one command is missing,
 # else 0.
 checkcmds()
 {
     result=0
     for cmd in $*
     do
         # Missing command: note the failure and go on with the next one
         iscmd $cmd || { echo $cmd not found; result=1; continue; }
         echo $cmd is $iscmdresult
     done
     return $result
 }
 checkcmds recollq recollindex pxattr xadump || exit 1
 makeindex() {
  echo "Zeroing Index" 
  rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws
--- a/tests/xattr/fields
+++ b/tests/xattr/fields
@ -0,0 +1,4 @@
 [prefixes]
 myattr = XYXATA
 [stored]
 myattr =
--- a/tests/xattr/xattr.sh
+++ b/tests/xattr/xattr.sh
@ -0,0 +1,85 @@
 #!/bin/sh
 # Test extended attributes indexing. This should work both with
 # "detectxattronly" set or unset in the config, but should be run with
 # the variable set, because we test its function by exploiting a bug
 # (see comments further)
 #
 # We use the RECOLL_CONFTOP variable to add our own fields configuration
 thisdir=`dirname $0`
 topdir=$thisdir/..
 . $topdir/shared.sh
 initvariables $0
 RECOLL_CONFTOP=$thisdir
 export RECOLL_CONFTOP
 # Print the command line received as arguments, then execute it
 xrun()
 {
     echo $@
     $@
 }
 # Main test sequence. Everything which the subshell below prints is
 # filtered and compared with the reference output at the end.
 tstfile=${tstdata}/xattrs/tstxattrs.txt
 # Always start without the file, so that the first step really creates it
 rm -f $tstfile
 (
    # Create the file with an extended attribute, index, and query it
    # by content and field
    echo xattruniqueinfile > $tstfile
    xrun pxattr -n myattr -v xattrunique1 $tstfile
    xrun recollindex -Zi $tstfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile
    echo "1 result expected"
    xrun recollq myattr:xattrunique1 
    # Presumably waits so that the attribute change below happens at a
    # later second than the file creation (the indexer detects xattr-only
    # changes by comparing mtime and ctime) - TODO confirm
    sleep 1
    # Change the value for the field, check that the old value is gone
    # and the new works
    xrun pxattr -n myattr -v xattrunique2 $tstfile
    xrun recollindex -i $tstfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile
    echo "0 result expected:"
    xrun recollq myattr:xattrunique1 
    echo "1 result expected:"
    xrun recollq myattr:xattrunique2
    # Change the contents then the xattr. With xattronly set, recoll
    # should miss the contents change and index only the xattr. That's
    # a bug but we use it to check that pure xattr update indexing
    # works
    echo xattruniqueinfile1 > $tstfile
    # Presumably separates the data write from the attribute change, so
    # that the attribute change time is later than the data change time
    # - TODO confirm
    sleep 2
    xrun pxattr -n myattr -v xattrunique3 $tstfile
    xrun recollindex -i $tstfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile
    echo "0 result expected"
    xrun recollq xattruniqueinfile1
    echo "0 result expected:"
    xrun recollq myattr:xattrunique1 
    echo "0 result expected:"
    xrun recollq myattr:xattrunique2
    echo "1 result expected:"
    xrun recollq myattr:xattrunique3
    # Reset the index and check that the contents were seen all right
    xrun recollindex -Zi $tstfile
    echo "0 result expected"
    xrun recollq xattruniqueinfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile1
    echo "0 result expected:"
    xrun recollq myattr:xattrunique2
    echo "1 result expected:"
    xrun recollq myattr:xattrunique3
 # stderr goes to its own file, and the lines beginning with
 # 'Recoll query: ' are removed from the output before the comparison
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 # Compare with the reference output for this test, ignoring white space
 # differences. The differences are saved for checkresult (which is not
 # defined in this script, presumably in shared.sh).
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
 checkresult
--- a/tests/xattr/xattr.txt
+++ b/tests/xattr/xattr.txt
@ -0,0 +1,57 @@
 pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 1 result expected
 recollq xattruniqueinfile
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
 1 result expected
 recollq myattr:xattrunique1
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
 pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 1 result expected
 recollq xattruniqueinfile
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
 0 result expected:
 recollq myattr:xattrunique1
 0 results
 1 result expected:
 recollq myattr:xattrunique2
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
 pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 1 result expected
 recollq xattruniqueinfile
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
 0 result expected
 recollq xattruniqueinfile1
 0 results
 0 result expected:
 recollq myattr:xattrunique1
 0 results
 0 result expected:
 recollq myattr:xattrunique2
 0 results
 1 result expected:
 recollq myattr:xattrunique3
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	18	bytes	
 recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
 0 result expected
 recollq xattruniqueinfile
 0 results
 1 result expected
 recollq xattruniqueinfile1
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	19	bytes	
 0 result expected:
 recollq myattr:xattrunique2
 0 results
 1 result expected:
 recollq myattr:xattrunique3
 1 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt]	[tstxattrs.txt]	19	bytes
`@ -1,2 +1,2 @@`
	`1 results`	`1 results`
	`application/x-fsdirectory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes`	`inode/directory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes`