From 56a56500c1d8f46d213f0a04e8ad631ab5081a83 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 4 Oct 2013 10:57:11 +0200 Subject: [PATCH] Handle partial indexing of document restricted to metadata from extended attributes --- src/VERSION | 2 +- src/index/fsindexer.cpp | 250 ++++++++------ src/index/fsindexer.h | 4 + src/index/rclmonrcv.cpp | 30 +- src/rcldb/rcldb.cpp | 722 ++++++++++++++++++++++++--------------- src/rcldb/rcldb.h | 4 + src/rcldb/rcldb_p.h | 12 + src/rcldb/rcldoc.h | 7 +- src/sampleconf/fields | 2 +- tests/config/recoll.conf | 1 + tests/empty/empty.txt | 2 +- tests/runtests.sh | 31 ++ tests/xattr/fields | 4 + tests/xattr/xattr.sh | 85 +++++ tests/xattr/xattr.txt | 57 ++++ 15 files changed, 811 insertions(+), 402 deletions(-) create mode 100644 tests/xattr/fields create mode 100755 tests/xattr/xattr.sh create mode 100644 tests/xattr/xattr.txt diff --git a/src/VERSION b/src/VERSION index 83d5e73f..39893559 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.19.5 +1.20.0 diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 0a91349d..30f9b169 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -45,7 +45,7 @@ #include "cancelcheck.h" #include "rclinit.h" #include "execmd.h" - +#include "extrameta.h" // When using extended attributes, we have to use the ctime, because // this is all that gets set when the attributes are modified. @@ -104,7 +104,7 @@ public: FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) : m_config(cnf), m_db(db), m_updater(updfunc), - m_missing(new FSIFIMissingStore) + m_missing(new FSIFIMissingStore), m_detectxattronly(false) #ifdef IDX_THREADS , m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first), m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first) @@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) { LOGDEB1(("FsIndexer::FsIndexer\n")); m_havelocalfields = m_config->hasNameAnywhere("localfields"); + m_config->getConfParam("detectxattronly", &m_detectxattronly); #ifdef IDX_THREADS m_stableconfig = new RclConfig(*m_config); @@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config, bool existingDoc; bool needupdate = m_db->needUpdate(udi, sig, &existingDoc); + // If ctime (which we use for the sig) differs from mtime, then at most + // the extended attributes were changed, no need to index content. + // This unfortunately leaves open the case where the data was + // modified, then the extended attributes, in which case we will + // miss the data update. We would have to store both the mtime and + // the ctime to avoid this + bool xattronly = m_detectxattronly && !m_db->inFullReset() && + existingDoc && needupdate && (stp->st_mtime < stp->st_ctime); + if (!needupdate) { LOGDEB0(("processone: up to date: %s\n", fn.c_str())); if (m_updater) { @@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config, LOGDEB0(("processone: processing: [%s] %s\n", displayableBytes(stp->st_size).c_str(), fn.c_str())); - FileInterner interner(fn, stp, config, FileInterner::FIF_none); - if (!interner.ok()) { - // no indexing whatsoever in this case. This typically means that - // indexallfilenames is not set - return FsTreeWalker::FtwOk; - } - interner.setMissingStore(m_missing); - string utf8fn = compute_utf8fn(config, fn); // parent_udi is initially the same as udi, it will be used if there @@ -662,128 +664,152 @@ FsIndexer::processonefile(RclConfig *config, char ascdate[30]; sprintf(ascdate, "%ld", long(stp->st_mtime)); - FileInterner::Status fis = FileInterner::FIAgain; bool hadNullIpath = false; - bool hadNonNullIpath = false; - while (fis == FileInterner::FIAgain) { - doc.erase(); - try { - fis = interner.internfile(doc); - } catch (CancelExcept) { - LOGERR(("fsIndexer::processone: interrupted\n")); - return FsTreeWalker::FtwStop; - } + string mimetype; - // We index at least the file name even if there was an error. - // We'll change the signature to ensure that the indexing will - // be retried every time. + if (!xattronly) { + FileInterner interner(fn, stp, config, FileInterner::FIF_none); + if (!interner.ok()) { + // no indexing whatsoever in this case. This typically means that + // indexallfilenames is not set + return FsTreeWalker::FtwOk; + } + mimetype = interner.getMimetype(); - // Internal access path for multi-document files. If empty, this is - // for the main file. - if (doc.ipath.empty()) { - hadNullIpath = true; - if (hadNonNullIpath) { - // Note that only the filters can reliably compute - // this. What we do is dependant of the doc order (if - // we see the top doc first, we won't set the flag) - doc.haschildren = true; + interner.setMissingStore(m_missing); + FileInterner::Status fis = FileInterner::FIAgain; + bool hadNonNullIpath = false; + while (fis == FileInterner::FIAgain) { + doc.erase(); + try { + fis = interner.internfile(doc); + } catch (CancelExcept) { + LOGERR(("fsIndexer::processone: interrupted\n")); + return FsTreeWalker::FtwStop; } - } else { - hadNonNullIpath = true; - make_udi(fn, doc.ipath, udi); - } - // Set file name, mod time and url if not done by filter - if (doc.fmtime.empty()) - doc.fmtime = ascdate; - if (doc.url.empty()) - doc.url = cstr_fileu + fn; - const string *fnp = 0; - if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty()) - doc.meta[Rcl::Doc::keyfn] = utf8fn; + // We index at least the file name even if there was an error. + // We'll change the signature to ensure that the indexing will + // be retried every time. - char cbuf[100]; - sprintf(cbuf, "%lld", (long long)stp->st_size); - doc.pcbytes = cbuf; - // Document signature for up to date checks. All subdocs inherit the - // file's. - doc.sig = sig; - - // If there was an error, ensure indexing will be - // retried. This is for the once missing, later installed - // filter case. It can make indexing much slower (if there are - // myriads of such files, the ext script is executed for them - // and fails every time) - if (fis == FileInterner::FIError) { - doc.sig += cstr_plus; - } - - // Possibly add fields from local config - if (m_havelocalfields) - setlocalfields(localfields, doc); - - // Add document to database. If there is an ipath, add it as a children - // of the file document. -#ifdef IDX_THREADS - if (m_haveSplitQ) { - DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ? - cstr_null : parent_udi, doc); - if (!m_dwqueue.put(tp)) { - LOGERR(("processonefile: wqueue.put failed\n")); - return FsTreeWalker::FtwError; - } - } else { -#endif - if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? - cstr_null : parent_udi, doc)) { - return FsTreeWalker::FtwError; + // Internal access path for multi-document files. If empty, this is + // for the main file. + if (doc.ipath.empty()) { + hadNullIpath = true; + if (hadNonNullIpath) { + // Note that only the filters can reliably compute + // this. What we do is dependant of the doc order (if + // we see the top doc first, we won't set the flag) + doc.haschildren = true; + } + } else { + hadNonNullIpath = true; + make_udi(fn, doc.ipath, udi); } + + // Set file name, mod time and url if not done by filter + if (doc.fmtime.empty()) + doc.fmtime = ascdate; + if (doc.url.empty()) + doc.url = cstr_fileu + fn; + const string *fnp = 0; + if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty()) + doc.meta[Rcl::Doc::keyfn] = utf8fn; + + char cbuf[100]; + sprintf(cbuf, "%lld", (long long)stp->st_size); + doc.pcbytes = cbuf; + // Document signature for up to date checks. All subdocs inherit the + // file's. + doc.sig = sig; + + // If there was an error, ensure indexing will be + // retried. This is for the once missing, later installed + // filter case. It can make indexing much slower (if there are + // myriads of such files, the ext script is executed for them + // and fails every time) + if (fis == FileInterner::FIError) { + doc.sig += cstr_plus; + } + + // Possibly add fields from local config + if (m_havelocalfields) + setlocalfields(localfields, doc); + + // Add document to database. If there is an ipath, add it + // as a child of the file document. #ifdef IDX_THREADS - } + if (m_haveSplitQ) { + DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ? + cstr_null : parent_udi, doc); + if (!m_dwqueue.put(tp)) { + LOGERR(("processonefile: wqueue.put failed\n")); + return FsTreeWalker::FtwError; + } + } else { +#endif + if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? + cstr_null : parent_udi, doc)) { + return FsTreeWalker::FtwError; + } +#ifdef IDX_THREADS + } #endif - // Tell what we are doing and check for interrupt request - if (m_updater) { + // Tell what we are doing and check for interrupt request + if (m_updater) { #ifdef IDX_THREADS - PTMutexLocker locker(m_updater->m_mutex); + PTMutexLocker locker(m_updater->m_mutex); #endif - ++(m_updater->status.docsdone); - if (m_updater->status.dbtotdocs < m_updater->status.docsdone) - m_updater->status.dbtotdocs = m_updater->status.docsdone; - m_updater->status.fn = fn; - if (!doc.ipath.empty()) - m_updater->status.fn += "|" + doc.ipath; - if (!m_updater->update()) { - return FsTreeWalker::FtwStop; - } + ++(m_updater->status.docsdone); + if (m_updater->status.dbtotdocs < m_updater->status.docsdone) + m_updater->status.dbtotdocs = m_updater->status.docsdone; + m_updater->status.fn = fn; + if (!doc.ipath.empty()) + m_updater->status.fn += "|" + doc.ipath; + if (!m_updater->update()) { + return FsTreeWalker::FtwStop; + } + } } - } - // If this doc existed and it's a container, recording for - // possible subdoc purge (this will be used only if we don't do a - // db-wide purge, e.g. if we're called from indexfiles()). - LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n", - existingDoc, hadNonNullIpath)); - if (existingDoc && hadNonNullIpath) { - m_purgeCandidates.record(parent_udi); + // If this doc existed and it's a container, recording for + // possible subdoc purge (this will be used only if we don't do a + // db-wide purge, e.g. if we're called from indexfiles()). + LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n", + existingDoc, hadNonNullIpath)); + if (existingDoc && hadNonNullIpath) { + m_purgeCandidates.record(parent_udi); + } } // If we had no instance with a null ipath, we create an empty // document to stand for the file itself, to be used mainly for up // to date checks. Typically this happens for an mbox file. - if (hadNullIpath == false) { - LOGDEB1(("Creating empty doc for file\n")); + // + // If xattronly is set, ONLY the extattr metadata is valid and will be used + // by the following step. + if (xattronly || hadNullIpath == false) { + LOGDEB(("Creating empty doc for file or pure xattr update\n")); Rcl::Doc fileDoc; - fileDoc.fmtime = ascdate; - fileDoc.meta[Rcl::Doc::keyfn] = utf8fn; - fileDoc.haschildren = true; - fileDoc.mimetype = interner.getMimetype(); - fileDoc.url = cstr_fileu + fn; - if (m_havelocalfields) - setlocalfields(localfields, fileDoc); - char cbuf[100]; - sprintf(cbuf, "%lld", (long long)stp->st_size); - fileDoc.pcbytes = cbuf; + if (xattronly) { + map xfields; + reapXAttrs(config, fn, xfields); + docFieldsFromXattrs(config, xfields, fileDoc); + fileDoc.onlyxattr = true; + } else { + fileDoc.fmtime = ascdate; + fileDoc.meta[Rcl::Doc::keyfn] = utf8fn; + fileDoc.haschildren = true; + fileDoc.mimetype = mimetype; + fileDoc.url = cstr_fileu + fn; + if (m_havelocalfields) + setlocalfields(localfields, fileDoc); + char cbuf[100]; + sprintf(cbuf, "%lld", (long long)stp->st_size); + fileDoc.pcbytes = cbuf; + } + fileDoc.sig = sig; #ifdef IDX_THREADS diff --git a/src/index/fsindexer.h b/src/index/fsindexer.h index 4f3a176c..f7ce0cd9 100644 --- a/src/index/fsindexer.h +++ b/src/index/fsindexer.h @@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB { string m_slocalfields; map m_localfields; + // Activate detection of xattr-only document updates. Experimental, so + // needs a config option + bool m_detectxattronly; + #ifdef IDX_THREADS friend void *FsIndexerDbUpdWorker(void*); friend void *FsIndexerInternfileWorker(void*); diff --git a/src/index/rclmonrcv.cpp b/src/index/rclmonrcv.cpp index d961b600..72305757 100644 --- a/src/index/rclmonrcv.cpp +++ b/src/index/rclmonrcv.cpp @@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code) code &= ~(IN_ISDIR|IN_ONESHOT); switch (code) { case IN_ACCESS: return "IN_ACCESS"; + case IN_MODIFY: return "IN_MODIFY"; case IN_ATTRIB: return "IN_ATTRIB"; - case IN_CLOSE: return "IN_CLOSE"; - case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE"; case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE"; + case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE"; + case IN_CLOSE: return "IN_CLOSE"; + case IN_OPEN: return "IN_OPEN"; + case IN_MOVED_FROM: return "IN_MOVED_FROM"; + case IN_MOVED_TO: return "IN_MOVED_TO"; + case IN_MOVE: return "IN_MOVE"; case IN_CREATE: return "IN_CREATE"; case IN_DELETE: return "IN_DELETE"; case IN_DELETE_SELF: return "IN_DELETE_SELF"; - case IN_IGNORED: return "IN_IGNORED"; - case IN_MODIFY: return "IN_MODIFY"; - case IN_MOVE: return "IN_MOVE"; - case IN_MOVED_FROM: return "IN_MOVED_FROM"; - case IN_MOVED_TO: return "IN_MOVED_TO"; case IN_MOVE_SELF: return "IN_MOVE_SELF"; - case IN_OPEN: return "IN_OPEN"; - case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW"; case IN_UNMOUNT: return "IN_UNMOUNT"; + case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW"; + case IN_IGNORED: return "IN_IGNORED"; default: { static char msg[50]; sprintf(msg, "Unknown event 0x%x", code); @@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool) uint32_t mask = IN_MODIFY | IN_CREATE | IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE #ifdef RCL_USE_XATTR - // It seems that IN_ATTRIB is not needed to receive extattr - // modification events, which is a bit weird because only ctime is - // set. - // | IN_ATTRIB + // IN_ATTRIB used to be not needed to receive extattr + // modification events, which was a bit weird because only ctime is + // set, and now it is... + | IN_ATTRIB #endif #ifdef IN_DONT_FOLLOW | IN_DONT_FOLLOW @@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs) eraseWatchSubTree(m_idtopath, ev.m_path); } - // IN_ATTRIB apparently not needed, see comment above - if (evp->mask & (IN_MODIFY)) { + // IN_ATTRIB used to be not needed, but now it is + if (evp->mask & (IN_MODIFY|IN_ATTRIB)) { ev.m_etyp = RclMonEvent::RCLEVT_MODIFY; } else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) { ev.m_etyp = RclMonEvent::RCLEVT_DELETE; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 123c7dd4..737a9cc7 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi) return false; } +// Clear term from document if its frequency is 0. This should +// probably be done by Xapian when the freq goes to 0 when removing a +// posting, but we have to do it ourselves +bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term) +{ + LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str())); + + // Find the term + Xapian::TermIterator xit; + XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);, + xrdb, m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n", + term.c_str(), m_rcldb->m_reason.c_str())); + return false; + } + if (xit == xdoc.termlist_end() || term.compare(*xit)) { + LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n", + term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str())); + return false; + } + + // Clear the term if its frequency is 0 + if (xit.get_wdf() == 0) { + LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str())); + XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n", + term.c_str(), m_rcldb->m_reason.c_str())); + } + } + return true; +} + +// Holder for term + pos +struct DocPosting { + DocPosting(string t, Xapian::termpos ps) + : term(t), pos(ps) {} + string term; + Xapian::termpos pos; +}; + +// Clear all terms for given field for given document. +// The terms to be cleared are all those with the appropriate +// prefix. We also remove the postings for the unprefixed terms (that +// is, we undo what we did when indexing). +bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx, + Xapian::termcount wdfdec) +{ + LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n", + pfx.c_str(), unsigned(xdoc.get_docid()))); + + vector eraselist; + + string wrapd = wrap_prefix(pfx); + + m_rcldb->m_reason.clear(); + for (int tries = 0; tries < 2; tries++) { + try { + Xapian::TermIterator xit; + xit = xdoc.termlist_begin(); + xit.skip_to(wrapd); + while (xit != xdoc.termlist_end() && + !(*xit).compare(0, wrapd.size(), wrapd)) { + LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str())); + Xapian::PositionIterator posit; + for (posit = xit.positionlist_begin(); + posit != xit.positionlist_end(); posit++) { + eraselist.push_back(DocPosting(*xit, *posit)); + eraselist.push_back(DocPosting(strip_prefix(*xit), *posit)); + } + xit++; + } + } catch (const Xapian::DatabaseModifiedError &e) { + m_rcldb->m_reason = e.get_msg(); + xrdb.reopen(); + continue; + } XCATCHERROR(m_rcldb->m_reason); + break; + } + if (!m_rcldb->m_reason.empty()) { + LOGERR(("Db::clearField: failed building erase list: %s\n", + m_rcldb->m_reason.c_str())); + return false; + } + + // Now remove the found positions, and the terms if the wdf is 0 + for (vector::const_iterator it = eraselist.begin(); + it != eraselist.end(); it++) { + LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n", + it->term.c_str(), int(it->pos))); + XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, + xwdb,m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + // Not that this normally fails for non-prefixed XXST and + // ND, don't make a fuss + LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n", + it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str())); + } + clearDocTermIfWdf0(xdoc, it->term); + } + return true; +} + // Check if doc given by udi is indexed by term bool Db::Native::hasTerm(const string& udi, int idxi, const string& term) { @@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, { #ifdef IDX_THREADS Chrono chron; - // In the case where there is a separate (single) db update - // thread, we only need to protect the update map update below - // (against interaction with threads calling needUpdate()). Else, - // all threads from above need to synchronize here - PTMutexLocker lock(m_mutex, m_havewriteq); + PTMutexLocker lock(m_mutex); #endif // Check file system full every mbyte of indexed text. It's a bit wasteful @@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, try { Xapian::docid did = xwdb.replace_document(uniterm, newdocument); -#ifdef IDX_THREADS - // Need to protect against interaction with the up-to-date checks - // which also update the existence map - PTMutexLocker lock(m_mutex, !m_havewriteq); -#endif if (did < m_rcldb->updated.size()) { m_rcldb->updated[did] = true; LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc)); @@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp) return false; } - // The splitter breaks text into words and adds postings to the Xapian // document. We use a single object to split all of the document // fields and position jumps to separate fields @@ -1151,7 +1245,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) return false; Xapian::Document newdocument; - + // The term processing pipeline: TermProcIdx tpidx; TermProc *nxt = &tpidx; @@ -1165,276 +1259,287 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) TextSplitDb splitter(newdocument, nxt); tpidx.setTSD(&splitter); - // If the ipath is like a path, index the last element. This is - // for compound documents like zip and chm for which the filter - // uses the file path as ipath. - if (!doc.ipath.empty() && - doc.ipath.find_first_not_of("0123456789") != string::npos) { - string utf8ipathlast; - // There is no way in hell we could have an idea of the - // charset here, so let's hope it's ascii or utf-8. We call - // transcode to strip the bad chars and pray - if (transcode(path_getsimple(doc.ipath), utf8ipathlast, - "UTF-8", "UTF-8")) { - splitter.text_to_words(utf8ipathlast); - } - } - - // Split and index the path from the url for path-based filtering - { - string path = url_gpath(doc.url); - vector vpath; - stringToTokens(path, vpath, "/"); - // If vpath is not /, the last elt is the file/dir name, not a - // part of the path. - if (vpath.size()) - vpath.resize(vpath.size()-1); - splitter.curpos = 0; - newdocument.add_posting(wrap_prefix(pathelt_prefix), - splitter.basepos + splitter.curpos++); - for (vector::iterator it = vpath.begin(); - it != vpath.end(); it++){ - if (it->length() > 230) { - // Just truncate it. May still be useful because of wildcards - *it = it->substr(0, 230); - } - newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, - splitter.basepos + splitter.curpos++); - } - } - - // Index textual metadata. These are all indexed as text with - // positions, as we may want to do phrase searches with them (this - // makes no sense for keywords by the way). - // - // The order has no importance, and we set a position gap of 100 - // between fields to avoid false proximity matches. - map::iterator meta_it; - for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { - if (!meta_it->second.empty()) { - const FieldTraits *ftp; - // We don't test for an empty prefix here. Some fields are part - // of the internal conf with an empty prefix (ie: abstract). - if (!fieldToTraits(meta_it->first, &ftp)) { - LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n", - meta_it->first.c_str())); - continue; - } - LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", - meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc, - meta_it->second.c_str())); - splitter.setprefix(ftp->pfx); - splitter.setwdfinc(ftp->wdfinc); - if (!splitter.text_to_words(meta_it->second)) - LOGDEB(("Db::addOrUpdate: split failed for %s\n", - meta_it->first.c_str())); - } - } - splitter.setprefix(string()); - splitter.setwdfinc(1); - - if (splitter.curpos < baseTextPosition) - splitter.basepos = baseTextPosition; - - // Split and index body text - LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str())); - -#ifdef TEXTSPLIT_STATS - splitter.resetStats(); -#endif - if (!splitter.text_to_words(doc.text)) - LOGDEB(("Db::addOrUpdate: split failed for main text\n")); - -#ifdef TEXTSPLIT_STATS - // Reject bad data. unrecognized base64 text is characterized by - // high avg word length and high variation (because there are - // word-splitters like +/ inside the data). - TextSplit::Stats::Values v = splitter.getStats(); - // v.avglen > 15 && v.sigma > 12 - if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) { - LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats " - "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n", - v.count, v.avglen, v.sigma, doc.url.c_str(), - doc.ipath.c_str(), doc.text.c_str())); - return true; - } -#endif - - ////// Special terms for other metadata. No positions for these. - // Mime type - newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype); - - // Simple file name indexed unsplit for specific "file name" - // searches. This is not the same as a filename: clause inside the - // query language. - // We also add a term for the filename extension if any. - string utf8fn; - if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { - string fn; - if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) { - // We should truncate after extracting the extension, but this is - // a pathological case anyway - if (fn.size() > 230) - utf8truncate(fn, 230); - string::size_type pos = fn.rfind('.'); - if (pos != string::npos && pos != fn.length() - 1) { - newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + - fn.substr(pos + 1)); - } - newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0); - } - } - // Udi unique term: this is used for file existence/uptodate // checks, and unique id for the replace_document() call. string uniterm = make_uniterm(udi); - newdocument.add_boolean_term(uniterm); - // Parent term. This is used to find all descendents, mostly to delete them - // when the parent goes away - if (!parent_udi.empty()) { - newdocument.add_boolean_term(make_parentterm(parent_udi)); - } - // Dates etc. - time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : - doc.dmtime.c_str()); - struct tm *tm = localtime(&mtime); - char buf[9]; - snprintf(buf, 9, "%04d%02d%02d", - tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); - // Date (YYYYMMDD) - newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); - // Month (YYYYMM) - buf[6] = '\0'; - newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf)); - // Year (YYYY) - buf[4] = '\0'; - newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); - - ////////////////////////////////////////////////////////////////// - // Document data record. omindex has the following nl separated fields: - // - url - // - sample - // - caption (title limited to 100 chars) - // - mime type - // - // The title, author, abstract and keywords fields are special, - // they always get stored in the document data - // record. Configurable other fields can be, too. - // - // We truncate stored fields abstract, title and keywords to - // reasonable lengths and suppress newlines (so that the data - // record can keep a simple syntax) - - string record; - RECORD_APPEND(record, Doc::keyurl, doc.url); - RECORD_APPEND(record, Doc::keytp, doc.mimetype); - // We left-zero-pad the times so that they are lexico-sortable - leftzeropad(doc.fmtime, 11); - RECORD_APPEND(record, Doc::keyfmt, doc.fmtime); - if (!doc.dmtime.empty()) { - leftzeropad(doc.dmtime, 11); - RECORD_APPEND(record, Doc::keydmt, doc.dmtime); - } - RECORD_APPEND(record, Doc::keyoc, doc.origcharset); - - if (doc.fbytes.empty()) - doc.fbytes = doc.pcbytes; - - if (!doc.fbytes.empty()) { - RECORD_APPEND(record, Doc::keyfs, doc.fbytes); - leftzeropad(doc.fbytes, 12); - newdocument.add_value(VALUE_SIZE, doc.fbytes); - } - if (doc.haschildren) { - newdocument.add_boolean_term(has_children_term); - } - if (!doc.pcbytes.empty()) - RECORD_APPEND(record, Doc::keypcs, doc.pcbytes); - char sizebuf[30]; - sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); - RECORD_APPEND(record, Doc::keyds, sizebuf); - - // Note that we add the signature both as a value and in the data record - if (!doc.sig.empty()) { - RECORD_APPEND(record, Doc::keysig, doc.sig); - newdocument.add_value(VALUE_SIG, doc.sig); - } - - if (!doc.ipath.empty()) - RECORD_APPEND(record, Doc::keyipt, doc.ipath); - - doc.meta[Doc::keytt] = - neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc); - if (!doc.meta[Doc::keytt].empty()) - RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]); - - trimstring(doc.meta[Doc::keykw], " \t\r\n"); - doc.meta[Doc::keykw] = - neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc); - // No need to explicitly append the keywords, this will be done by - // the "stored" loop - - // If abstract is empty, we make up one with the beginning of the - // document. This is then not indexed, but part of the doc data so - // that we can return it to a query without having to decode the - // original file. - bool syntabs = false; - // Note that the map accesses by operator[] create empty entries if they - // don't exist yet. - trimstring(doc.meta[Doc::keyabs], " \t\r\n"); - if (doc.meta[Doc::keyabs].empty()) { - syntabs = true; - if (!doc.text.empty()) - doc.meta[Doc::keyabs] = cstr_syntAbs + - neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc); + if (doc.onlyxattr) { + // Only updating an existing doc with new extended attributes + // data. Need to read the old doc and its data record + // first. This is so different from the normal processing that + // it uses a fully separate code path (with some duplication + // unfortunately) + if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument)) + return false; } else { - doc.meta[Doc::keyabs] = - neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen), - cstr_nc); - } - const set& stored = m_config->getStoredFields(); - for (set::const_iterator it = stored.begin(); - it != stored.end(); it++) { - string nm = m_config->fieldCanon(*it); - if (!doc.meta[nm].empty()) { - string value = - neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc); - RECORD_APPEND(record, nm, value); + // If the ipath is like a path, index the last element. This is + // for compound documents like zip and chm for which the filter + // uses the file path as ipath. + if (!doc.ipath.empty() && + doc.ipath.find_first_not_of("0123456789") != string::npos) { + string utf8ipathlast; + // There is no way in hell we could have an idea of the + // charset here, so let's hope it's ascii or utf-8. We call + // transcode to strip the bad chars and pray + if (transcode(path_getsimple(doc.ipath), utf8ipathlast, + "UTF-8", "UTF-8")) { + splitter.text_to_words(utf8ipathlast); + } } - } - // If empty pages (multiple break at same pos) were recorded, save - // them (this is because we have no way to record them in the - // Xapian list - if (!tpidx.m_pageincrvec.empty()) { - ostringstream multibreaks; - for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) { - if (i != 0) - multibreaks << ","; - multibreaks << tpidx.m_pageincrvec[i].first << "," << - tpidx.m_pageincrvec[i].second; + // Split and index the path from the url for path-based filtering + { + string path = url_gpath(doc.url); + vector vpath; + stringToTokens(path, vpath, "/"); + // If vpath is not /, the last elt is the file/dir name, not a + // part of the path. + if (vpath.size()) + vpath.resize(vpath.size()-1); + splitter.curpos = 0; + newdocument.add_posting(wrap_prefix(pathelt_prefix), + splitter.basepos + splitter.curpos++); + for (vector::iterator it = vpath.begin(); + it != vpath.end(); it++){ + if (it->length() > 230) { + // Just truncate it. May still be useful because of wildcards + *it = it->substr(0, 230); + } + newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, + splitter.basepos + splitter.curpos++); + } + } + + // Index textual metadata. These are all indexed as text with + // positions, as we may want to do phrase searches with them (this + // makes no sense for keywords by the way). + // + // The order has no importance, and we set a position gap of 100 + // between fields to avoid false proximity matches. + map::iterator meta_it; + for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { + if (!meta_it->second.empty()) { + const FieldTraits *ftp; + // We don't test for an empty prefix here. Some fields are part + // of the internal conf with an empty prefix (ie: abstract). + if (!fieldToTraits(meta_it->first, &ftp)) { + LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n", + meta_it->first.c_str())); + continue; + } + LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", + meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc, + meta_it->second.c_str())); + splitter.setprefix(ftp->pfx); + splitter.setwdfinc(ftp->wdfinc); + if (!splitter.text_to_words(meta_it->second)) + LOGDEB(("Db::addOrUpdate: split failed for %s\n", + meta_it->first.c_str())); + } + } + splitter.setprefix(string()); + splitter.setwdfinc(1); + + if (splitter.curpos < baseTextPosition) + splitter.basepos = baseTextPosition; + + // Split and index body text + LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str())); + +#ifdef TEXTSPLIT_STATS + splitter.resetStats(); +#endif + if (!splitter.text_to_words(doc.text)) + LOGDEB(("Db::addOrUpdate: split failed for main text\n")); + +#ifdef TEXTSPLIT_STATS + // Reject bad data. unrecognized base64 text is characterized by + // high avg word length and high variation (because there are + // word-splitters like +/ inside the data). + TextSplit::Stats::Values v = splitter.getStats(); + // v.avglen > 15 && v.sigma > 12 + if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) { + LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats " + "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n", + v.count, v.avglen, v.sigma, doc.url.c_str(), + doc.ipath.c_str(), doc.text.c_str())); + return true; + } +#endif + + ////// Special terms for other metadata. No positions for these. + // Mime type + newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype); + + // Simple file name indexed unsplit for specific "file name" + // searches. This is not the same as a filename: clause inside the + // query language. + // We also add a term for the filename extension if any. + string utf8fn; + if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { + string fn; + if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) { + // We should truncate after extracting the extension, but this is + // a pathological case anyway + if (fn.size() > 230) + utf8truncate(fn, 230); + string::size_type pos = fn.rfind('.'); + if (pos != string::npos && pos != fn.length() - 1) { + newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + + fn.substr(pos + 1)); + } + newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0); + } + } + + newdocument.add_boolean_term(uniterm); + // Parent term. This is used to find all descendents, mostly + // to delete them when the parent goes away + if (!parent_udi.empty()) { + newdocument.add_boolean_term(make_parentterm(parent_udi)); + } + // Dates etc. + time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : + doc.dmtime.c_str()); + struct tm *tm = localtime(&mtime); + char buf[9]; + snprintf(buf, 9, "%04d%02d%02d", + tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); + // Date (YYYYMMDD) + newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); + // Month (YYYYMM) + buf[6] = '\0'; + newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf)); + // Year (YYYY) + buf[4] = '\0'; + newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); + + + ////////////////////////////////////////////////////////////////// + // Document data record. omindex has the following nl separated fields: + // - url + // - sample + // - caption (title limited to 100 chars) + // - mime type + // + // The title, author, abstract and keywords fields are special, + // they always get stored in the document data + // record. Configurable other fields can be, too. + // + // We truncate stored fields abstract, title and keywords to + // reasonable lengths and suppress newlines (so that the data + // record can keep a simple syntax) + + string record; + RECORD_APPEND(record, Doc::keyurl, doc.url); + RECORD_APPEND(record, Doc::keytp, doc.mimetype); + // We left-zero-pad the times so that they are lexico-sortable + leftzeropad(doc.fmtime, 11); + RECORD_APPEND(record, Doc::keyfmt, doc.fmtime); + if (!doc.dmtime.empty()) { + leftzeropad(doc.dmtime, 11); + RECORD_APPEND(record, Doc::keydmt, doc.dmtime); + } + RECORD_APPEND(record, Doc::keyoc, doc.origcharset); + + if (doc.fbytes.empty()) + doc.fbytes = doc.pcbytes; + + if (!doc.fbytes.empty()) { + RECORD_APPEND(record, Doc::keyfs, doc.fbytes); + leftzeropad(doc.fbytes, 12); + newdocument.add_value(VALUE_SIZE, doc.fbytes); + } + if (doc.haschildren) { + newdocument.add_boolean_term(has_children_term); + } + if (!doc.pcbytes.empty()) + RECORD_APPEND(record, Doc::keypcs, doc.pcbytes); + char sizebuf[30]; + sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); + RECORD_APPEND(record, Doc::keyds, sizebuf); + + // Note that we add the signature both as a value and in the data record + if (!doc.sig.empty()) { + RECORD_APPEND(record, Doc::keysig, doc.sig); + newdocument.add_value(VALUE_SIG, doc.sig); + } + + if (!doc.ipath.empty()) + RECORD_APPEND(record, Doc::keyipt, doc.ipath); + + doc.meta[Doc::keytt] = + neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc); + if (!doc.meta[Doc::keytt].empty()) + RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]); + + trimstring(doc.meta[Doc::keykw], " \t\r\n"); + doc.meta[Doc::keykw] = + neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc); + // No need to explicitly append the keywords, this will be done by + // the "stored" loop + + // If abstract is empty, we make up one with the beginning of the + // document. This is then not indexed, but part of the doc data so + // that we can return it to a query without having to decode the + // original file. + bool syntabs = false; + // Note that the map accesses by operator[] create empty entries if they + // don't exist yet. + trimstring(doc.meta[Doc::keyabs], " \t\r\n"); + if (doc.meta[Doc::keyabs].empty()) { + syntabs = true; + if (!doc.text.empty()) + doc.meta[Doc::keyabs] = cstr_syntAbs + + neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc); + } else { + doc.meta[Doc::keyabs] = + neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen), + cstr_nc); + } + + const set& stored = m_config->getStoredFields(); + for (set::const_iterator it = stored.begin(); + it != stored.end(); it++) { + string nm = m_config->fieldCanon(*it); + if (!doc.meta[nm].empty()) { + string value = + neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc); + RECORD_APPEND(record, nm, value); + } + } + + // If empty pages (multiple break at same pos) were recorded, save + // them (this is because we have no way to record them in the + // Xapian list + if (!tpidx.m_pageincrvec.empty()) { + ostringstream multibreaks; + for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) { + if (i != 0) + multibreaks << ","; + multibreaks << tpidx.m_pageincrvec[i].first << "," << + tpidx.m_pageincrvec[i].second; + } + RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str()); } - RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str()); - } - // If the file's md5 was computed, add value and term. - // The value is optionally used for query result duplicate elimination, - // and the term to find the duplicates. - // We don't do this for empty docs. - const string *md5; - if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() && - md5->compare(cstr_md5empty)) { - string digest; - MD5HexScan(*md5, digest); - newdocument.add_value(VALUE_MD5, digest); - newdocument.add_boolean_term(wrap_prefix("XM") + *md5); + // If the file's md5 was computed, add value and term. + // The value is optionally used for query result duplicate elimination, + // and the term to find the duplicates. + // We don't do this for empty docs. + const string *md5; + if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() && + md5->compare(cstr_md5empty)) { + string digest; + MD5HexScan(*md5, digest); + newdocument.add_value(VALUE_MD5, digest); + newdocument.add_boolean_term(wrap_prefix("XM") + *md5); + } + + LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str())); + newdocument.set_data(record); } - - LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str())); - newdocument.set_data(record); - #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, @@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) doc.text.length()); } +bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, + Doc &doc, Xapian::Document& xdoc) +{ + LOGDEB0(("Db::docToXdocXattrOnly\n")); + PTMutexLocker lock(m_mutex); + + // Read existing document and its data record + if (getDoc(udi, 0, xdoc) == 0) { + LOGERR(("docToXdocXattrOnly: existing doc not found\n")); + return false; + } + string data; + XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str())); + return false; + } + + // Clear the term lists for the incoming fields and index the new values + map::iterator meta_it; + for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { + const FieldTraits *ftp; + if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) { + LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n", + meta_it->first.c_str())); + continue; + } + // Clear the previous terms for the field + clearField(xdoc, ftp->pfx, ftp->wdfinc); + LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n", + meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc, + meta_it->second.c_str())); + splitter->setprefix(ftp->pfx); + splitter->setwdfinc(ftp->wdfinc); + if (!splitter->text_to_words(meta_it->second)) + LOGDEB(("Db::xattrOnly: split failed for %s\n", + meta_it->first.c_str())); + } + xdoc.add_value(VALUE_SIG, doc.sig); + + // Parse current data record into a dict for ease of processing + ConfSimple datadic(data); + if (!datadic.ok()) { + LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n")); + return false; + } + + // For each "stored" field, check if set in doc metadata and + // update the value if it is + const set& stored = m_rcldb->m_config->getStoredFields(); + for (set::const_iterator it = stored.begin(); + it != stored.end(); it++) { + string nm = m_rcldb->m_config->fieldCanon(*it); + if (doc.getmeta(nm, 0)) { + string value = + neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc); + datadic.set(nm, value, ""); + } + } + + // Recreate the record. We want to do this with the local RECORD_APPEND + // method for consistency in format, instead of using ConfSimple print + vector names = datadic.getNames(""); + data.clear(); + for (vector::const_iterator it = names.begin(); + it != names.end(); it++) { + string value; + datadic.get(*it, value, ""); + RECORD_APPEND(data, *it, value); + } + RECORD_APPEND(data, Doc::keysig, doc.sig); + xdoc.set_data(data); + return true; +} + #ifdef IDX_THREADS void Db::waitUpdIdle() { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 1962c3b4..477f7cbe 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -237,6 +237,10 @@ class Db { */ bool needUpdate(const string &udi, const string& sig, bool *existed=0); + /** Indicate if we are doing a systematic reindex. This complements + needUpdate() return */ + bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;} + /** Add or update document identified by unique identifier. * @param config Config object to use. Can be the same as the member config * or a clone, to avoid sharing when called in multithread context. diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index 65dceee8..fda1b018 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -66,6 +66,8 @@ public: }; #endif // IDX_THREADS +class TextSplitDb; + // A class for data and methods that would have to expose // Xapian-specific stuff if they were in Rcl::Db. There could actually be // 2 different ones for indexing or query as there is not much in @@ -141,6 +143,16 @@ class Db::Native { /** Check if doc is indexed by term */ bool hasTerm(const string& udi, int idxi, const string& term); + /** Update existing Xapian document for pure extended attrs change */ + bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, + Doc &doc, Xapian::Document& xdoc); + /** Remove all terms currently indexed for field defined by idx prefix */ + bool clearField(Xapian::Document& xdoc, const string& pfx, + Xapian::termcount wdfdec); + + /** Check if term wdf is 0 and remove term if so */ + bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term); + /** Compute list of subdocuments for a given udi. We look for documents * indexed by a parent term matching the udi, the posting list for the * parentterm(udi) (As suggested by James Aylett) diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index e37f0045..2ba5b4ca 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -131,6 +131,10 @@ class Doc { // ipath descendants. bool haschildren; + // During indexing: only fields from extended attributes were set, no + // doc content. Allows for faster reindexing of existing doc + bool onlyxattr; + /////////////////////////////////////////////////////////////////// void erase() { @@ -154,10 +158,11 @@ class Doc { idxi = 0; haspages = false; haschildren = false; + onlyxattr = false; } Doc() : idxi(0), syntabs(false), pc(0), xdocid(0), - haspages(false), haschildren(false) + haspages(false), haschildren(false), onlyxattr(false) { } /** Get value for named field. If value pointer is 0, just test existence */ diff --git a/src/sampleconf/fields b/src/sampleconf/fields index 0ec4d846..b38030e5 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -13,7 +13,7 @@ ##################################################### # This section defines what prefix the terms inside named fields will be # indexed with (in addition to prefix-less indexing for general search) -# ALL prefixes MUST be all UPPERCASE. +# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS) # # The field names should be the canonic ones, not the aliases defined in # the following section. Don't change those which are predefined here, diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf index d2885db6..c6626ac7 100644 --- a/tests/config/recoll.conf +++ b/tests/config/recoll.conf @@ -5,6 +5,7 @@ daemloglevel = 6 daemlogfilename = /tmp/rclmontrace indexStripChars = 1 +detectxattronly = 1 topdirs = /home/dockes/projets/fulltext/testrecoll/ diff --git a/tests/empty/empty.txt b/tests/empty/empty.txt index dbc3778c..47eb4c03 100644 --- a/tests/empty/empty.txt +++ b/tests/empty/empty.txt @@ -1,2 +1,2 @@ 1 results -application/x-fsdirectory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes +inode/directory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes diff --git a/tests/runtests.sh b/tests/runtests.sh index a0076b16..0e0442d2 100644 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -11,6 +11,37 @@ if test ! x$reroot = x ; then rerootResults fi +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && \ + iscmdresult=$d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + result=0 + for cmd in $*;do + if iscmd $cmd + then + echo $cmd is $iscmdresult + else + echo $cmd not found + result=1 + fi + done + return $result +} + +checkcmds recollq recollindex pxattr xadump || exit 1 + makeindex() { echo "Zeroing Index" rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws diff --git a/tests/xattr/fields b/tests/xattr/fields new file mode 100644 index 00000000..91995081 --- /dev/null +++ b/tests/xattr/fields @@ -0,0 +1,4 @@ +[prefixes] +myattr = XYXATA +[stored] +myattr = diff --git a/tests/xattr/xattr.sh b/tests/xattr/xattr.sh new file mode 100755 index 00000000..6e46e613 --- /dev/null +++ b/tests/xattr/xattr.sh @@ -0,0 +1,85 @@ +#!/bin/sh + +# Test extended attributes indexing. This should work both with +# "detectxattronly" set or unset in the config, but should be run with +# the variable set, because we test its function by exploiting a bug +# (see comments further) +# +# We use the RECOLL_CONFTOP variable to add our own fields configuration + +thisdir=`dirname $0` +topdir=$thisdir/.. +. $topdir/shared.sh + +initvariables $0 + +RECOLL_CONFTOP=$thisdir +export RECOLL_CONFTOP + +xrun() +{ + echo $* + $* +} + +tstfile=${tstdata}/xattrs/tstxattrs.txt +rm -f $tstfile + +( + # Create the file with an extended attribute, index, and query it + # by content and field + echo xattruniqueinfile > $tstfile + xrun pxattr -n myattr -v xattrunique1 $tstfile + xrun recollindex -Zi $tstfile + echo "1 result expected" + xrun recollq xattruniqueinfile + echo "1 result expected" + xrun recollq myattr:xattrunique1 + + sleep 1 + + # Change the value for the field, check that the old value is gone + # and the new works + xrun pxattr -n myattr -v xattrunique2 $tstfile + xrun recollindex -i $tstfile + echo "1 result expected" + xrun recollq xattruniqueinfile + echo "0 result expected:" + xrun recollq myattr:xattrunique1 + echo "1 result expected:" + xrun recollq myattr:xattrunique2 + + # Change the contents then the xattr. With xattronly set, recoll + # should miss the contents change and index only the xattr. That's + # a bug but we use it to check that pure xattr update indexing + # works + echo xattruniqueinfile1 > $tstfile + sleep 2 + xrun pxattr -n myattr -v xattrunique3 $tstfile + xrun recollindex -i $tstfile + echo "1 result expected" + xrun recollq xattruniqueinfile + echo "0 result expected" + xrun recollq xattruniqueinfile1 + echo "0 result expected:" + xrun recollq myattr:xattrunique1 + echo "0 result expected:" + xrun recollq myattr:xattrunique2 + echo "1 result expected:" + xrun recollq myattr:xattrunique3 + + # Reset the index and check that the contents were seen all right + xrun recollindex -Zi $tstfile + echo "0 result expected" + xrun recollq xattruniqueinfile + echo "1 result expected" + xrun recollq xattruniqueinfile1 + echo "0 result expected:" + xrun recollq myattr:xattrunique2 + echo "1 result expected:" + xrun recollq myattr:xattrunique3 + +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout + +diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 +checkresult diff --git a/tests/xattr/xattr.txt b/tests/xattr/xattr.txt new file mode 100644 index 00000000..2e01ef28 --- /dev/null +++ b/tests/xattr/xattr.txt @@ -0,0 +1,57 @@ +pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +1 result expected +recollq xattruniqueinfile +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes +1 result expected +recollq myattr:xattrunique1 +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes +pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +1 result expected +recollq xattruniqueinfile +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes +0 result expected: +recollq myattr:xattrunique1 +0 results +1 result expected: +recollq myattr:xattrunique2 +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes +pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +1 result expected +recollq xattruniqueinfile +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes +0 result expected +recollq xattruniqueinfile1 +0 results +0 result expected: +recollq myattr:xattrunique1 +0 results +0 result expected: +recollq myattr:xattrunique2 +0 results +1 result expected: +recollq myattr:xattrunique3 +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes +recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt +0 result expected +recollq xattruniqueinfile +0 results +1 result expected +recollq xattruniqueinfile1 +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes +0 result expected: +recollq myattr:xattrunique2 +0 results +1 result expected: +recollq myattr:xattrunique3 +1 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes