Handle partial indexing of document restricted to metadata from extended attributes
This commit is contained in:
parent
b2eeec067b
commit
56a56500c1
@ -1 +1 @@
|
||||
1.19.5
|
||||
1.20.0
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
#include "cancelcheck.h"
|
||||
#include "rclinit.h"
|
||||
#include "execmd.h"
|
||||
|
||||
#include "extrameta.h"
|
||||
|
||||
// When using extended attributes, we have to use the ctime, because
|
||||
// this is all that gets set when the attributes are modified.
|
||||
@ -104,7 +104,7 @@ public:
|
||||
|
||||
FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
||||
: m_config(cnf), m_db(db), m_updater(updfunc),
|
||||
m_missing(new FSIFIMissingStore)
|
||||
m_missing(new FSIFIMissingStore), m_detectxattronly(false)
|
||||
#ifdef IDX_THREADS
|
||||
, m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first),
|
||||
m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
|
||||
@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
||||
{
|
||||
LOGDEB1(("FsIndexer::FsIndexer\n"));
|
||||
m_havelocalfields = m_config->hasNameAnywhere("localfields");
|
||||
m_config->getConfParam("detectxattronly", &m_detectxattronly);
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
m_stableconfig = new RclConfig(*m_config);
|
||||
@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
bool existingDoc;
|
||||
bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
|
||||
|
||||
// If ctime (which we use for the sig) differs from mtime, then at most
|
||||
// the extended attributes were changed, no need to index content.
|
||||
// This unfortunately leaves open the case where the data was
|
||||
// modified, then the extended attributes, in which case we will
|
||||
// miss the data update. We would have to store both the mtime and
|
||||
// the ctime to avoid this
|
||||
bool xattronly = m_detectxattronly && !m_db->inFullReset() &&
|
||||
existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
|
||||
|
||||
if (!needupdate) {
|
||||
LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
|
||||
if (m_updater) {
|
||||
@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
LOGDEB0(("processone: processing: [%s] %s\n",
|
||||
displayableBytes(stp->st_size).c_str(), fn.c_str()));
|
||||
|
||||
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
|
||||
if (!interner.ok()) {
|
||||
// no indexing whatsoever in this case. This typically means that
|
||||
// indexallfilenames is not set
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
interner.setMissingStore(m_missing);
|
||||
|
||||
string utf8fn = compute_utf8fn(config, fn);
|
||||
|
||||
// parent_udi is initially the same as udi, it will be used if there
|
||||
@ -662,8 +664,20 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
char ascdate[30];
|
||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
bool hadNullIpath = false;
|
||||
string mimetype;
|
||||
|
||||
if (!xattronly) {
|
||||
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
|
||||
if (!interner.ok()) {
|
||||
// no indexing whatsoever in this case. This typically means that
|
||||
// indexallfilenames is not set
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
mimetype = interner.getMimetype();
|
||||
|
||||
interner.setMissingStore(m_missing);
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
bool hadNonNullIpath = false;
|
||||
while (fis == FileInterner::FIAgain) {
|
||||
doc.erase();
|
||||
@ -722,8 +736,8 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
if (m_havelocalfields)
|
||||
setlocalfields(localfields, doc);
|
||||
|
||||
// Add document to database. If there is an ipath, add it as a children
|
||||
// of the file document.
|
||||
// Add document to database. If there is an ipath, add it
|
||||
// as a child of the file document.
|
||||
#ifdef IDX_THREADS
|
||||
if (m_haveSplitQ) {
|
||||
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
|
||||
@ -767,23 +781,35 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
if (existingDoc && hadNonNullIpath) {
|
||||
m_purgeCandidates.record(parent_udi);
|
||||
}
|
||||
}
|
||||
|
||||
// If we had no instance with a null ipath, we create an empty
|
||||
// document to stand for the file itself, to be used mainly for up
|
||||
// to date checks. Typically this happens for an mbox file.
|
||||
if (hadNullIpath == false) {
|
||||
LOGDEB1(("Creating empty doc for file\n"));
|
||||
//
|
||||
// If xattronly is set, ONLY the extattr metadata is valid and will be used
|
||||
// by the following step.
|
||||
if (xattronly || hadNullIpath == false) {
|
||||
LOGDEB(("Creating empty doc for file or pure xattr update\n"));
|
||||
Rcl::Doc fileDoc;
|
||||
if (xattronly) {
|
||||
map<string, string> xfields;
|
||||
reapXAttrs(config, fn, xfields);
|
||||
docFieldsFromXattrs(config, xfields, fileDoc);
|
||||
fileDoc.onlyxattr = true;
|
||||
} else {
|
||||
fileDoc.fmtime = ascdate;
|
||||
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||
fileDoc.haschildren = true;
|
||||
fileDoc.mimetype = interner.getMimetype();
|
||||
fileDoc.mimetype = mimetype;
|
||||
fileDoc.url = cstr_fileu + fn;
|
||||
if (m_havelocalfields)
|
||||
setlocalfields(localfields, fileDoc);
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||
fileDoc.pcbytes = cbuf;
|
||||
}
|
||||
|
||||
fileDoc.sig = sig;
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
|
||||
@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB {
|
||||
string m_slocalfields;
|
||||
map<string, string> m_localfields;
|
||||
|
||||
// Activate detection of xattr-only document updates. Experimental, so
|
||||
// needs a config option
|
||||
bool m_detectxattronly;
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
friend void *FsIndexerDbUpdWorker(void*);
|
||||
friend void *FsIndexerInternfileWorker(void*);
|
||||
|
||||
@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
|
||||
code &= ~(IN_ISDIR|IN_ONESHOT);
|
||||
switch (code) {
|
||||
case IN_ACCESS: return "IN_ACCESS";
|
||||
case IN_MODIFY: return "IN_MODIFY";
|
||||
case IN_ATTRIB: return "IN_ATTRIB";
|
||||
case IN_CLOSE: return "IN_CLOSE";
|
||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||
case IN_CLOSE: return "IN_CLOSE";
|
||||
case IN_OPEN: return "IN_OPEN";
|
||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||
case IN_MOVE: return "IN_MOVE";
|
||||
case IN_CREATE: return "IN_CREATE";
|
||||
case IN_DELETE: return "IN_DELETE";
|
||||
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
||||
case IN_IGNORED: return "IN_IGNORED";
|
||||
case IN_MODIFY: return "IN_MODIFY";
|
||||
case IN_MOVE: return "IN_MOVE";
|
||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
||||
case IN_OPEN: return "IN_OPEN";
|
||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||
case IN_UNMOUNT: return "IN_UNMOUNT";
|
||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||
case IN_IGNORED: return "IN_IGNORED";
|
||||
default: {
|
||||
static char msg[50];
|
||||
sprintf(msg, "Unknown event 0x%x", code);
|
||||
@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool)
|
||||
uint32_t mask = IN_MODIFY | IN_CREATE
|
||||
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
||||
#ifdef RCL_USE_XATTR
|
||||
// It seems that IN_ATTRIB is not needed to receive extattr
|
||||
// modification events, which is a bit weird because only ctime is
|
||||
// set.
|
||||
// | IN_ATTRIB
|
||||
// IN_ATTRIB used to be not needed to receive extattr
|
||||
// modification events, which was a bit weird because only ctime is
|
||||
// set, and now it is...
|
||||
| IN_ATTRIB
|
||||
#endif
|
||||
#ifdef IN_DONT_FOLLOW
|
||||
| IN_DONT_FOLLOW
|
||||
@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
|
||||
eraseWatchSubTree(m_idtopath, ev.m_path);
|
||||
}
|
||||
|
||||
// IN_ATTRIB apparently not needed, see comment above
|
||||
if (evp->mask & (IN_MODIFY)) {
|
||||
// IN_ATTRIB used to be not needed, but now it is
|
||||
if (evp->mask & (IN_MODIFY|IN_ATTRIB)) {
|
||||
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
||||
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
||||
ev.m_etyp = RclMonEvent::RCLEVT_DELETE;
|
||||
|
||||
@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Clear term from document if its frequency is 0. This should
|
||||
// probably be done by Xapian when the freq goes to 0 when removing a
|
||||
// posting, but we have to do it ourselves
|
||||
bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
|
||||
{
|
||||
LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
|
||||
|
||||
// Find the term
|
||||
Xapian::TermIterator xit;
|
||||
XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
|
||||
xrdb, m_rcldb->m_reason);
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n",
|
||||
term.c_str(), m_rcldb->m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (xit == xdoc.termlist_end() || term.compare(*xit)) {
|
||||
LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n",
|
||||
term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Clear the term if its frequency is 0
|
||||
if (xit.get_wdf() == 0) {
|
||||
LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
|
||||
XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n",
|
||||
term.c_str(), m_rcldb->m_reason.c_str()));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Holder for term + pos
|
||||
struct DocPosting {
|
||||
DocPosting(string t, Xapian::termpos ps)
|
||||
: term(t), pos(ps) {}
|
||||
string term;
|
||||
Xapian::termpos pos;
|
||||
};
|
||||
|
||||
// Clear all terms for given field for given document.
|
||||
// The terms to be cleared are all those with the appropriate
|
||||
// prefix. We also remove the postings for the unprefixed terms (that
|
||||
// is, we undo what we did when indexing).
|
||||
bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
|
||||
Xapian::termcount wdfdec)
|
||||
{
|
||||
LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
|
||||
pfx.c_str(), unsigned(xdoc.get_docid())));
|
||||
|
||||
vector<DocPosting> eraselist;
|
||||
|
||||
string wrapd = wrap_prefix(pfx);
|
||||
|
||||
m_rcldb->m_reason.clear();
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
Xapian::TermIterator xit;
|
||||
xit = xdoc.termlist_begin();
|
||||
xit.skip_to(wrapd);
|
||||
while (xit != xdoc.termlist_end() &&
|
||||
!(*xit).compare(0, wrapd.size(), wrapd)) {
|
||||
LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str()));
|
||||
Xapian::PositionIterator posit;
|
||||
for (posit = xit.positionlist_begin();
|
||||
posit != xit.positionlist_end(); posit++) {
|
||||
eraselist.push_back(DocPosting(*xit, *posit));
|
||||
eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
|
||||
}
|
||||
xit++;
|
||||
}
|
||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||
m_rcldb->m_reason = e.get_msg();
|
||||
xrdb.reopen();
|
||||
continue;
|
||||
} XCATCHERROR(m_rcldb->m_reason);
|
||||
break;
|
||||
}
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
LOGERR(("Db::clearField: failed building erase list: %s\n",
|
||||
m_rcldb->m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Now remove the found positions, and the terms if the wdf is 0
|
||||
for (vector<DocPosting>::const_iterator it = eraselist.begin();
|
||||
it != eraselist.end(); it++) {
|
||||
LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n",
|
||||
it->term.c_str(), int(it->pos)));
|
||||
XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);,
|
||||
xwdb,m_rcldb->m_reason);
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
// Not that this normally fails for non-prefixed XXST and
|
||||
// ND, don't make a fuss
|
||||
LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n",
|
||||
it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str()));
|
||||
}
|
||||
clearDocTermIfWdf0(xdoc, it->term);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if doc given by udi is indexed by term
|
||||
bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
|
||||
{
|
||||
@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||
{
|
||||
#ifdef IDX_THREADS
|
||||
Chrono chron;
|
||||
// In the case where there is a separate (single) db update
|
||||
// thread, we only need to protect the update map update below
|
||||
// (against interaction with threads calling needUpdate()). Else,
|
||||
// all threads from above need to synchronize here
|
||||
PTMutexLocker lock(m_mutex, m_havewriteq);
|
||||
PTMutexLocker lock(m_mutex);
|
||||
#endif
|
||||
|
||||
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
||||
@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||
try {
|
||||
Xapian::docid did =
|
||||
xwdb.replace_document(uniterm, newdocument);
|
||||
#ifdef IDX_THREADS
|
||||
// Need to protect against interaction with the up-to-date checks
|
||||
// which also update the existence map
|
||||
PTMutexLocker lock(m_mutex, !m_havewriteq);
|
||||
#endif
|
||||
if (did < m_rcldb->updated.size()) {
|
||||
m_rcldb->updated[did] = true;
|
||||
LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
|
||||
@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// The splitter breaks text into words and adds postings to the Xapian
|
||||
// document. We use a single object to split all of the document
|
||||
// fields and position jumps to separate fields
|
||||
@ -1165,6 +1259,20 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
TextSplitDb splitter(newdocument, nxt);
|
||||
tpidx.setTSD(&splitter);
|
||||
|
||||
// Udi unique term: this is used for file existence/uptodate
|
||||
// checks, and unique id for the replace_document() call.
|
||||
string uniterm = make_uniterm(udi);
|
||||
|
||||
if (doc.onlyxattr) {
|
||||
// Only updating an existing doc with new extended attributes
|
||||
// data. Need to read the old doc and its data record
|
||||
// first. This is so different from the normal processing that
|
||||
// it uses a fully separate code path (with some duplication
|
||||
// unfortunately)
|
||||
if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
|
||||
return false;
|
||||
} else {
|
||||
|
||||
// If the ipath is like a path, index the last element. This is
|
||||
// for compound documents like zip and chm for which the filter
|
||||
// uses the file path as ipath.
|
||||
@ -1285,12 +1393,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
}
|
||||
}
|
||||
|
||||
// Udi unique term: this is used for file existence/uptodate
|
||||
// checks, and unique id for the replace_document() call.
|
||||
string uniterm = make_uniterm(udi);
|
||||
newdocument.add_boolean_term(uniterm);
|
||||
// Parent term. This is used to find all descendents, mostly to delete them
|
||||
// when the parent goes away
|
||||
// Parent term. This is used to find all descendents, mostly
|
||||
// to delete them when the parent goes away
|
||||
if (!parent_udi.empty()) {
|
||||
newdocument.add_boolean_term(make_parentterm(parent_udi));
|
||||
}
|
||||
@ -1434,7 +1539,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
|
||||
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
|
||||
}
|
||||
#ifdef IDX_THREADS
|
||||
if (m_ndb->m_havewriteq) {
|
||||
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
||||
@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
doc.text.length());
|
||||
}
|
||||
|
||||
// Update an existing index document when only the extended attributes
// of the file changed: the stored document is read back, the terms for
// the fields present in doc.meta are replaced, and the signature and
// the data record are updated. The document contents are not touched.
//
// @param splitter text splitter used to index the new field values.
// @param udi unique document identifier of the existing document.
// @param doc carries the new field values (in meta) and the new
//   signature (in sig).
// @param xdoc output: receives the existing document, then gets
//   modified in place.
// @return false if the existing document can't be found, or if its
//   data record can't be read or parsed. true otherwise.
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
                                    Doc &doc, Xapian::Document& xdoc)
{
    LOGDEB0(("Db::docToXdocXattrOnly\n"));
#ifdef IDX_THREADS
    // Only lock in threaded builds, like the other user of m_mutex
    // visible in this file (addOrUpdateWrite).
    // NOTE(review): presumably m_mutex is only declared when IDX_THREADS
    // is set - confirm against the class declaration.
    PTMutexLocker lock(m_mutex);
#endif

    // Read existing document and its data record
    if (getDoc(udi, 0, xdoc) == 0) {
        LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
        return false;
    }
    string data;
    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
        return false;
    }

    // Clear the term lists for the incoming fields and index the new values
    map<string, string>::iterator meta_it;
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); ++meta_it) {
        const FieldTraits *ftp;
        // Fields without an index prefix can't be located in the term
        // list, so they are left alone
        if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
            LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
                     meta_it->first.c_str()));
            continue;
        }
        // Clear the previous terms for the field
        clearField(xdoc, ftp->pfx, ftp->wdfinc);
        LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n",
                 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
                 meta_it->second.c_str()));
        splitter->setprefix(ftp->pfx);
        splitter->setwdfinc(ftp->wdfinc);
        if (!splitter->text_to_words(meta_it->second))
            LOGDEB(("Db::xattrOnly: split failed for %s\n",
                    meta_it->first.c_str()));
    }
    // Record the new signature in the document value slot
    xdoc.add_value(VALUE_SIG, doc.sig);

    // Parse current data record into a dict for ease of processing
    ConfSimple datadic(data);
    if (!datadic.ok()) {
        LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
        return false;
    }

    // For each "stored" field, check if set in doc metadata and
    // update the value if it is
    const set<string>& stored = m_rcldb->m_config->getStoredFields();
    for (set<string>::const_iterator it = stored.begin();
         it != stored.end(); ++it) {
        string nm = m_rcldb->m_config->fieldCanon(*it);
        if (doc.getmeta(nm, 0)) {
            string value =
                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
            datadic.set(nm, value, "");
        }
    }

    // Recreate the record. We want to do this with the local RECORD_APPEND
    // method for consistency in format, instead of using ConfSimple print
    vector<string> names = datadic.getNames("");
    data.clear();
    for (vector<string>::const_iterator it = names.begin();
         it != names.end(); ++it) {
        string value;
        datadic.get(*it, value, "");
        RECORD_APPEND(data, *it, value);
    }
    // The new signature goes last in the record
    RECORD_APPEND(data, Doc::keysig, doc.sig);
    xdoc.set_data(data);
    return true;
}
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
void Db::waitUpdIdle()
|
||||
{
|
||||
|
||||
@ -237,6 +237,10 @@ class Db {
|
||||
*/
|
||||
bool needUpdate(const string &udi, const string& sig, bool *existed=0);
|
||||
|
||||
/** Indicate if we are doing a systematic reindex. This complements
|
||||
needUpdate() return */
|
||||
bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
|
||||
|
||||
/** Add or update document identified by unique identifier.
|
||||
* @param config Config object to use. Can be the same as the member config
|
||||
* or a clone, to avoid sharing when called in multithread context.
|
||||
|
||||
@ -66,6 +66,8 @@ public:
|
||||
};
|
||||
#endif // IDX_THREADS
|
||||
|
||||
class TextSplitDb;
|
||||
|
||||
// A class for data and methods that would have to expose
|
||||
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
||||
// 2 different ones for indexing or query as there is not much in
|
||||
@ -141,6 +143,16 @@ class Db::Native {
|
||||
/** Check if doc is indexed by term */
|
||||
bool hasTerm(const string& udi, int idxi, const string& term);
|
||||
|
||||
/** Update existing Xapian document for pure extended attrs change */
|
||||
bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
||||
Doc &doc, Xapian::Document& xdoc);
|
||||
/** Remove all terms currently indexed for field defined by idx prefix */
|
||||
bool clearField(Xapian::Document& xdoc, const string& pfx,
|
||||
Xapian::termcount wdfdec);
|
||||
|
||||
/** Check if term wdf is 0 and remove term if so */
|
||||
bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
|
||||
|
||||
/** Compute list of subdocuments for a given udi. We look for documents
|
||||
* indexed by a parent term matching the udi, the posting list for the
|
||||
* parentterm(udi) (As suggested by James Aylett)
|
||||
|
||||
@ -131,6 +131,10 @@ class Doc {
|
||||
// ipath descendants.
|
||||
bool haschildren;
|
||||
|
||||
// During indexing: only fields from extended attributes were set, no
|
||||
// doc content. Allows for faster reindexing of existing doc
|
||||
bool onlyxattr;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
void erase() {
|
||||
@ -154,10 +158,11 @@ class Doc {
|
||||
idxi = 0;
|
||||
haspages = false;
|
||||
haschildren = false;
|
||||
onlyxattr = false;
|
||||
}
|
||||
Doc()
|
||||
: idxi(0), syntabs(false), pc(0), xdocid(0),
|
||||
haspages(false), haschildren(false)
|
||||
haspages(false), haschildren(false), onlyxattr(false)
|
||||
{
|
||||
}
|
||||
/** Get value for named field. If value pointer is 0, just test existence */
|
||||
|
||||
@ -13,7 +13,7 @@
|
||||
#####################################################
|
||||
# This section defines what prefix the terms inside named fields will be
|
||||
# indexed with (in addition to prefix-less indexing for general search)
|
||||
# ALL prefixes MUST be all UPPERCASE.
|
||||
# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS)
|
||||
#
|
||||
# The field names should be the canonic ones, not the aliases defined in
|
||||
# the following section. Don't change those which are predefined here,
|
||||
|
||||
@ -5,6 +5,7 @@ daemloglevel = 6
|
||||
daemlogfilename = /tmp/rclmontrace
|
||||
|
||||
indexStripChars = 1
|
||||
detectxattronly = 1
|
||||
|
||||
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
||||
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
1 results
|
||||
application/x-fsdirectory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
|
||||
inode/directory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
|
||||
|
||||
@ -11,6 +11,37 @@ if test ! x$reroot = x ; then
|
||||
rerootResults
|
||||
fi
|
||||
|
||||
# Check that a command is available.
#   $1: command name, or path if it contains a '/'
# A name without a slash is looked up in the PATH directories. A value
# with a slash is tested as is.
# Returns 0 and sets iscmdresult to the executable's path if an
# executable which is not a directory is found, else returns 1.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd" ; then
            # Also set the result for explicit paths, so that callers
            # printing iscmdresult do not show a stale or empty value
            iscmdresult=$cmd
            return 0
        fi
        return 1 ;;
    *)
        # Split PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps directories containing blanks in one piece
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd" ; then
                iscmdresult=$d/$cmd
                return 0
            fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that all the commands given as arguments are available, using
# iscmd. Prints one line for each command: where it was found, or that
# it is missing.
# Returns 0 if all commands were found, 1 if at least one is missing.
checkcmds()
{
    result=0
    # "$@" and "$cmd" are quoted so that arguments are passed through
    # unchanged instead of being split again or glob-expanded
    for cmd in "$@"; do
        if iscmd "$cmd"
        then
            echo $cmd is $iscmdresult
        else
            echo $cmd not found
            result=1
        fi
    done
    return $result
}
|
||||
|
||||
checkcmds recollq recollindex pxattr xadump || exit 1
|
||||
|
||||
makeindex() {
|
||||
echo "Zeroing Index"
|
||||
rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws
|
||||
|
||||
4
tests/xattr/fields
Normal file
4
tests/xattr/fields
Normal file
@ -0,0 +1,4 @@
|
||||
[prefixes]
|
||||
myattr = XYXATA
|
||||
[stored]
|
||||
myattr =
|
||||
85
tests/xattr/xattr.sh
Executable file
85
tests/xattr/xattr.sh
Executable file
@ -0,0 +1,85 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Test extended attributes indexing. This should work both with
|
||||
# "detectxattronly" set or unset in the config, but should be run with
|
||||
# the variable set, because we test its function by exploiting a bug
|
||||
# (see the comments further down)
|
||||
#
|
||||
# We use the RECOLL_CONFTOP variable to add our own fields configuration
|
||||
|
||||
thisdir=`dirname $0`
|
||||
topdir=$thisdir/..
|
||||
. $topdir/shared.sh
|
||||
|
||||
initvariables $0
|
||||
|
||||
RECOLL_CONFTOP=$thisdir
|
||||
export RECOLL_CONFTOP
|
||||
|
||||
# Print the command line given as arguments, then execute it. The echoed
# command lines end up in the captured test output, next to the results
# of the commands. Both uses of $* are unquoted, so the arguments are
# split again on blanks by the shell.
xrun()
|
||||
{
|
||||
echo $*
|
||||
$*
|
||||
}
|
||||
|
||||
tstfile=${tstdata}/xattrs/tstxattrs.txt
|
||||
rm -f $tstfile
|
||||
|
||||
(
|
||||
# Create the file with an extended attribute, index, and query it
|
||||
# by content and field
|
||||
echo xattruniqueinfile > $tstfile
|
||||
xrun pxattr -n myattr -v xattrunique1 $tstfile
|
||||
xrun recollindex -Zi $tstfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "1 result expected"
|
||||
xrun recollq myattr:xattrunique1
|
||||
|
||||
sleep 1
|
||||
|
||||
# Change the value for the field, check that the old value is gone
|
||||
# and the new works
|
||||
xrun pxattr -n myattr -v xattrunique2 $tstfile
|
||||
xrun recollindex -i $tstfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique1
|
||||
echo "1 result expected:"
|
||||
xrun recollq myattr:xattrunique2
|
||||
|
||||
# Change the contents then the xattr. With xattronly set, recoll
|
||||
# should miss the contents change and index only the xattr. That's
|
||||
# a bug but we use it to check that pure xattr update indexing
|
||||
# works
|
||||
echo xattruniqueinfile1 > $tstfile
|
||||
sleep 2
|
||||
xrun pxattr -n myattr -v xattrunique3 $tstfile
|
||||
xrun recollindex -i $tstfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "0 result expected"
|
||||
xrun recollq xattruniqueinfile1
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique1
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique2
|
||||
echo "1 result expected:"
|
||||
xrun recollq myattr:xattrunique3
|
||||
|
||||
# Reset the index and check that the contents were seen all right
|
||||
xrun recollindex -Zi $tstfile
|
||||
echo "0 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile1
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique2
|
||||
echo "1 result expected:"
|
||||
xrun recollq myattr:xattrunique3
|
||||
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
checkresult
|
||||
57
tests/xattr/xattr.txt
Normal file
57
tests/xattr/xattr.txt
Normal file
@ -0,0 +1,57 @@
|
||||
pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
1 result expected
|
||||
recollq xattruniqueinfile
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
1 result expected
|
||||
recollq myattr:xattrunique1
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
1 result expected
|
||||
recollq xattruniqueinfile
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique1
|
||||
0 results
|
||||
1 result expected:
|
||||
recollq myattr:xattrunique2
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
1 result expected
|
||||
recollq xattruniqueinfile
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
0 result expected
|
||||
recollq xattruniqueinfile1
|
||||
0 results
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique1
|
||||
0 results
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique2
|
||||
0 results
|
||||
1 result expected:
|
||||
recollq myattr:xattrunique3
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
0 result expected
|
||||
recollq xattruniqueinfile
|
||||
0 results
|
||||
1 result expected
|
||||
recollq xattruniqueinfile1
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique2
|
||||
0 results
|
||||
1 result expected:
|
||||
recollq myattr:xattrunique3
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
|
||||
Loading…
x
Reference in New Issue
Block a user