Handle partial indexing of documents restricted to metadata from extended attributes

This commit is contained in:
Jean-Francois Dockes 2013-10-04 10:57:11 +02:00
parent b2eeec067b
commit 56a56500c1
15 changed files with 811 additions and 402 deletions

View File

@ -1 +1 @@
1.19.5
1.20.0

View File

@ -45,7 +45,7 @@
#include "cancelcheck.h"
#include "rclinit.h"
#include "execmd.h"
#include "extrameta.h"
// When using extended attributes, we have to use the ctime, because
// this is all that gets set when the attributes are modified.
@ -104,7 +104,7 @@ public:
FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
: m_config(cnf), m_db(db), m_updater(updfunc),
m_missing(new FSIFIMissingStore)
m_missing(new FSIFIMissingStore), m_detectxattronly(false)
#ifdef IDX_THREADS
, m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first),
m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
{
LOGDEB1(("FsIndexer::FsIndexer\n"));
m_havelocalfields = m_config->hasNameAnywhere("localfields");
m_config->getConfParam("detectxattronly", &m_detectxattronly);
#ifdef IDX_THREADS
m_stableconfig = new RclConfig(*m_config);
@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config,
bool existingDoc;
bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
// If ctime (which we use for the sig) differs from mtime, then at most
// the extended attributes were changed, no need to index content.
// This unfortunately leaves open the case where the data was
// modified, then the extended attributes, in which case we will
// miss the data update. We would have to store both the mtime and
// the ctime to avoid this
bool xattronly = m_detectxattronly && !m_db->inFullReset() &&
existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
if (!needupdate) {
LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
if (m_updater) {
@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config,
LOGDEB0(("processone: processing: [%s] %s\n",
displayableBytes(stp->st_size).c_str(), fn.c_str()));
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
if (!interner.ok()) {
// no indexing whatsoever in this case. This typically means that
// indexallfilenames is not set
return FsTreeWalker::FtwOk;
}
interner.setMissingStore(m_missing);
string utf8fn = compute_utf8fn(config, fn);
// parent_udi is initially the same as udi, it will be used if there
@ -662,128 +664,152 @@ FsIndexer::processonefile(RclConfig *config,
char ascdate[30];
sprintf(ascdate, "%ld", long(stp->st_mtime));
FileInterner::Status fis = FileInterner::FIAgain;
bool hadNullIpath = false;
bool hadNonNullIpath = false;
while (fis == FileInterner::FIAgain) {
doc.erase();
try {
fis = interner.internfile(doc);
} catch (CancelExcept) {
LOGERR(("fsIndexer::processone: interrupted\n"));
return FsTreeWalker::FtwStop;
}
string mimetype;
// We index at least the file name even if there was an error.
// We'll change the signature to ensure that the indexing will
// be retried every time.
if (!xattronly) {
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
if (!interner.ok()) {
// no indexing whatsoever in this case. This typically means that
// indexallfilenames is not set
return FsTreeWalker::FtwOk;
}
mimetype = interner.getMimetype();
// Internal access path for multi-document files. If empty, this is
// for the main file.
if (doc.ipath.empty()) {
hadNullIpath = true;
if (hadNonNullIpath) {
// Note that only the filters can reliably compute
// this. What we do is dependent on the doc order (if
// we see the top doc first, we won't set the flag)
doc.haschildren = true;
interner.setMissingStore(m_missing);
FileInterner::Status fis = FileInterner::FIAgain;
bool hadNonNullIpath = false;
while (fis == FileInterner::FIAgain) {
doc.erase();
try {
fis = interner.internfile(doc);
} catch (CancelExcept) {
LOGERR(("fsIndexer::processone: interrupted\n"));
return FsTreeWalker::FtwStop;
}
} else {
hadNonNullIpath = true;
make_udi(fn, doc.ipath, udi);
}
// Set file name, mod time and url if not done by filter
if (doc.fmtime.empty())
doc.fmtime = ascdate;
if (doc.url.empty())
doc.url = cstr_fileu + fn;
const string *fnp = 0;
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
doc.meta[Rcl::Doc::keyfn] = utf8fn;
// We index at least the file name even if there was an error.
// We'll change the signature to ensure that the indexing will
// be retried every time.
char cbuf[100];
sprintf(cbuf, "%lld", (long long)stp->st_size);
doc.pcbytes = cbuf;
// Document signature for up to date checks. All subdocs inherit the
// file's.
doc.sig = sig;
// If there was an error, ensure indexing will be
// retried. This is for the once missing, later installed
// filter case. It can make indexing much slower (if there are
// myriads of such files, the ext script is executed for them
// and fails every time)
if (fis == FileInterner::FIError) {
doc.sig += cstr_plus;
}
// Possibly add fields from local config
if (m_havelocalfields)
setlocalfields(localfields, doc);
// Add document to database. If there is an ipath, add it as a children
// of the file document.
#ifdef IDX_THREADS
if (m_haveSplitQ) {
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
cstr_null : parent_udi, doc);
if (!m_dwqueue.put(tp)) {
LOGERR(("processonefile: wqueue.put failed\n"));
return FsTreeWalker::FtwError;
}
} else {
#endif
if (!m_db->addOrUpdate(udi, doc.ipath.empty() ?
cstr_null : parent_udi, doc)) {
return FsTreeWalker::FtwError;
// Internal access path for multi-document files. If empty, this is
// for the main file.
if (doc.ipath.empty()) {
hadNullIpath = true;
if (hadNonNullIpath) {
// Note that only the filters can reliably compute
// this. What we do is dependent on the doc order (if
// we see the top doc first, we won't set the flag)
doc.haschildren = true;
}
} else {
hadNonNullIpath = true;
make_udi(fn, doc.ipath, udi);
}
// Set file name, mod time and url if not done by filter
if (doc.fmtime.empty())
doc.fmtime = ascdate;
if (doc.url.empty())
doc.url = cstr_fileu + fn;
const string *fnp = 0;
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
doc.meta[Rcl::Doc::keyfn] = utf8fn;
char cbuf[100];
sprintf(cbuf, "%lld", (long long)stp->st_size);
doc.pcbytes = cbuf;
// Document signature for up to date checks. All subdocs inherit the
// file's.
doc.sig = sig;
// If there was an error, ensure indexing will be
// retried. This is for the once missing, later installed
// filter case. It can make indexing much slower (if there are
// myriads of such files, the ext script is executed for them
// and fails every time)
if (fis == FileInterner::FIError) {
doc.sig += cstr_plus;
}
// Possibly add fields from local config
if (m_havelocalfields)
setlocalfields(localfields, doc);
// Add document to database. If there is an ipath, add it
// as a child of the file document.
#ifdef IDX_THREADS
}
if (m_haveSplitQ) {
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
cstr_null : parent_udi, doc);
if (!m_dwqueue.put(tp)) {
LOGERR(("processonefile: wqueue.put failed\n"));
return FsTreeWalker::FtwError;
}
} else {
#endif
if (!m_db->addOrUpdate(udi, doc.ipath.empty() ?
cstr_null : parent_udi, doc)) {
return FsTreeWalker::FtwError;
}
#ifdef IDX_THREADS
}
#endif
// Tell what we are doing and check for interrupt request
if (m_updater) {
// Tell what we are doing and check for interrupt request
if (m_updater) {
#ifdef IDX_THREADS
PTMutexLocker locker(m_updater->m_mutex);
PTMutexLocker locker(m_updater->m_mutex);
#endif
++(m_updater->status.docsdone);
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
m_updater->status.dbtotdocs = m_updater->status.docsdone;
m_updater->status.fn = fn;
if (!doc.ipath.empty())
m_updater->status.fn += "|" + doc.ipath;
if (!m_updater->update()) {
return FsTreeWalker::FtwStop;
}
++(m_updater->status.docsdone);
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
m_updater->status.dbtotdocs = m_updater->status.docsdone;
m_updater->status.fn = fn;
if (!doc.ipath.empty())
m_updater->status.fn += "|" + doc.ipath;
if (!m_updater->update()) {
return FsTreeWalker::FtwStop;
}
}
}
}
// If this doc existed and it's a container, recording for
// possible subdoc purge (this will be used only if we don't do a
// db-wide purge, e.g. if we're called from indexfiles()).
LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
existingDoc, hadNonNullIpath));
if (existingDoc && hadNonNullIpath) {
m_purgeCandidates.record(parent_udi);
// If this doc existed and it's a container, recording for
// possible subdoc purge (this will be used only if we don't do a
// db-wide purge, e.g. if we're called from indexfiles()).
LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
existingDoc, hadNonNullIpath));
if (existingDoc && hadNonNullIpath) {
m_purgeCandidates.record(parent_udi);
}
}
// If we had no instance with a null ipath, we create an empty
// document to stand for the file itself, to be used mainly for up
// to date checks. Typically this happens for an mbox file.
if (hadNullIpath == false) {
LOGDEB1(("Creating empty doc for file\n"));
//
// If xattronly is set, ONLY the extattr metadata is valid and will be used
// by the following step.
if (xattronly || hadNullIpath == false) {
LOGDEB(("Creating empty doc for file or pure xattr update\n"));
Rcl::Doc fileDoc;
fileDoc.fmtime = ascdate;
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
fileDoc.haschildren = true;
fileDoc.mimetype = interner.getMimetype();
fileDoc.url = cstr_fileu + fn;
if (m_havelocalfields)
setlocalfields(localfields, fileDoc);
char cbuf[100];
sprintf(cbuf, "%lld", (long long)stp->st_size);
fileDoc.pcbytes = cbuf;
if (xattronly) {
map<string, string> xfields;
reapXAttrs(config, fn, xfields);
docFieldsFromXattrs(config, xfields, fileDoc);
fileDoc.onlyxattr = true;
} else {
fileDoc.fmtime = ascdate;
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
fileDoc.haschildren = true;
fileDoc.mimetype = mimetype;
fileDoc.url = cstr_fileu + fn;
if (m_havelocalfields)
setlocalfields(localfields, fileDoc);
char cbuf[100];
sprintf(cbuf, "%lld", (long long)stp->st_size);
fileDoc.pcbytes = cbuf;
}
fileDoc.sig = sig;
#ifdef IDX_THREADS

View File

@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB {
string m_slocalfields;
map<string, string> m_localfields;
// Activate detection of xattr-only document updates. Experimental, so
// needs a config option
bool m_detectxattronly;
#ifdef IDX_THREADS
friend void *FsIndexerDbUpdWorker(void*);
friend void *FsIndexerInternfileWorker(void*);

View File

@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
code &= ~(IN_ISDIR|IN_ONESHOT);
switch (code) {
case IN_ACCESS: return "IN_ACCESS";
case IN_MODIFY: return "IN_MODIFY";
case IN_ATTRIB: return "IN_ATTRIB";
case IN_CLOSE: return "IN_CLOSE";
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_CLOSE: return "IN_CLOSE";
case IN_OPEN: return "IN_OPEN";
case IN_MOVED_FROM: return "IN_MOVED_FROM";
case IN_MOVED_TO: return "IN_MOVED_TO";
case IN_MOVE: return "IN_MOVE";
case IN_CREATE: return "IN_CREATE";
case IN_DELETE: return "IN_DELETE";
case IN_DELETE_SELF: return "IN_DELETE_SELF";
case IN_IGNORED: return "IN_IGNORED";
case IN_MODIFY: return "IN_MODIFY";
case IN_MOVE: return "IN_MOVE";
case IN_MOVED_FROM: return "IN_MOVED_FROM";
case IN_MOVED_TO: return "IN_MOVED_TO";
case IN_MOVE_SELF: return "IN_MOVE_SELF";
case IN_OPEN: return "IN_OPEN";
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_UNMOUNT: return "IN_UNMOUNT";
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_IGNORED: return "IN_IGNORED";
default: {
static char msg[50];
sprintf(msg, "Unknown event 0x%x", code);
@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool)
uint32_t mask = IN_MODIFY | IN_CREATE
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
#ifdef RCL_USE_XATTR
// It seems that IN_ATTRIB is not needed to receive extattr
// modification events, which is a bit weird because only ctime is
// set.
// | IN_ATTRIB
// IN_ATTRIB used not to be needed to receive extattr
// modification events (which was a bit weird because only ctime
// is set), but now it is needed.
| IN_ATTRIB
#endif
#ifdef IN_DONT_FOLLOW
| IN_DONT_FOLLOW
@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
eraseWatchSubTree(m_idtopath, ev.m_path);
}
// IN_ATTRIB apparently not needed, see comment above
if (evp->mask & (IN_MODIFY)) {
// IN_ATTRIB used to be not needed, but now it is
if (evp->mask & (IN_MODIFY|IN_ATTRIB)) {
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
ev.m_etyp = RclMonEvent::RCLEVT_DELETE;

View File

@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi)
return false;
}
// Clear a term from the document if its within-document frequency
// (wdf) is 0. This should probably be done by Xapian when the freq
// goes to 0 when removing a posting, but we have to do it ourselves.
// @param xdoc the Xapian document, modified in place.
// @param term the exact term to look up (prefixed or not).
// @return true if the term was found in the document's term list
//   (whether or not it needed removal), false on lookup error or if
//   the term is absent.
bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
{
    LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));

    // Find the term in the document's term list.
    Xapian::TermIterator xit;
    XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
           xrdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n",
                term.c_str(), m_rcldb->m_reason.c_str()));
        return false;
    }
    // skip_to() lands on the first term >= the target: check we got
    // an exact match, not a later term.
    if (xit == xdoc.termlist_end() || term.compare(*xit)) {
        LOGDEB0(("Db::clearDocTermIfWdf0: term [%s] not found. xit: [%s]\n",
                 term.c_str(), xit == xdoc.termlist_end() ? "EOL" : (*xit).c_str()));
        return false;
    }

    // Remove the term if its frequency dropped to 0. A remove_term()
    // failure is only logged: the term did exist, so we still report
    // success to the caller.
    if (xit.get_wdf() == 0) {
        LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
        XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
        if (!m_rcldb->m_reason.empty()) {
            LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n",
                     term.c_str(), m_rcldb->m_reason.c_str()));
        }
    }
    return true;
}
// Holder for term + pos
struct DocPosting {
DocPosting(string t, Xapian::termpos ps)
: term(t), pos(ps) {}
string term;
Xapian::termpos pos;
};
// Clear all terms for a given field for a given document.
// The terms to be cleared are all those with the appropriate
// prefix. We also remove the postings for the unprefixed terms (that
// is, we undo what we did when indexing).
// @param xdoc   the Xapian document, modified in place.
// @param pfx    field prefix selecting the terms to clear.
// @param wdfdec wdf decrement passed to remove_posting() for each
//               removed posting.
bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
                            Xapian::termcount wdfdec)
{
    LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
             pfx.c_str(), unsigned(xdoc.get_docid())));

    vector<DocPosting> eraselist;

    string wrapd = wrap_prefix(pfx);

    m_rcldb->m_reason.clear();
    // First pass: build the list of postings to erase. Retried once
    // after a reopen() if the database changed underneath us.
    for (int tries = 0; tries < 2; tries++) {
        try {
            Xapian::TermIterator xit;
            xit = xdoc.termlist_begin();
            xit.skip_to(wrapd);
            // Walk every term carrying the wrapped prefix, recording
            // each of its positions both for the prefixed term and
            // for the corresponding stripped (unprefixed) term.
            while (xit != xdoc.termlist_end() &&
                   !(*xit).compare(0, wrapd.size(), wrapd)) {
                LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str()));
                Xapian::PositionIterator posit;
                for (posit = xit.positionlist_begin();
                     posit != xit.positionlist_end(); posit++) {
                    eraselist.push_back(DocPosting(*xit, *posit));
                    eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
                }
                xit++;
            }
        } catch (const Xapian::DatabaseModifiedError &e) {
            m_rcldb->m_reason = e.get_msg();
            xrdb.reopen();
            continue;
        } XCATCHERROR(m_rcldb->m_reason);
        break;
    }
    if (!m_rcldb->m_reason.empty()) {
        LOGERR(("Db::clearField: failed building erase list: %s\n",
                m_rcldb->m_reason.c_str()));
        return false;
    }

    // Now remove the found positions, and the terms if the wdf is 0
    for (vector<DocPosting>::const_iterator it = eraselist.begin();
         it != eraselist.end(); it++) {
        LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n",
                 it->term.c_str(), int(it->pos)));
        XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);,
               xwdb, m_rcldb->m_reason);
        if (!m_rcldb->m_reason.empty()) {
            // Note that this normally fails for non-prefixed XXST and
            // ND, don't make a fuss
            LOGDEB1(("Db::clearField: remove_posting failed for [%s],%d: %s\n",
                     it->term.c_str(), int(it->pos), m_rcldb->m_reason.c_str()));
        }
        clearDocTermIfWdf0(xdoc, it->term);
    }
    return true;
}
// Check if doc given by udi is indexed by term
bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
{
@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
{
#ifdef IDX_THREADS
Chrono chron;
// In the case where there is a separate (single) db update
// thread, we only need to protect the update map update below
// (against interaction with threads calling needUpdate()). Else,
// all threads from above need to synchronize here
PTMutexLocker lock(m_mutex, m_havewriteq);
PTMutexLocker lock(m_mutex);
#endif
// Check file system full every mbyte of indexed text. It's a bit wasteful
@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
try {
Xapian::docid did =
xwdb.replace_document(uniterm, newdocument);
#ifdef IDX_THREADS
// Need to protect against interaction with the up-to-date checks
// which also update the existence map
PTMutexLocker lock(m_mutex, !m_havewriteq);
#endif
if (did < m_rcldb->updated.size()) {
m_rcldb->updated[did] = true;
LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
return false;
}
// The splitter breaks text into words and adds postings to the Xapian
// document. We use a single object to split all of the document
// fields and position jumps to separate fields
@ -1151,7 +1245,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
return false;
Xapian::Document newdocument;
// The term processing pipeline:
TermProcIdx tpidx;
TermProc *nxt = &tpidx;
@ -1165,276 +1259,287 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
TextSplitDb splitter(newdocument, nxt);
tpidx.setTSD(&splitter);
// If the ipath is like a path, index the last element. This is
// for compound documents like zip and chm for which the filter
// uses the file path as ipath.
if (!doc.ipath.empty() &&
doc.ipath.find_first_not_of("0123456789") != string::npos) {
string utf8ipathlast;
// There is no way in hell we could have an idea of the
// charset here, so let's hope it's ascii or utf-8. We call
// transcode to strip the bad chars and pray
if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
"UTF-8", "UTF-8")) {
splitter.text_to_words(utf8ipathlast);
}
}
// Split and index the path from the url for path-based filtering
{
string path = url_gpath(doc.url);
vector<string> vpath;
stringToTokens(path, vpath, "/");
// If vpath is not /, the last elt is the file/dir name, not a
// part of the path.
if (vpath.size())
vpath.resize(vpath.size()-1);
splitter.curpos = 0;
newdocument.add_posting(wrap_prefix(pathelt_prefix),
splitter.basepos + splitter.curpos++);
for (vector<string>::iterator it = vpath.begin();
it != vpath.end(); it++){
if (it->length() > 230) {
// Just truncate it. May still be useful because of wildcards
*it = it->substr(0, 230);
}
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
splitter.basepos + splitter.curpos++);
}
}
// Index textual metadata. These are all indexed as text with
// positions, as we may want to do phrase searches with them (this
// makes no sense for keywords by the way).
//
// The order has no importance, and we set a position gap of 100
// between fields to avoid false proximity matches.
map<string, string>::iterator meta_it;
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
if (!meta_it->second.empty()) {
const FieldTraits *ftp;
// We don't test for an empty prefix here. Some fields are part
// of the internal conf with an empty prefix (ie: abstract).
if (!fieldToTraits(meta_it->first, &ftp)) {
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
meta_it->first.c_str()));
continue;
}
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
meta_it->second.c_str()));
splitter.setprefix(ftp->pfx);
splitter.setwdfinc(ftp->wdfinc);
if (!splitter.text_to_words(meta_it->second))
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
meta_it->first.c_str()));
}
}
splitter.setprefix(string());
splitter.setwdfinc(1);
if (splitter.curpos < baseTextPosition)
splitter.basepos = baseTextPosition;
// Split and index body text
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
#ifdef TEXTSPLIT_STATS
splitter.resetStats();
#endif
if (!splitter.text_to_words(doc.text))
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
#ifdef TEXTSPLIT_STATS
// Reject bad data. unrecognized base64 text is characterized by
// high avg word length and high variation (because there are
// word-splitters like +/ inside the data).
TextSplit::Stats::Values v = splitter.getStats();
// v.avglen > 15 && v.sigma > 12
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
v.count, v.avglen, v.sigma, doc.url.c_str(),
doc.ipath.c_str(), doc.text.c_str()));
return true;
}
#endif
////// Special terms for other metadata. No positions for these.
// Mime type
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
// Simple file name indexed unsplit for specific "file name"
// searches. This is not the same as a filename: clause inside the
// query language.
// We also add a term for the filename extension if any.
string utf8fn;
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
string fn;
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
// We should truncate after extracting the extension, but this is
// a pathological case anyway
if (fn.size() > 230)
utf8truncate(fn, 230);
string::size_type pos = fn.rfind('.');
if (pos != string::npos && pos != fn.length() - 1) {
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
fn.substr(pos + 1));
}
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
}
}
// Udi unique term: this is used for file existence/uptodate
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(udi);
newdocument.add_boolean_term(uniterm);
// Parent term. This is used to find all descendents, mostly to delete them
// when the parent goes away
if (!parent_udi.empty()) {
newdocument.add_boolean_term(make_parentterm(parent_udi));
}
// Dates etc.
time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
doc.dmtime.c_str());
struct tm *tm = localtime(&mtime);
char buf[9];
snprintf(buf, 9, "%04d%02d%02d",
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
// Date (YYYYMMDD)
newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
// Month (YYYYMM)
buf[6] = '\0';
newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
// Year (YYYY)
buf[4] = '\0';
newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
//////////////////////////////////////////////////////////////////
// Document data record. omindex has the following nl separated fields:
// - url
// - sample
// - caption (title limited to 100 chars)
// - mime type
//
// The title, author, abstract and keywords fields are special,
// they always get stored in the document data
// record. Configurable other fields can be, too.
//
// We truncate stored fields abstract, title and keywords to
// reasonable lengths and suppress newlines (so that the data
// record can keep a simple syntax)
string record;
RECORD_APPEND(record, Doc::keyurl, doc.url);
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
// We left-zero-pad the times so that they are lexico-sortable
leftzeropad(doc.fmtime, 11);
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
if (!doc.dmtime.empty()) {
leftzeropad(doc.dmtime, 11);
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
}
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
if (doc.fbytes.empty())
doc.fbytes = doc.pcbytes;
if (!doc.fbytes.empty()) {
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
leftzeropad(doc.fbytes, 12);
newdocument.add_value(VALUE_SIZE, doc.fbytes);
}
if (doc.haschildren) {
newdocument.add_boolean_term(has_children_term);
}
if (!doc.pcbytes.empty())
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
RECORD_APPEND(record, Doc::keyds, sizebuf);
// Note that we add the signature both as a value and in the data record
if (!doc.sig.empty()) {
RECORD_APPEND(record, Doc::keysig, doc.sig);
newdocument.add_value(VALUE_SIG, doc.sig);
}
if (!doc.ipath.empty())
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
doc.meta[Doc::keytt] =
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
if (!doc.meta[Doc::keytt].empty())
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
trimstring(doc.meta[Doc::keykw], " \t\r\n");
doc.meta[Doc::keykw] =
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
// No need to explicitly append the keywords, this will be done by
// the "stored" loop
// If abstract is empty, we make up one with the beginning of the
// document. This is then not indexed, but part of the doc data so
// that we can return it to a query without having to decode the
// original file.
bool syntabs = false;
// Note that the map accesses by operator[] create empty entries if they
// don't exist yet.
trimstring(doc.meta[Doc::keyabs], " \t\r\n");
if (doc.meta[Doc::keyabs].empty()) {
syntabs = true;
if (!doc.text.empty())
doc.meta[Doc::keyabs] = cstr_syntAbs +
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
if (doc.onlyxattr) {
// Only updating an existing doc with new extended attributes
// data. Need to read the old doc and its data record
// first. This is so different from the normal processing that
// it uses a fully separate code path (with some duplication
// unfortunately)
if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
return false;
} else {
doc.meta[Doc::keyabs] =
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
cstr_nc);
}
const set<string>& stored = m_config->getStoredFields();
for (set<string>::const_iterator it = stored.begin();
it != stored.end(); it++) {
string nm = m_config->fieldCanon(*it);
if (!doc.meta[nm].empty()) {
string value =
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
RECORD_APPEND(record, nm, value);
// If the ipath is like a path, index the last element. This is
// for compound documents like zip and chm for which the filter
// uses the file path as ipath.
if (!doc.ipath.empty() &&
doc.ipath.find_first_not_of("0123456789") != string::npos) {
string utf8ipathlast;
// There is no way in hell we could have an idea of the
// charset here, so let's hope it's ascii or utf-8. We call
// transcode to strip the bad chars and pray
if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
"UTF-8", "UTF-8")) {
splitter.text_to_words(utf8ipathlast);
}
}
}
// If empty pages (multiple break at same pos) were recorded, save
// them (this is because we have no way to record them in the
// Xapian list
if (!tpidx.m_pageincrvec.empty()) {
ostringstream multibreaks;
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
if (i != 0)
multibreaks << ",";
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
tpidx.m_pageincrvec[i].second;
// Split and index the path from the url for path-based filtering
{
string path = url_gpath(doc.url);
vector<string> vpath;
stringToTokens(path, vpath, "/");
// If vpath is not /, the last elt is the file/dir name, not a
// part of the path.
if (vpath.size())
vpath.resize(vpath.size()-1);
splitter.curpos = 0;
newdocument.add_posting(wrap_prefix(pathelt_prefix),
splitter.basepos + splitter.curpos++);
for (vector<string>::iterator it = vpath.begin();
it != vpath.end(); it++){
if (it->length() > 230) {
// Just truncate it. May still be useful because of wildcards
*it = it->substr(0, 230);
}
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
splitter.basepos + splitter.curpos++);
}
}
// Index textual metadata. These are all indexed as text with
// positions, as we may want to do phrase searches with them (this
// makes no sense for keywords by the way).
//
// The order has no importance, and we set a position gap of 100
// between fields to avoid false proximity matches.
map<string, string>::iterator meta_it;
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
if (!meta_it->second.empty()) {
const FieldTraits *ftp;
// We don't test for an empty prefix here. Some fields are part
// of the internal conf with an empty prefix (ie: abstract).
if (!fieldToTraits(meta_it->first, &ftp)) {
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
meta_it->first.c_str()));
continue;
}
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
meta_it->second.c_str()));
splitter.setprefix(ftp->pfx);
splitter.setwdfinc(ftp->wdfinc);
if (!splitter.text_to_words(meta_it->second))
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
meta_it->first.c_str()));
}
}
splitter.setprefix(string());
splitter.setwdfinc(1);
if (splitter.curpos < baseTextPosition)
splitter.basepos = baseTextPosition;
// Split and index body text
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
#ifdef TEXTSPLIT_STATS
splitter.resetStats();
#endif
if (!splitter.text_to_words(doc.text))
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
#ifdef TEXTSPLIT_STATS
// Reject bad data. unrecognized base64 text is characterized by
// high avg word length and high variation (because there are
// word-splitters like +/ inside the data).
TextSplit::Stats::Values v = splitter.getStats();
// v.avglen > 15 && v.sigma > 12
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
v.count, v.avglen, v.sigma, doc.url.c_str(),
doc.ipath.c_str(), doc.text.c_str()));
return true;
}
#endif
////// Special terms for other metadata. No positions for these.
// Mime type
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
// Simple file name indexed unsplit for specific "file name"
// searches. This is not the same as a filename: clause inside the
// query language.
// We also add a term for the filename extension if any.
string utf8fn;
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
string fn;
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
// We should truncate after extracting the extension, but this is
// a pathological case anyway
if (fn.size() > 230)
utf8truncate(fn, 230);
string::size_type pos = fn.rfind('.');
if (pos != string::npos && pos != fn.length() - 1) {
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
fn.substr(pos + 1));
}
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
}
}
newdocument.add_boolean_term(uniterm);
// Parent term. This is used to find all descendents, mostly
// to delete them when the parent goes away
if (!parent_udi.empty()) {
newdocument.add_boolean_term(make_parentterm(parent_udi));
}
// Dates etc.
time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
doc.dmtime.c_str());
struct tm *tm = localtime(&mtime);
char buf[9];
snprintf(buf, 9, "%04d%02d%02d",
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
// Date (YYYYMMDD)
newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
// Month (YYYYMM)
buf[6] = '\0';
newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
// Year (YYYY)
buf[4] = '\0';
newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
//////////////////////////////////////////////////////////////////
// Document data record. omindex has the following nl separated fields:
// - url
// - sample
// - caption (title limited to 100 chars)
// - mime type
//
// The title, author, abstract and keywords fields are special,
// they always get stored in the document data
// record. Configurable other fields can be, too.
//
// We truncate stored fields abstract, title and keywords to
// reasonable lengths and suppress newlines (so that the data
// record can keep a simple syntax)
string record;
RECORD_APPEND(record, Doc::keyurl, doc.url);
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
// We left-zero-pad the times so that they are lexico-sortable
leftzeropad(doc.fmtime, 11);
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
if (!doc.dmtime.empty()) {
leftzeropad(doc.dmtime, 11);
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
}
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
if (doc.fbytes.empty())
doc.fbytes = doc.pcbytes;
if (!doc.fbytes.empty()) {
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
leftzeropad(doc.fbytes, 12);
newdocument.add_value(VALUE_SIZE, doc.fbytes);
}
if (doc.haschildren) {
newdocument.add_boolean_term(has_children_term);
}
if (!doc.pcbytes.empty())
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
RECORD_APPEND(record, Doc::keyds, sizebuf);
// Note that we add the signature both as a value and in the data record
if (!doc.sig.empty()) {
RECORD_APPEND(record, Doc::keysig, doc.sig);
newdocument.add_value(VALUE_SIG, doc.sig);
}
if (!doc.ipath.empty())
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
doc.meta[Doc::keytt] =
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
if (!doc.meta[Doc::keytt].empty())
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
trimstring(doc.meta[Doc::keykw], " \t\r\n");
doc.meta[Doc::keykw] =
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
// No need to explicitly append the keywords, this will be done by
// the "stored" loop
// If abstract is empty, we make up one with the beginning of the
// document. This is then not indexed, but part of the doc data so
// that we can return it to a query without having to decode the
// original file.
bool syntabs = false;
// Note that the map accesses by operator[] create empty entries if they
// don't exist yet.
trimstring(doc.meta[Doc::keyabs], " \t\r\n");
if (doc.meta[Doc::keyabs].empty()) {
syntabs = true;
if (!doc.text.empty())
doc.meta[Doc::keyabs] = cstr_syntAbs +
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
} else {
doc.meta[Doc::keyabs] =
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
cstr_nc);
}
const set<string>& stored = m_config->getStoredFields();
for (set<string>::const_iterator it = stored.begin();
it != stored.end(); it++) {
string nm = m_config->fieldCanon(*it);
if (!doc.meta[nm].empty()) {
string value =
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
RECORD_APPEND(record, nm, value);
}
}
// If empty pages (multiple breaks at the same pos) were recorded, save
// them (this is because we have no way to record them in the
// Xapian list)
if (!tpidx.m_pageincrvec.empty()) {
ostringstream multibreaks;
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
if (i != 0)
multibreaks << ",";
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
tpidx.m_pageincrvec[i].second;
}
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
}
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
}
// If the file's md5 was computed, add value and term.
// The value is optionally used for query result duplicate elimination,
// and the term to find the duplicates.
// We don't do this for empty docs.
const string *md5;
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
md5->compare(cstr_md5empty)) {
string digest;
MD5HexScan(*md5, digest);
newdocument.add_value(VALUE_MD5, digest);
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
// If the file's md5 was computed, add value and term.
// The value is optionally used for query result duplicate elimination,
// and the term to find the duplicates.
// We don't do this for empty docs.
const string *md5;
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
md5->compare(cstr_md5empty)) {
string digest;
MD5HexScan(*md5, digest);
newdocument.add_value(VALUE_MD5, digest);
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
}
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
newdocument.set_data(record);
}
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
newdocument.set_data(record);
#ifdef IDX_THREADS
if (m_ndb->m_havewriteq) {
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
doc.text.length());
}
/**
 * Update an existing Xapian document when only metadata derived from
 * extended attributes changed (no content change detected): reindex the
 * prefixed terms for each incoming meta field, refresh the signature
 * value, and patch the stored fields inside the document data record.
 *
 * @param splitter  text splitter already bound to xdoc's term generator
 * @param udi       unique document identifier of the existing doc
 * @param doc       incoming doc; only doc.meta and doc.sig are used here
 * @param xdoc      output: existing Xapian document, updated in place
 * @return false if the existing doc or its data record cannot be read,
 *         or if the data record cannot be parsed; true otherwise.
 */
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
                                    Doc &doc, Xapian::Document& xdoc)
{
    LOGDEB0(("Db::docToXdocXattrOnly\n"));
    // Serialize access to the Xapian db handles for the whole update.
    PTMutexLocker lock(m_mutex);

    // Read existing document and its data record
    if (getDoc(udi, 0, xdoc) == 0) {
        LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
        return false;
    }
    string data;
    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
        return false;
    }

    // Clear the term lists for the incoming fields and index the new values.
    // Fields with no configured index prefix cannot be re-indexed in
    // isolation and are skipped.
    map<string, string>::iterator meta_it;
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
        const FieldTraits *ftp;
        if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
            LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
                     meta_it->first.c_str()));
            continue;
        }
        // Clear the previous terms for the field
        clearField(xdoc, ftp->pfx, ftp->wdfinc);
        LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n",
                 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
                 meta_it->second.c_str()));
        splitter->setprefix(ftp->pfx);
        splitter->setwdfinc(ftp->wdfinc);
        if (!splitter->text_to_words(meta_it->second))
            LOGDEB(("Db::xattrOnly: split failed for %s\n",
                    meta_it->first.c_str()));
    }
    // Xapian replaces any existing value in the slot, so this refreshes
    // the stored signature used by needUpdate().
    xdoc.add_value(VALUE_SIG, doc.sig);

    // Parse current data record into a dict for ease of processing
    ConfSimple datadic(data);
    if (!datadic.ok()) {
        LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
        return false;
    }

    // For each "stored" field, check if set in doc metadata and
    // update the value if it is
    const set<string>& stored = m_rcldb->m_config->getStoredFields();
    for (set<string>::const_iterator it = stored.begin();
         it != stored.end(); it++) {
        string nm = m_rcldb->m_config->fieldCanon(*it);
        if (doc.getmeta(nm, 0)) {
            string value =
                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
            datadic.set(nm, value, "");
        }
    }

    // Recreate the record. We want to do this with the local RECORD_APPEND
    // method for consistency in format, instead of using ConfSimple print
    vector<string> names = datadic.getNames("");
    data.clear();
    for (vector<string>::const_iterator it = names.begin();
         it != names.end(); it++) {
        string value;
        datadic.get(*it, value, "");
        RECORD_APPEND(data, *it, value);
    }
    // NOTE(review): if the old record already contained a sig entry,
    // the loop above re-emits it and the line below appends a second
    // one with the new value. Presumably the record reader keeps the
    // last occurrence — TODO confirm, otherwise skip keysig in the loop.
    RECORD_APPEND(data, Doc::keysig, doc.sig);
    xdoc.set_data(data);

    return true;
}
#ifdef IDX_THREADS
void Db::waitUpdIdle()
{

View File

@ -237,6 +237,10 @@ class Db {
*/
bool needUpdate(const string &udi, const string& sig, bool *existed=0);
/** Indicate if we are doing a systematic reindex. This complements
needUpdate() return */
bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
/** Add or update document identified by unique identifier.
* @param config Config object to use. Can be the same as the member config
* or a clone, to avoid sharing when called in multithread context.

View File

@ -66,6 +66,8 @@ public:
};
#endif // IDX_THREADS
class TextSplitDb;
// A class for data and methods that would have to expose
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
// 2 different ones for indexing or query as there is not much in
@ -141,6 +143,16 @@ class Db::Native {
/** Check if doc is indexed by term */
bool hasTerm(const string& udi, int idxi, const string& term);
/** Update existing Xapian document for pure extended attrs change */
bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
Doc &doc, Xapian::Document& xdoc);
/** Remove all terms currently indexed for field defined by idx prefix */
bool clearField(Xapian::Document& xdoc, const string& pfx,
Xapian::termcount wdfdec);
/** Check if term wdf is 0 and remove term if so */
bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
/** Compute list of subdocuments for a given udi. We look for documents
* indexed by a parent term matching the udi, the posting list for the
* parentterm(udi) (As suggested by James Aylett)

View File

@ -131,6 +131,10 @@ class Doc {
// ipath descendants.
bool haschildren;
// During indexing: only fields from extended attributes were set, no
// doc content. Allows for faster reindexing of existing doc
bool onlyxattr;
///////////////////////////////////////////////////////////////////
void erase() {
@ -154,10 +158,11 @@ class Doc {
idxi = 0;
haspages = false;
haschildren = false;
onlyxattr = false;
}
Doc()
: idxi(0), syntabs(false), pc(0), xdocid(0),
haspages(false), haschildren(false)
haspages(false), haschildren(false), onlyxattr(false)
{
}
/** Get value for named field. If value pointer is 0, just test existence */

View File

@ -13,7 +13,7 @@
#####################################################
# This section defines what prefix the terms inside named fields will be
# indexed with (in addition to prefix-less indexing for general search)
# ALL prefixes MUST be all UPPERCASE.
# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS)
#
# The field names should be the canonic ones, not the aliases defined in
# the following section. Don't change those which are predefined here,

View File

@ -5,6 +5,7 @@ daemloglevel = 6
daemlogfilename = /tmp/rclmontrace
indexStripChars = 1
detectxattronly = 1
topdirs = /home/dockes/projets/fulltext/testrecoll/

View File

@ -1,2 +1,2 @@
1 results
application/x-fsdirectory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
inode/directory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes

View File

@ -11,6 +11,37 @@ if test ! x$reroot = x ; then
rerootResults
fi
# iscmd CMD
# Succeed (return 0) if CMD is an executable, non-directory command.
# A CMD containing a slash is tested directly; otherwise each $PATH
# element is searched and, on success, the full path is left in the
# global $iscmdresult (only set in the PATH-search branch).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: just test it.
        if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ":" using positional parameters, restoring IFS.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && \
               iscmdresult=$d/$cmd && return 0;done
        return 1 ;;
    esac
}
# checkcmds CMD...
# Report where each command is found (or that it is missing) and
# return non-zero if any command in the list is not available.
# Relies on iscmd setting $iscmdresult for PATH lookups.
checkcmds()
{
    result=0
    for cmd in $*;do
      if iscmd $cmd 
      then
        echo $cmd is $iscmdresult
      else
        echo $cmd not found
        result=1
      fi
    done
    return $result
}
checkcmds recollq recollindex pxattr xadump || exit 1
makeindex() {
echo "Zeroing Index"
rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws

4
tests/xattr/fields Normal file
View File

@ -0,0 +1,4 @@
[prefixes]
myattr = XYXATA
[stored]
myattr =

85
tests/xattr/xattr.sh Executable file
View File

@ -0,0 +1,85 @@
#!/bin/sh
# Test extended attributes indexing. This should work both with
# "detectxattronly" set or unset in the config, but should be run with
# the variable set, because we test its function by exploiting a bug
# (see comments further)
#
# We use the RECOLL_CONFTOP variable to add our own fields configuration

thisdir=`dirname $0`
topdir=$thisdir/..
. $topdir/shared.sh

initvariables $0

# Make the local "fields" file (myattr prefix/stored config) visible
# to recollindex/recollq.
RECOLL_CONFTOP=$thisdir
export RECOLL_CONFTOP

# Echo a command line, then execute it.
xrun()
{
    echo $*
    $*
}

tstfile=${tstdata}/xattrs/tstxattrs.txt

rm -f $tstfile

# All test output is captured and diffed against xattr.txt by
# checkresult at the end.
(
# Create the file with an extended attribute, index, and query it
# by content and field
echo xattruniqueinfile > $tstfile
xrun pxattr -n myattr -v xattrunique1 $tstfile
xrun recollindex -Zi $tstfile
echo "1 result expected"
xrun recollq xattruniqueinfile
echo "1 result expected"
xrun recollq myattr:xattrunique1

# Sleep so that the ctime change from the next pxattr is distinguishable.
sleep 1
# Change the value for the field, check that the old value is gone
# and the new works
xrun pxattr -n myattr -v xattrunique2 $tstfile
xrun recollindex -i $tstfile
echo "1 result expected"
xrun recollq xattruniqueinfile
echo "0 result expected:"
xrun recollq myattr:xattrunique1
echo "1 result expected:"
xrun recollq myattr:xattrunique2

# Change the contents then the xattr. With xattronly set, recoll
# should miss the contents change and index only the xattr. That's
# a bug but we use it to check that pure xattr update indexing
# works
echo xattruniqueinfile1 > $tstfile
# Ensure ctime > mtime so the indexer takes the xattr-only path.
sleep 2
xrun pxattr -n myattr -v xattrunique3 $tstfile
xrun recollindex -i $tstfile
echo "1 result expected"
xrun recollq xattruniqueinfile
echo "0 result expected"
xrun recollq xattruniqueinfile1
echo "0 result expected:"
xrun recollq myattr:xattrunique1
echo "0 result expected:"
xrun recollq myattr:xattrunique2
echo "1 result expected:"
xrun recollq myattr:xattrunique3

# Reset the index and check that the contents were seen all right
xrun recollindex -Zi $tstfile
echo "0 result expected"
xrun recollq xattruniqueinfile
echo "1 result expected"
xrun recollq xattruniqueinfile1
echo "0 result expected:"
xrun recollq myattr:xattrunique2
echo "1 result expected:"
xrun recollq myattr:xattrunique3
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

checkresult

57
tests/xattr/xattr.txt Normal file
View File

@ -0,0 +1,57 @@
pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
1 result expected
recollq xattruniqueinfile
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
1 result expected
recollq myattr:xattrunique1
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
1 result expected
recollq xattruniqueinfile
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
0 result expected:
recollq myattr:xattrunique1
0 results
1 result expected:
recollq myattr:xattrunique2
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
1 result expected
recollq xattruniqueinfile
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
0 result expected
recollq xattruniqueinfile1
0 results
0 result expected:
recollq myattr:xattrunique1
0 results
0 result expected:
recollq myattr:xattrunique2
0 results
1 result expected:
recollq myattr:xattrunique3
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
0 result expected
recollq xattruniqueinfile
0 results
1 result expected
recollq xattruniqueinfile1
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
0 result expected:
recollq myattr:xattrunique2
0 results
1 result expected:
recollq myattr:xattrunique3
1 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes