Handle partial indexing of document restricted to metadata from extended attributes
This commit is contained in:
parent
b2eeec067b
commit
56a56500c1
@ -1 +1 @@
|
||||
1.19.5
|
||||
1.20.0
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
#include "cancelcheck.h"
|
||||
#include "rclinit.h"
|
||||
#include "execmd.h"
|
||||
|
||||
#include "extrameta.h"
|
||||
|
||||
// When using extended attributes, we have to use the ctime, because
|
||||
// this is all that gets set when the attributes are modified.
|
||||
@ -104,7 +104,7 @@ public:
|
||||
|
||||
FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
||||
: m_config(cnf), m_db(db), m_updater(updfunc),
|
||||
m_missing(new FSIFIMissingStore)
|
||||
m_missing(new FSIFIMissingStore), m_detectxattronly(false)
|
||||
#ifdef IDX_THREADS
|
||||
, m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first),
|
||||
m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
|
||||
@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
||||
{
|
||||
LOGDEB1(("FsIndexer::FsIndexer\n"));
|
||||
m_havelocalfields = m_config->hasNameAnywhere("localfields");
|
||||
m_config->getConfParam("detectxattronly", &m_detectxattronly);
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
m_stableconfig = new RclConfig(*m_config);
|
||||
@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
bool existingDoc;
|
||||
bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
|
||||
|
||||
// If ctime (which we use for the sig) differs from mtime, then at most
|
||||
// the extended attributes were changed, no need to index content.
|
||||
// This unfortunately leaves open the case where the data was
|
||||
// modified, then the extended attributes, in which case we will
|
||||
// miss the data update. We would have to store both the mtime and
|
||||
// the ctime to avoid this
|
||||
bool xattronly = m_detectxattronly && !m_db->inFullReset() &&
|
||||
existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
|
||||
|
||||
if (!needupdate) {
|
||||
LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
|
||||
if (m_updater) {
|
||||
@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
LOGDEB0(("processone: processing: [%s] %s\n",
|
||||
displayableBytes(stp->st_size).c_str(), fn.c_str()));
|
||||
|
||||
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
|
||||
if (!interner.ok()) {
|
||||
// no indexing whatsoever in this case. This typically means that
|
||||
// indexallfilenames is not set
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
interner.setMissingStore(m_missing);
|
||||
|
||||
string utf8fn = compute_utf8fn(config, fn);
|
||||
|
||||
// parent_udi is initially the same as udi, it will be used if there
|
||||
@ -662,128 +664,152 @@ FsIndexer::processonefile(RclConfig *config,
|
||||
char ascdate[30];
|
||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
bool hadNullIpath = false;
|
||||
bool hadNonNullIpath = false;
|
||||
while (fis == FileInterner::FIAgain) {
|
||||
doc.erase();
|
||||
try {
|
||||
fis = interner.internfile(doc);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR(("fsIndexer::processone: interrupted\n"));
|
||||
return FsTreeWalker::FtwStop;
|
||||
}
|
||||
string mimetype;
|
||||
|
||||
// We index at least the file name even if there was an error.
|
||||
// We'll change the signature to ensure that the indexing will
|
||||
// be retried every time.
|
||||
if (!xattronly) {
|
||||
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
|
||||
if (!interner.ok()) {
|
||||
// no indexing whatsoever in this case. This typically means that
|
||||
// indexallfilenames is not set
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
mimetype = interner.getMimetype();
|
||||
|
||||
// Internal access path for multi-document files. If empty, this is
|
||||
// for the main file.
|
||||
if (doc.ipath.empty()) {
|
||||
hadNullIpath = true;
|
||||
if (hadNonNullIpath) {
|
||||
// Note that only the filters can reliably compute
|
||||
// this. What we do is dependant of the doc order (if
|
||||
// we see the top doc first, we won't set the flag)
|
||||
doc.haschildren = true;
|
||||
interner.setMissingStore(m_missing);
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
bool hadNonNullIpath = false;
|
||||
while (fis == FileInterner::FIAgain) {
|
||||
doc.erase();
|
||||
try {
|
||||
fis = interner.internfile(doc);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR(("fsIndexer::processone: interrupted\n"));
|
||||
return FsTreeWalker::FtwStop;
|
||||
}
|
||||
} else {
|
||||
hadNonNullIpath = true;
|
||||
make_udi(fn, doc.ipath, udi);
|
||||
}
|
||||
|
||||
// Set file name, mod time and url if not done by filter
|
||||
if (doc.fmtime.empty())
|
||||
doc.fmtime = ascdate;
|
||||
if (doc.url.empty())
|
||||
doc.url = cstr_fileu + fn;
|
||||
const string *fnp = 0;
|
||||
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
|
||||
doc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||
// We index at least the file name even if there was an error.
|
||||
// We'll change the signature to ensure that the indexing will
|
||||
// be retried every time.
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||
doc.pcbytes = cbuf;
|
||||
// Document signature for up to date checks. All subdocs inherit the
|
||||
// file's.
|
||||
doc.sig = sig;
|
||||
|
||||
// If there was an error, ensure indexing will be
|
||||
// retried. This is for the once missing, later installed
|
||||
// filter case. It can make indexing much slower (if there are
|
||||
// myriads of such files, the ext script is executed for them
|
||||
// and fails every time)
|
||||
if (fis == FileInterner::FIError) {
|
||||
doc.sig += cstr_plus;
|
||||
}
|
||||
|
||||
// Possibly add fields from local config
|
||||
if (m_havelocalfields)
|
||||
setlocalfields(localfields, doc);
|
||||
|
||||
// Add document to database. If there is an ipath, add it as a children
|
||||
// of the file document.
|
||||
#ifdef IDX_THREADS
|
||||
if (m_haveSplitQ) {
|
||||
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
|
||||
cstr_null : parent_udi, doc);
|
||||
if (!m_dwqueue.put(tp)) {
|
||||
LOGERR(("processonefile: wqueue.put failed\n"));
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
if (!m_db->addOrUpdate(udi, doc.ipath.empty() ?
|
||||
cstr_null : parent_udi, doc)) {
|
||||
return FsTreeWalker::FtwError;
|
||||
// Internal access path for multi-document files. If empty, this is
|
||||
// for the main file.
|
||||
if (doc.ipath.empty()) {
|
||||
hadNullIpath = true;
|
||||
if (hadNonNullIpath) {
|
||||
// Note that only the filters can reliably compute
|
||||
// this. What we do is dependant of the doc order (if
|
||||
// we see the top doc first, we won't set the flag)
|
||||
doc.haschildren = true;
|
||||
}
|
||||
} else {
|
||||
hadNonNullIpath = true;
|
||||
make_udi(fn, doc.ipath, udi);
|
||||
}
|
||||
|
||||
// Set file name, mod time and url if not done by filter
|
||||
if (doc.fmtime.empty())
|
||||
doc.fmtime = ascdate;
|
||||
if (doc.url.empty())
|
||||
doc.url = cstr_fileu + fn;
|
||||
const string *fnp = 0;
|
||||
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
|
||||
doc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||
doc.pcbytes = cbuf;
|
||||
// Document signature for up to date checks. All subdocs inherit the
|
||||
// file's.
|
||||
doc.sig = sig;
|
||||
|
||||
// If there was an error, ensure indexing will be
|
||||
// retried. This is for the once missing, later installed
|
||||
// filter case. It can make indexing much slower (if there are
|
||||
// myriads of such files, the ext script is executed for them
|
||||
// and fails every time)
|
||||
if (fis == FileInterner::FIError) {
|
||||
doc.sig += cstr_plus;
|
||||
}
|
||||
|
||||
// Possibly add fields from local config
|
||||
if (m_havelocalfields)
|
||||
setlocalfields(localfields, doc);
|
||||
|
||||
// Add document to database. If there is an ipath, add it
|
||||
// as a child of the file document.
|
||||
#ifdef IDX_THREADS
|
||||
}
|
||||
if (m_haveSplitQ) {
|
||||
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
|
||||
cstr_null : parent_udi, doc);
|
||||
if (!m_dwqueue.put(tp)) {
|
||||
LOGERR(("processonefile: wqueue.put failed\n"));
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
if (!m_db->addOrUpdate(udi, doc.ipath.empty() ?
|
||||
cstr_null : parent_udi, doc)) {
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
#ifdef IDX_THREADS
|
||||
}
|
||||
#endif
|
||||
|
||||
// Tell what we are doing and check for interrupt request
|
||||
if (m_updater) {
|
||||
// Tell what we are doing and check for interrupt request
|
||||
if (m_updater) {
|
||||
#ifdef IDX_THREADS
|
||||
PTMutexLocker locker(m_updater->m_mutex);
|
||||
PTMutexLocker locker(m_updater->m_mutex);
|
||||
#endif
|
||||
++(m_updater->status.docsdone);
|
||||
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
|
||||
m_updater->status.dbtotdocs = m_updater->status.docsdone;
|
||||
m_updater->status.fn = fn;
|
||||
if (!doc.ipath.empty())
|
||||
m_updater->status.fn += "|" + doc.ipath;
|
||||
if (!m_updater->update()) {
|
||||
return FsTreeWalker::FtwStop;
|
||||
}
|
||||
++(m_updater->status.docsdone);
|
||||
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
|
||||
m_updater->status.dbtotdocs = m_updater->status.docsdone;
|
||||
m_updater->status.fn = fn;
|
||||
if (!doc.ipath.empty())
|
||||
m_updater->status.fn += "|" + doc.ipath;
|
||||
if (!m_updater->update()) {
|
||||
return FsTreeWalker::FtwStop;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If this doc existed and it's a container, recording for
|
||||
// possible subdoc purge (this will be used only if we don't do a
|
||||
// db-wide purge, e.g. if we're called from indexfiles()).
|
||||
LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
|
||||
existingDoc, hadNonNullIpath));
|
||||
if (existingDoc && hadNonNullIpath) {
|
||||
m_purgeCandidates.record(parent_udi);
|
||||
// If this doc existed and it's a container, recording for
|
||||
// possible subdoc purge (this will be used only if we don't do a
|
||||
// db-wide purge, e.g. if we're called from indexfiles()).
|
||||
LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
|
||||
existingDoc, hadNonNullIpath));
|
||||
if (existingDoc && hadNonNullIpath) {
|
||||
m_purgeCandidates.record(parent_udi);
|
||||
}
|
||||
}
|
||||
|
||||
// If we had no instance with a null ipath, we create an empty
|
||||
// document to stand for the file itself, to be used mainly for up
|
||||
// to date checks. Typically this happens for an mbox file.
|
||||
if (hadNullIpath == false) {
|
||||
LOGDEB1(("Creating empty doc for file\n"));
|
||||
//
|
||||
// If xattronly is set, ONLY the extattr metadata is valid and will be used
|
||||
// by the following step.
|
||||
if (xattronly || hadNullIpath == false) {
|
||||
LOGDEB(("Creating empty doc for file or pure xattr update\n"));
|
||||
Rcl::Doc fileDoc;
|
||||
fileDoc.fmtime = ascdate;
|
||||
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||
fileDoc.haschildren = true;
|
||||
fileDoc.mimetype = interner.getMimetype();
|
||||
fileDoc.url = cstr_fileu + fn;
|
||||
if (m_havelocalfields)
|
||||
setlocalfields(localfields, fileDoc);
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||
fileDoc.pcbytes = cbuf;
|
||||
if (xattronly) {
|
||||
map<string, string> xfields;
|
||||
reapXAttrs(config, fn, xfields);
|
||||
docFieldsFromXattrs(config, xfields, fileDoc);
|
||||
fileDoc.onlyxattr = true;
|
||||
} else {
|
||||
fileDoc.fmtime = ascdate;
|
||||
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||
fileDoc.haschildren = true;
|
||||
fileDoc.mimetype = mimetype;
|
||||
fileDoc.url = cstr_fileu + fn;
|
||||
if (m_havelocalfields)
|
||||
setlocalfields(localfields, fileDoc);
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||
fileDoc.pcbytes = cbuf;
|
||||
}
|
||||
|
||||
fileDoc.sig = sig;
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
|
||||
@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB {
|
||||
string m_slocalfields;
|
||||
map<string, string> m_localfields;
|
||||
|
||||
// Activate detection of xattr-only document updates. Experimental, so
|
||||
// needs a config option
|
||||
bool m_detectxattronly;
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
friend void *FsIndexerDbUpdWorker(void*);
|
||||
friend void *FsIndexerInternfileWorker(void*);
|
||||
|
||||
@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
|
||||
code &= ~(IN_ISDIR|IN_ONESHOT);
|
||||
switch (code) {
|
||||
case IN_ACCESS: return "IN_ACCESS";
|
||||
case IN_MODIFY: return "IN_MODIFY";
|
||||
case IN_ATTRIB: return "IN_ATTRIB";
|
||||
case IN_CLOSE: return "IN_CLOSE";
|
||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||
case IN_CLOSE: return "IN_CLOSE";
|
||||
case IN_OPEN: return "IN_OPEN";
|
||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||
case IN_MOVE: return "IN_MOVE";
|
||||
case IN_CREATE: return "IN_CREATE";
|
||||
case IN_DELETE: return "IN_DELETE";
|
||||
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
||||
case IN_IGNORED: return "IN_IGNORED";
|
||||
case IN_MODIFY: return "IN_MODIFY";
|
||||
case IN_MOVE: return "IN_MOVE";
|
||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
||||
case IN_OPEN: return "IN_OPEN";
|
||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||
case IN_UNMOUNT: return "IN_UNMOUNT";
|
||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||
case IN_IGNORED: return "IN_IGNORED";
|
||||
default: {
|
||||
static char msg[50];
|
||||
sprintf(msg, "Unknown event 0x%x", code);
|
||||
@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool)
|
||||
uint32_t mask = IN_MODIFY | IN_CREATE
|
||||
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
||||
#ifdef RCL_USE_XATTR
|
||||
// It seems that IN_ATTRIB is not needed to receive extattr
|
||||
// modification events, which is a bit weird because only ctime is
|
||||
// set.
|
||||
// | IN_ATTRIB
|
||||
// IN_ATTRIB used to be not needed to receive extattr
|
||||
// modification events, which was a bit weird because only ctime is
|
||||
// set, and now it is...
|
||||
| IN_ATTRIB
|
||||
#endif
|
||||
#ifdef IN_DONT_FOLLOW
|
||||
| IN_DONT_FOLLOW
|
||||
@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
|
||||
eraseWatchSubTree(m_idtopath, ev.m_path);
|
||||
}
|
||||
|
||||
// IN_ATTRIB apparently not needed, see comment above
|
||||
if (evp->mask & (IN_MODIFY)) {
|
||||
// IN_ATTRIB used to be not needed, but now it is
|
||||
if (evp->mask & (IN_MODIFY|IN_ATTRIB)) {
|
||||
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
||||
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
||||
ev.m_etyp = RclMonEvent::RCLEVT_DELETE;
|
||||
|
||||
@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Clear term from document if its frequency is 0. This should
|
||||
// probably be done by Xapian when the freq goes to 0 when removing a
|
||||
// posting, but we have to do it ourselves
|
||||
bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
|
||||
{
|
||||
LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
|
||||
|
||||
// Find the term
|
||||
Xapian::TermIterator xit;
|
||||
XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
|
||||
xrdb, m_rcldb->m_reason);
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n",
|
||||
term.c_str(), m_rcldb->m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (xit == xdoc.termlist_end() || term.compare(*xit)) {
|
||||
LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n",
|
||||
term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Clear the term if its frequency is 0
|
||||
if (xit.get_wdf() == 0) {
|
||||
LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
|
||||
XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n",
|
||||
term.c_str(), m_rcldb->m_reason.c_str()));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Holder for term + pos
|
||||
struct DocPosting {
|
||||
DocPosting(string t, Xapian::termpos ps)
|
||||
: term(t), pos(ps) {}
|
||||
string term;
|
||||
Xapian::termpos pos;
|
||||
};
|
||||
|
||||
// Clear all terms for given field for given document.
|
||||
// The terms to be cleared are all those with the appropriate
|
||||
// prefix. We also remove the postings for the unprefixed terms (that
|
||||
// is, we undo what we did when indexing).
|
||||
bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
|
||||
Xapian::termcount wdfdec)
|
||||
{
|
||||
LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
|
||||
pfx.c_str(), unsigned(xdoc.get_docid())));
|
||||
|
||||
vector<DocPosting> eraselist;
|
||||
|
||||
string wrapd = wrap_prefix(pfx);
|
||||
|
||||
m_rcldb->m_reason.clear();
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
Xapian::TermIterator xit;
|
||||
xit = xdoc.termlist_begin();
|
||||
xit.skip_to(wrapd);
|
||||
while (xit != xdoc.termlist_end() &&
|
||||
!(*xit).compare(0, wrapd.size(), wrapd)) {
|
||||
LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str()));
|
||||
Xapian::PositionIterator posit;
|
||||
for (posit = xit.positionlist_begin();
|
||||
posit != xit.positionlist_end(); posit++) {
|
||||
eraselist.push_back(DocPosting(*xit, *posit));
|
||||
eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
|
||||
}
|
||||
xit++;
|
||||
}
|
||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||
m_rcldb->m_reason = e.get_msg();
|
||||
xrdb.reopen();
|
||||
continue;
|
||||
} XCATCHERROR(m_rcldb->m_reason);
|
||||
break;
|
||||
}
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
LOGERR(("Db::clearField: failed building erase list: %s\n",
|
||||
m_rcldb->m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Now remove the found positions, and the terms if the wdf is 0
|
||||
for (vector<DocPosting>::const_iterator it = eraselist.begin();
|
||||
it != eraselist.end(); it++) {
|
||||
LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n",
|
||||
it->term.c_str(), int(it->pos)));
|
||||
XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);,
|
||||
xwdb,m_rcldb->m_reason);
|
||||
if (!m_rcldb->m_reason.empty()) {
|
||||
// Not that this normally fails for non-prefixed XXST and
|
||||
// ND, don't make a fuss
|
||||
LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n",
|
||||
it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str()));
|
||||
}
|
||||
clearDocTermIfWdf0(xdoc, it->term);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if doc given by udi is indexed by term
|
||||
bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
|
||||
{
|
||||
@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||
{
|
||||
#ifdef IDX_THREADS
|
||||
Chrono chron;
|
||||
// In the case where there is a separate (single) db update
|
||||
// thread, we only need to protect the update map update below
|
||||
// (against interaction with threads calling needUpdate()). Else,
|
||||
// all threads from above need to synchronize here
|
||||
PTMutexLocker lock(m_mutex, m_havewriteq);
|
||||
PTMutexLocker lock(m_mutex);
|
||||
#endif
|
||||
|
||||
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
||||
@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||
try {
|
||||
Xapian::docid did =
|
||||
xwdb.replace_document(uniterm, newdocument);
|
||||
#ifdef IDX_THREADS
|
||||
// Need to protect against interaction with the up-to-date checks
|
||||
// which also update the existence map
|
||||
PTMutexLocker lock(m_mutex, !m_havewriteq);
|
||||
#endif
|
||||
if (did < m_rcldb->updated.size()) {
|
||||
m_rcldb->updated[did] = true;
|
||||
LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
|
||||
@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// The splitter breaks text into words and adds postings to the Xapian
|
||||
// document. We use a single object to split all of the document
|
||||
// fields and position jumps to separate fields
|
||||
@ -1151,7 +1245,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
return false;
|
||||
|
||||
Xapian::Document newdocument;
|
||||
|
||||
|
||||
// The term processing pipeline:
|
||||
TermProcIdx tpidx;
|
||||
TermProc *nxt = &tpidx;
|
||||
@ -1165,276 +1259,287 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
TextSplitDb splitter(newdocument, nxt);
|
||||
tpidx.setTSD(&splitter);
|
||||
|
||||
// If the ipath is like a path, index the last element. This is
|
||||
// for compound documents like zip and chm for which the filter
|
||||
// uses the file path as ipath.
|
||||
if (!doc.ipath.empty() &&
|
||||
doc.ipath.find_first_not_of("0123456789") != string::npos) {
|
||||
string utf8ipathlast;
|
||||
// There is no way in hell we could have an idea of the
|
||||
// charset here, so let's hope it's ascii or utf-8. We call
|
||||
// transcode to strip the bad chars and pray
|
||||
if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
|
||||
"UTF-8", "UTF-8")) {
|
||||
splitter.text_to_words(utf8ipathlast);
|
||||
}
|
||||
}
|
||||
|
||||
// Split and index the path from the url for path-based filtering
|
||||
{
|
||||
string path = url_gpath(doc.url);
|
||||
vector<string> vpath;
|
||||
stringToTokens(path, vpath, "/");
|
||||
// If vpath is not /, the last elt is the file/dir name, not a
|
||||
// part of the path.
|
||||
if (vpath.size())
|
||||
vpath.resize(vpath.size()-1);
|
||||
splitter.curpos = 0;
|
||||
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
||||
splitter.basepos + splitter.curpos++);
|
||||
for (vector<string>::iterator it = vpath.begin();
|
||||
it != vpath.end(); it++){
|
||||
if (it->length() > 230) {
|
||||
// Just truncate it. May still be useful because of wildcards
|
||||
*it = it->substr(0, 230);
|
||||
}
|
||||
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
||||
splitter.basepos + splitter.curpos++);
|
||||
}
|
||||
}
|
||||
|
||||
// Index textual metadata. These are all indexed as text with
|
||||
// positions, as we may want to do phrase searches with them (this
|
||||
// makes no sense for keywords by the way).
|
||||
//
|
||||
// The order has no importance, and we set a position gap of 100
|
||||
// between fields to avoid false proximity matches.
|
||||
map<string, string>::iterator meta_it;
|
||||
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
||||
if (!meta_it->second.empty()) {
|
||||
const FieldTraits *ftp;
|
||||
// We don't test for an empty prefix here. Some fields are part
|
||||
// of the internal conf with an empty prefix (ie: abstract).
|
||||
if (!fieldToTraits(meta_it->first, &ftp)) {
|
||||
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
|
||||
meta_it->first.c_str()));
|
||||
continue;
|
||||
}
|
||||
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
|
||||
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
|
||||
meta_it->second.c_str()));
|
||||
splitter.setprefix(ftp->pfx);
|
||||
splitter.setwdfinc(ftp->wdfinc);
|
||||
if (!splitter.text_to_words(meta_it->second))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
||||
meta_it->first.c_str()));
|
||||
}
|
||||
}
|
||||
splitter.setprefix(string());
|
||||
splitter.setwdfinc(1);
|
||||
|
||||
if (splitter.curpos < baseTextPosition)
|
||||
splitter.basepos = baseTextPosition;
|
||||
|
||||
// Split and index body text
|
||||
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
splitter.resetStats();
|
||||
#endif
|
||||
if (!splitter.text_to_words(doc.text))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
// Reject bad data. unrecognized base64 text is characterized by
|
||||
// high avg word length and high variation (because there are
|
||||
// word-splitters like +/ inside the data).
|
||||
TextSplit::Stats::Values v = splitter.getStats();
|
||||
// v.avglen > 15 && v.sigma > 12
|
||||
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
|
||||
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
|
||||
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
|
||||
v.count, v.avglen, v.sigma, doc.url.c_str(),
|
||||
doc.ipath.c_str(), doc.text.c_str()));
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
////// Special terms for other metadata. No positions for these.
|
||||
// Mime type
|
||||
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
||||
|
||||
// Simple file name indexed unsplit for specific "file name"
|
||||
// searches. This is not the same as a filename: clause inside the
|
||||
// query language.
|
||||
// We also add a term for the filename extension if any.
|
||||
string utf8fn;
|
||||
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
||||
string fn;
|
||||
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
// We should truncate after extracting the extension, but this is
|
||||
// a pathological case anyway
|
||||
if (fn.size() > 230)
|
||||
utf8truncate(fn, 230);
|
||||
string::size_type pos = fn.rfind('.');
|
||||
if (pos != string::npos && pos != fn.length() - 1) {
|
||||
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
|
||||
fn.substr(pos + 1));
|
||||
}
|
||||
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Udi unique term: this is used for file existence/uptodate
|
||||
// checks, and unique id for the replace_document() call.
|
||||
string uniterm = make_uniterm(udi);
|
||||
newdocument.add_boolean_term(uniterm);
|
||||
// Parent term. This is used to find all descendents, mostly to delete them
|
||||
// when the parent goes away
|
||||
if (!parent_udi.empty()) {
|
||||
newdocument.add_boolean_term(make_parentterm(parent_udi));
|
||||
}
|
||||
// Dates etc.
|
||||
time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||
doc.dmtime.c_str());
|
||||
struct tm *tm = localtime(&mtime);
|
||||
char buf[9];
|
||||
snprintf(buf, 9, "%04d%02d%02d",
|
||||
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||
// Date (YYYYMMDD)
|
||||
newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
|
||||
// Month (YYYYMM)
|
||||
buf[6] = '\0';
|
||||
newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
||||
// Year (YYYY)
|
||||
buf[4] = '\0';
|
||||
newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Document data record. omindex has the following nl separated fields:
|
||||
// - url
|
||||
// - sample
|
||||
// - caption (title limited to 100 chars)
|
||||
// - mime type
|
||||
//
|
||||
// The title, author, abstract and keywords fields are special,
|
||||
// they always get stored in the document data
|
||||
// record. Configurable other fields can be, too.
|
||||
//
|
||||
// We truncate stored fields abstract, title and keywords to
|
||||
// reasonable lengths and suppress newlines (so that the data
|
||||
// record can keep a simple syntax)
|
||||
|
||||
string record;
|
||||
RECORD_APPEND(record, Doc::keyurl, doc.url);
|
||||
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
|
||||
// We left-zero-pad the times so that they are lexico-sortable
|
||||
leftzeropad(doc.fmtime, 11);
|
||||
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
|
||||
if (!doc.dmtime.empty()) {
|
||||
leftzeropad(doc.dmtime, 11);
|
||||
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
|
||||
}
|
||||
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
|
||||
|
||||
if (doc.fbytes.empty())
|
||||
doc.fbytes = doc.pcbytes;
|
||||
|
||||
if (!doc.fbytes.empty()) {
|
||||
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
|
||||
leftzeropad(doc.fbytes, 12);
|
||||
newdocument.add_value(VALUE_SIZE, doc.fbytes);
|
||||
}
|
||||
if (doc.haschildren) {
|
||||
newdocument.add_boolean_term(has_children_term);
|
||||
}
|
||||
if (!doc.pcbytes.empty())
|
||||
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
|
||||
char sizebuf[30];
|
||||
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||
RECORD_APPEND(record, Doc::keyds, sizebuf);
|
||||
|
||||
// Note that we add the signature both as a value and in the data record
|
||||
if (!doc.sig.empty()) {
|
||||
RECORD_APPEND(record, Doc::keysig, doc.sig);
|
||||
newdocument.add_value(VALUE_SIG, doc.sig);
|
||||
}
|
||||
|
||||
if (!doc.ipath.empty())
|
||||
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
|
||||
|
||||
doc.meta[Doc::keytt] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
|
||||
if (!doc.meta[Doc::keytt].empty())
|
||||
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
|
||||
|
||||
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
||||
doc.meta[Doc::keykw] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
||||
// No need to explicitly append the keywords, this will be done by
|
||||
// the "stored" loop
|
||||
|
||||
// If abstract is empty, we make up one with the beginning of the
|
||||
// document. This is then not indexed, but part of the doc data so
|
||||
// that we can return it to a query without having to decode the
|
||||
// original file.
|
||||
bool syntabs = false;
|
||||
// Note that the map accesses by operator[] create empty entries if they
|
||||
// don't exist yet.
|
||||
trimstring(doc.meta[Doc::keyabs], " \t\r\n");
|
||||
if (doc.meta[Doc::keyabs].empty()) {
|
||||
syntabs = true;
|
||||
if (!doc.text.empty())
|
||||
doc.meta[Doc::keyabs] = cstr_syntAbs +
|
||||
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
|
||||
if (doc.onlyxattr) {
|
||||
// Only updating an existing doc with new extended attributes
|
||||
// data. Need to read the old doc and its data record
|
||||
// first. This is so different from the normal processing that
|
||||
// it uses a fully separate code path (with some duplication
|
||||
// unfortunately)
|
||||
if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
|
||||
return false;
|
||||
} else {
|
||||
doc.meta[Doc::keyabs] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
||||
cstr_nc);
|
||||
}
|
||||
|
||||
const set<string>& stored = m_config->getStoredFields();
|
||||
for (set<string>::const_iterator it = stored.begin();
|
||||
it != stored.end(); it++) {
|
||||
string nm = m_config->fieldCanon(*it);
|
||||
if (!doc.meta[nm].empty()) {
|
||||
string value =
|
||||
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
|
||||
RECORD_APPEND(record, nm, value);
|
||||
// If the ipath is like a path, index the last element. This is
|
||||
// for compound documents like zip and chm for which the filter
|
||||
// uses the file path as ipath.
|
||||
if (!doc.ipath.empty() &&
|
||||
doc.ipath.find_first_not_of("0123456789") != string::npos) {
|
||||
string utf8ipathlast;
|
||||
// There is no way in hell we could have an idea of the
|
||||
// charset here, so let's hope it's ascii or utf-8. We call
|
||||
// transcode to strip the bad chars and pray
|
||||
if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
|
||||
"UTF-8", "UTF-8")) {
|
||||
splitter.text_to_words(utf8ipathlast);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If empty pages (multiple break at same pos) were recorded, save
|
||||
// them (this is because we have no way to record them in the
|
||||
// Xapian list
|
||||
if (!tpidx.m_pageincrvec.empty()) {
|
||||
ostringstream multibreaks;
|
||||
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
|
||||
if (i != 0)
|
||||
multibreaks << ",";
|
||||
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
|
||||
tpidx.m_pageincrvec[i].second;
|
||||
// Split and index the path from the url for path-based filtering
|
||||
{
|
||||
string path = url_gpath(doc.url);
|
||||
vector<string> vpath;
|
||||
stringToTokens(path, vpath, "/");
|
||||
// If vpath is not /, the last elt is the file/dir name, not a
|
||||
// part of the path.
|
||||
if (vpath.size())
|
||||
vpath.resize(vpath.size()-1);
|
||||
splitter.curpos = 0;
|
||||
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
||||
splitter.basepos + splitter.curpos++);
|
||||
for (vector<string>::iterator it = vpath.begin();
|
||||
it != vpath.end(); it++){
|
||||
if (it->length() > 230) {
|
||||
// Just truncate it. May still be useful because of wildcards
|
||||
*it = it->substr(0, 230);
|
||||
}
|
||||
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
||||
splitter.basepos + splitter.curpos++);
|
||||
}
|
||||
}
|
||||
|
||||
// Index textual metadata. These are all indexed as text with
|
||||
// positions, as we may want to do phrase searches with them (this
|
||||
// makes no sense for keywords by the way).
|
||||
//
|
||||
// The order has no importance, and we set a position gap of 100
|
||||
// between fields to avoid false proximity matches.
|
||||
map<string, string>::iterator meta_it;
|
||||
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
||||
if (!meta_it->second.empty()) {
|
||||
const FieldTraits *ftp;
|
||||
// We don't test for an empty prefix here. Some fields are part
|
||||
// of the internal conf with an empty prefix (ie: abstract).
|
||||
if (!fieldToTraits(meta_it->first, &ftp)) {
|
||||
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
|
||||
meta_it->first.c_str()));
|
||||
continue;
|
||||
}
|
||||
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
|
||||
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
|
||||
meta_it->second.c_str()));
|
||||
splitter.setprefix(ftp->pfx);
|
||||
splitter.setwdfinc(ftp->wdfinc);
|
||||
if (!splitter.text_to_words(meta_it->second))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
||||
meta_it->first.c_str()));
|
||||
}
|
||||
}
|
||||
splitter.setprefix(string());
|
||||
splitter.setwdfinc(1);
|
||||
|
||||
if (splitter.curpos < baseTextPosition)
|
||||
splitter.basepos = baseTextPosition;
|
||||
|
||||
// Split and index body text
|
||||
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
splitter.resetStats();
|
||||
#endif
|
||||
if (!splitter.text_to_words(doc.text))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
// Reject bad data. unrecognized base64 text is characterized by
|
||||
// high avg word length and high variation (because there are
|
||||
// word-splitters like +/ inside the data).
|
||||
TextSplit::Stats::Values v = splitter.getStats();
|
||||
// v.avglen > 15 && v.sigma > 12
|
||||
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
|
||||
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
|
||||
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
|
||||
v.count, v.avglen, v.sigma, doc.url.c_str(),
|
||||
doc.ipath.c_str(), doc.text.c_str()));
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
////// Special terms for other metadata. No positions for these.
|
||||
// Mime type
|
||||
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
||||
|
||||
// Simple file name indexed unsplit for specific "file name"
|
||||
// searches. This is not the same as a filename: clause inside the
|
||||
// query language.
|
||||
// We also add a term for the filename extension if any.
|
||||
string utf8fn;
|
||||
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
||||
string fn;
|
||||
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
// We should truncate after extracting the extension, but this is
|
||||
// a pathological case anyway
|
||||
if (fn.size() > 230)
|
||||
utf8truncate(fn, 230);
|
||||
string::size_type pos = fn.rfind('.');
|
||||
if (pos != string::npos && pos != fn.length() - 1) {
|
||||
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
|
||||
fn.substr(pos + 1));
|
||||
}
|
||||
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
|
||||
}
|
||||
}
|
||||
|
||||
newdocument.add_boolean_term(uniterm);
|
||||
// Parent term. This is used to find all descendents, mostly
|
||||
// to delete them when the parent goes away
|
||||
if (!parent_udi.empty()) {
|
||||
newdocument.add_boolean_term(make_parentterm(parent_udi));
|
||||
}
|
||||
// Dates etc.
|
||||
time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||
doc.dmtime.c_str());
|
||||
struct tm *tm = localtime(&mtime);
|
||||
char buf[9];
|
||||
snprintf(buf, 9, "%04d%02d%02d",
|
||||
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||
// Date (YYYYMMDD)
|
||||
newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
|
||||
// Month (YYYYMM)
|
||||
buf[6] = '\0';
|
||||
newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
||||
// Year (YYYY)
|
||||
buf[4] = '\0';
|
||||
newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Document data record. omindex has the following nl separated fields:
|
||||
// - url
|
||||
// - sample
|
||||
// - caption (title limited to 100 chars)
|
||||
// - mime type
|
||||
//
|
||||
// The title, author, abstract and keywords fields are special,
|
||||
// they always get stored in the document data
|
||||
// record. Configurable other fields can be, too.
|
||||
//
|
||||
// We truncate stored fields abstract, title and keywords to
|
||||
// reasonable lengths and suppress newlines (so that the data
|
||||
// record can keep a simple syntax)
|
||||
|
||||
string record;
|
||||
RECORD_APPEND(record, Doc::keyurl, doc.url);
|
||||
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
|
||||
// We left-zero-pad the times so that they are lexico-sortable
|
||||
leftzeropad(doc.fmtime, 11);
|
||||
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
|
||||
if (!doc.dmtime.empty()) {
|
||||
leftzeropad(doc.dmtime, 11);
|
||||
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
|
||||
}
|
||||
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
|
||||
|
||||
if (doc.fbytes.empty())
|
||||
doc.fbytes = doc.pcbytes;
|
||||
|
||||
if (!doc.fbytes.empty()) {
|
||||
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
|
||||
leftzeropad(doc.fbytes, 12);
|
||||
newdocument.add_value(VALUE_SIZE, doc.fbytes);
|
||||
}
|
||||
if (doc.haschildren) {
|
||||
newdocument.add_boolean_term(has_children_term);
|
||||
}
|
||||
if (!doc.pcbytes.empty())
|
||||
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
|
||||
char sizebuf[30];
|
||||
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||
RECORD_APPEND(record, Doc::keyds, sizebuf);
|
||||
|
||||
// Note that we add the signature both as a value and in the data record
|
||||
if (!doc.sig.empty()) {
|
||||
RECORD_APPEND(record, Doc::keysig, doc.sig);
|
||||
newdocument.add_value(VALUE_SIG, doc.sig);
|
||||
}
|
||||
|
||||
if (!doc.ipath.empty())
|
||||
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
|
||||
|
||||
doc.meta[Doc::keytt] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
|
||||
if (!doc.meta[Doc::keytt].empty())
|
||||
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
|
||||
|
||||
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
||||
doc.meta[Doc::keykw] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
||||
// No need to explicitly append the keywords, this will be done by
|
||||
// the "stored" loop
|
||||
|
||||
// If abstract is empty, we make up one with the beginning of the
|
||||
// document. This is then not indexed, but part of the doc data so
|
||||
// that we can return it to a query without having to decode the
|
||||
// original file.
|
||||
bool syntabs = false;
|
||||
// Note that the map accesses by operator[] create empty entries if they
|
||||
// don't exist yet.
|
||||
trimstring(doc.meta[Doc::keyabs], " \t\r\n");
|
||||
if (doc.meta[Doc::keyabs].empty()) {
|
||||
syntabs = true;
|
||||
if (!doc.text.empty())
|
||||
doc.meta[Doc::keyabs] = cstr_syntAbs +
|
||||
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
|
||||
} else {
|
||||
doc.meta[Doc::keyabs] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
||||
cstr_nc);
|
||||
}
|
||||
|
||||
const set<string>& stored = m_config->getStoredFields();
|
||||
for (set<string>::const_iterator it = stored.begin();
|
||||
it != stored.end(); it++) {
|
||||
string nm = m_config->fieldCanon(*it);
|
||||
if (!doc.meta[nm].empty()) {
|
||||
string value =
|
||||
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
|
||||
RECORD_APPEND(record, nm, value);
|
||||
}
|
||||
}
|
||||
|
||||
// If empty pages (multiple break at same pos) were recorded, save
|
||||
// them (this is because we have no way to record them in the
|
||||
// Xapian list
|
||||
if (!tpidx.m_pageincrvec.empty()) {
|
||||
ostringstream multibreaks;
|
||||
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
|
||||
if (i != 0)
|
||||
multibreaks << ",";
|
||||
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
|
||||
tpidx.m_pageincrvec[i].second;
|
||||
}
|
||||
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
|
||||
}
|
||||
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
|
||||
}
|
||||
|
||||
// If the file's md5 was computed, add value and term.
|
||||
// The value is optionally used for query result duplicate elimination,
|
||||
// and the term to find the duplicates.
|
||||
// We don't do this for empty docs.
|
||||
const string *md5;
|
||||
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
|
||||
md5->compare(cstr_md5empty)) {
|
||||
string digest;
|
||||
MD5HexScan(*md5, digest);
|
||||
newdocument.add_value(VALUE_MD5, digest);
|
||||
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
|
||||
// If the file's md5 was computed, add value and term.
|
||||
// The value is optionally used for query result duplicate elimination,
|
||||
// and the term to find the duplicates.
|
||||
// We don't do this for empty docs.
|
||||
const string *md5;
|
||||
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
|
||||
md5->compare(cstr_md5empty)) {
|
||||
string digest;
|
||||
MD5HexScan(*md5, digest);
|
||||
newdocument.add_value(VALUE_MD5, digest);
|
||||
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
|
||||
}
|
||||
|
||||
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
}
|
||||
|
||||
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
if (m_ndb->m_havewriteq) {
|
||||
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
||||
@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
doc.text.length());
|
||||
}
|
||||
|
||||
// Update an existing Xapian document when only the extended attributes
// of the file changed (no content reindexing).
//
// The stored document for udi is read into xdoc. For every field in
// doc.meta which has a non-empty index prefix, the terms previously
// indexed for that prefix are cleared and the new value is split again.
// The signature value is then replaced, and the data record is rebuilt
// from the old one, with the stored fields found in doc.meta and the
// signature updated.
//
// @param splitter text splitter used to generate the terms for the new
//        field values (presumably already bound to xdoc by the caller;
//        the binding is not visible here - TODO confirm).
// @param udi unique identifier of the document to update.
// @param doc input document: supplies the metadata fields and the new
//        signature.
// @param xdoc output: the existing Xapian document, updated in place.
// @return false if the existing document or its data record can't be
//        read, or if the data record can't be parsed; true otherwise.
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
                                    Doc &doc, Xapian::Document& xdoc)
{
    LOGDEB0(("Db::docToXdocXattrOnly\n"));
    PTMutexLocker lock(m_mutex);

    // Read existing document and its data record
    if (getDoc(udi, 0, xdoc) == 0) {
        LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
        return false;
    }
    string data;
    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
        return false;
    }

    // Clear the term lists for the incoming fields and index the new values
    map<string, string>::iterator meta_it;
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); ++meta_it) {
        const FieldTraits *ftp;
        // Contrary to the full indexing path, fields with an empty
        // prefix are skipped here: only prefixed terms are cleared and
        // regenerated.
        if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
            LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
                     meta_it->first.c_str()));
            continue;
        }
        // Clear the previous terms for the field
        clearField(xdoc, ftp->pfx, ftp->wdfinc);
        LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n",
                 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
                 meta_it->second.c_str()));
        splitter->setprefix(ftp->pfx);
        splitter->setwdfinc(ftp->wdfinc);
        if (!splitter->text_to_words(meta_it->second))
            LOGDEB(("Db::xattrOnly: split failed for %s\n",
                    meta_it->first.c_str()));
    }
    xdoc.add_value(VALUE_SIG, doc.sig);

    // Parse current data record into a dict for ease of processing
    ConfSimple datadic(data);
    if (!datadic.ok()) {
        LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
        return false;
    }

    // For each "stored" field, check if set in doc metadata and
    // update the value if it is
    const set<string>& stored = m_rcldb->m_config->getStoredFields();
    for (set<string>::const_iterator it = stored.begin();
         it != stored.end(); ++it) {
        string nm = m_rcldb->m_config->fieldCanon(*it);
        if (doc.getmeta(nm, 0)) {
            string value =
                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
            datadic.set(nm, value, "");
        }
    }

    // Recreate the record. We want to do this with the local RECORD_APPEND
    // method for consistency in format, instead of using ConfSimple print
    vector<string> names = datadic.getNames("");
    data.clear();
    for (vector<string>::const_iterator it = names.begin();
         it != names.end(); ++it) {
        // The signature entry is written once, below, with the new
        // value. Copying the old entry too would leave a stale
        // duplicate signature line in the record.
        if (*it == Doc::keysig)
            continue;
        string value;
        datadic.get(*it, value, "");
        RECORD_APPEND(data, *it, value);
    }
    RECORD_APPEND(data, Doc::keysig, doc.sig);
    xdoc.set_data(data);
    return true;
}
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
void Db::waitUpdIdle()
|
||||
{
|
||||
|
||||
@ -237,6 +237,10 @@ class Db {
|
||||
*/
|
||||
bool needUpdate(const string &udi, const string& sig, bool *existed=0);
|
||||
|
||||
/** Indicate if we are doing a systematic reindex. This complements
|
||||
needUpdate() return */
|
||||
bool inFullReset()
{
    // Either condition alone means that the index is being fully rebuilt
    if (o_inPlaceReset)
        return true;
    return m_mode == DbTrunc;
}
|
||||
|
||||
/** Add or update document identified by unique identifier.
|
||||
* @param config Config object to use. Can be the same as the member config
|
||||
* or a clone, to avoid sharing when called in multithread context.
|
||||
|
||||
@ -66,6 +66,8 @@ public:
|
||||
};
|
||||
#endif // IDX_THREADS
|
||||
|
||||
class TextSplitDb;
|
||||
|
||||
// A class for data and methods that would have to expose
|
||||
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
||||
// 2 different ones for indexing or query as there is not much in
|
||||
@ -141,6 +143,16 @@ class Db::Native {
|
||||
/** Check if doc is indexed by term */
|
||||
bool hasTerm(const string& udi, int idxi, const string& term);
|
||||
|
||||
/** Update existing Xapian document for pure extended attrs change */
|
||||
bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
||||
Doc &doc, Xapian::Document& xdoc);
|
||||
/** Remove all terms currently indexed for field defined by idx prefix */
|
||||
bool clearField(Xapian::Document& xdoc, const string& pfx,
|
||||
Xapian::termcount wdfdec);
|
||||
|
||||
/** Check if term wdf is 0 and remove term if so */
|
||||
bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
|
||||
|
||||
/** Compute list of subdocuments for a given udi. We look for documents
|
||||
* indexed by a parent term matching the udi, the posting list for the
|
||||
* parentterm(udi) (As suggested by James Aylett)
|
||||
|
||||
@ -131,6 +131,10 @@ class Doc {
|
||||
// ipath descendants.
|
||||
bool haschildren;
|
||||
|
||||
// During indexing: only fields from extended attributes were set, no
|
||||
// doc content. Allows for faster reindexing of existing doc
|
||||
bool onlyxattr;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
void erase() {
|
||||
@ -154,10 +158,11 @@ class Doc {
|
||||
idxi = 0;
|
||||
haspages = false;
|
||||
haschildren = false;
|
||||
onlyxattr = false;
|
||||
}
|
||||
Doc()
|
||||
: idxi(0), syntabs(false), pc(0), xdocid(0),
|
||||
haspages(false), haschildren(false)
|
||||
haspages(false), haschildren(false), onlyxattr(false)
|
||||
{
|
||||
}
|
||||
/** Get value for named field. If value pointer is 0, just test existence */
|
||||
|
||||
@ -13,7 +13,7 @@
|
||||
#####################################################
|
||||
# This section defines what prefix the terms inside named fields will be
|
||||
# indexed with (in addition to prefix-less indexing for general search)
|
||||
# ALL prefixes MUST be all UPPERCASE.
|
||||
# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS)
|
||||
#
|
||||
# The field names should be the canonic ones, not the aliases defined in
|
||||
# the following section. Don't change those which are predefined here,
|
||||
|
||||
@ -5,6 +5,7 @@ daemloglevel = 6
|
||||
daemlogfilename = /tmp/rclmontrace
|
||||
|
||||
indexStripChars = 1
|
||||
detectxattronly = 1
|
||||
|
||||
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
||||
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
1 results
|
||||
application/x-fsdirectory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
|
||||
inode/directory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
|
||||
|
||||
@ -11,6 +11,37 @@ if test ! x$reroot = x ; then
|
||||
rerootResults
|
||||
fi
|
||||
|
||||
# Check if a command is available.
#   $1: command name, or a path if it contains a '/'
# A path is tested directly, a simple name is looked up in each
# directory listed in $PATH. A match must be executable and not a
# directory.
# Returns 0 if the command was found, else 1. When found, the location
# is stored in the global variable iscmdresult (for both the path and
# the simple name cases, so that callers never see a stale value).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd" ; then
            iscmdresult=$cmd
            return 0
        fi
        return 1 ;;
    *)
        # Split $PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps each directory intact even if it contains spaces
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd" ; then
                iscmdresult=$d/$cmd
                return 0
            fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that all the commands named in the arguments are available
# according to iscmd, and print one status line for each of them.
# Returns 0 if all commands were found, 1 if at least one is missing.
checkcmds() {
    result=0
    for cmd in $*
    do
        if ! iscmd $cmd; then
            echo $cmd not found
            result=1
            continue
        fi
        echo $cmd is $iscmdresult
    done
    return $result
}
|
||||
|
||||
checkcmds recollq recollindex pxattr xadump || exit 1
|
||||
|
||||
makeindex() {
|
||||
echo "Zeroing Index"
|
||||
rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws
|
||||
|
||||
4
tests/xattr/fields
Normal file
4
tests/xattr/fields
Normal file
@ -0,0 +1,4 @@
|
||||
[prefixes]
|
||||
myattr = XYXATA
|
||||
[stored]
|
||||
myattr =
|
||||
85
tests/xattr/xattr.sh
Executable file
85
tests/xattr/xattr.sh
Executable file
@ -0,0 +1,85 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Test extended attributes indexing. This should work both with
|
||||
# "detectxattronly" set or unset in the config, but should be run with
|
||||
# the variable set, because we test its function by exploiting a bug
|
||||
# (see the comments further down, before the last reindexing step)
|
||||
#
|
||||
# We use the RECOLL_CONFTOP variable to add our own fields configuration
|
||||
|
||||
thisdir=`dirname $0`
|
||||
topdir=$thisdir/..
|
||||
. $topdir/shared.sh
|
||||
|
||||
initvariables $0
|
||||
|
||||
RECOLL_CONFTOP=$thisdir
|
||||
export RECOLL_CONFTOP
|
||||
|
||||
# Print a command line on stdout, then execute it, so that the test
# output shows each command followed by its results.
xrun() {
    echo $*
    $*
}
|
||||
|
||||
tstfile=${tstdata}/xattrs/tstxattrs.txt
|
||||
rm -f $tstfile
|
||||
|
||||
(
|
||||
# Create the file with an extended attribute, index, and query it
|
||||
# by content and field
|
||||
echo xattruniqueinfile > $tstfile
|
||||
xrun pxattr -n myattr -v xattrunique1 $tstfile
|
||||
xrun recollindex -Zi $tstfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "1 result expected"
|
||||
xrun recollq myattr:xattrunique1
|
||||
|
||||
sleep 1
|
||||
|
||||
# Change the value for the field, check that the old value is gone
|
||||
# and the new works
|
||||
xrun pxattr -n myattr -v xattrunique2 $tstfile
|
||||
xrun recollindex -i $tstfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique1
|
||||
echo "1 result expected:"
|
||||
xrun recollq myattr:xattrunique2
|
||||
|
||||
# Change the contents then the xattr. With xattronly set, recoll
|
||||
# should miss the contents change and index only the xattr. That's
|
||||
# a bug but we use it to check that pure xattr update indexing
|
||||
# works
|
||||
echo xattruniqueinfile1 > $tstfile
|
||||
sleep 2
|
||||
xrun pxattr -n myattr -v xattrunique3 $tstfile
|
||||
xrun recollindex -i $tstfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "0 result expected"
|
||||
xrun recollq xattruniqueinfile1
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique1
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique2
|
||||
echo "1 result expected:"
|
||||
xrun recollq myattr:xattrunique3
|
||||
|
||||
# Reset the index and check that the contents were seen all right
|
||||
xrun recollindex -Zi $tstfile
|
||||
echo "0 result expected"
|
||||
xrun recollq xattruniqueinfile
|
||||
echo "1 result expected"
|
||||
xrun recollq xattruniqueinfile1
|
||||
echo "0 result expected:"
|
||||
xrun recollq myattr:xattrunique2
|
||||
echo "1 result expected:"
|
||||
xrun recollq myattr:xattrunique3
|
||||
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
checkresult
|
||||
57
tests/xattr/xattr.txt
Normal file
57
tests/xattr/xattr.txt
Normal file
@ -0,0 +1,57 @@
|
||||
pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
1 result expected
|
||||
recollq xattruniqueinfile
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
1 result expected
|
||||
recollq myattr:xattrunique1
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
1 result expected
|
||||
recollq xattruniqueinfile
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique1
|
||||
0 results
|
||||
1 result expected:
|
||||
recollq myattr:xattrunique2
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
1 result expected
|
||||
recollq xattruniqueinfile
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
0 result expected
|
||||
recollq xattruniqueinfile1
|
||||
0 results
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique1
|
||||
0 results
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique2
|
||||
0 results
|
||||
1 result expected:
|
||||
recollq myattr:xattrunique3
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||
0 result expected
|
||||
recollq xattruniqueinfile
|
||||
0 results
|
||||
1 result expected
|
||||
recollq xattruniqueinfile1
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
|
||||
0 result expected:
|
||||
recollq myattr:xattrunique2
|
||||
0 results
|
||||
1 result expected:
|
||||
recollq myattr:xattrunique3
|
||||
1 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
|
||||
Loading…
x
Reference in New Issue
Block a user