Handle partial indexing of document restricted to metadata from extended attributes
This commit is contained in:
parent
b2eeec067b
commit
56a56500c1
@ -1 +1 @@
|
|||||||
1.19.5
|
1.20.0
|
||||||
|
|||||||
@ -45,7 +45,7 @@
|
|||||||
#include "cancelcheck.h"
|
#include "cancelcheck.h"
|
||||||
#include "rclinit.h"
|
#include "rclinit.h"
|
||||||
#include "execmd.h"
|
#include "execmd.h"
|
||||||
|
#include "extrameta.h"
|
||||||
|
|
||||||
// When using extended attributes, we have to use the ctime, because
|
// When using extended attributes, we have to use the ctime, because
|
||||||
// this is all that gets set when the attributes are modified.
|
// this is all that gets set when the attributes are modified.
|
||||||
@ -104,7 +104,7 @@ public:
|
|||||||
|
|
||||||
FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
||||||
: m_config(cnf), m_db(db), m_updater(updfunc),
|
: m_config(cnf), m_db(db), m_updater(updfunc),
|
||||||
m_missing(new FSIFIMissingStore)
|
m_missing(new FSIFIMissingStore), m_detectxattronly(false)
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
, m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first),
|
, m_iwqueue("Internfile", cnf->getThrConf(RclConfig::ThrIntern).first),
|
||||||
m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
|
m_dwqueue("Split", cnf->getThrConf(RclConfig::ThrSplit).first)
|
||||||
@ -112,6 +112,7 @@ FsIndexer::FsIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc)
|
|||||||
{
|
{
|
||||||
LOGDEB1(("FsIndexer::FsIndexer\n"));
|
LOGDEB1(("FsIndexer::FsIndexer\n"));
|
||||||
m_havelocalfields = m_config->hasNameAnywhere("localfields");
|
m_havelocalfields = m_config->hasNameAnywhere("localfields");
|
||||||
|
m_config->getConfParam("detectxattronly", &m_detectxattronly);
|
||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
m_stableconfig = new RclConfig(*m_config);
|
m_stableconfig = new RclConfig(*m_config);
|
||||||
@ -625,6 +626,15 @@ FsIndexer::processonefile(RclConfig *config,
|
|||||||
bool existingDoc;
|
bool existingDoc;
|
||||||
bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
|
bool needupdate = m_db->needUpdate(udi, sig, &existingDoc);
|
||||||
|
|
||||||
|
// If ctime (which we use for the sig) differs from mtime, then at most
|
||||||
|
// the extended attributes were changed, no need to index content.
|
||||||
|
// This unfortunately leaves open the case where the data was
|
||||||
|
// modified, then the extended attributes, in which case we will
|
||||||
|
// miss the data update. We would have to store both the mtime and
|
||||||
|
// the ctime to avoid this
|
||||||
|
bool xattronly = m_detectxattronly && !m_db->inFullReset() &&
|
||||||
|
existingDoc && needupdate && (stp->st_mtime < stp->st_ctime);
|
||||||
|
|
||||||
if (!needupdate) {
|
if (!needupdate) {
|
||||||
LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
|
LOGDEB0(("processone: up to date: %s\n", fn.c_str()));
|
||||||
if (m_updater) {
|
if (m_updater) {
|
||||||
@ -644,14 +654,6 @@ FsIndexer::processonefile(RclConfig *config,
|
|||||||
LOGDEB0(("processone: processing: [%s] %s\n",
|
LOGDEB0(("processone: processing: [%s] %s\n",
|
||||||
displayableBytes(stp->st_size).c_str(), fn.c_str()));
|
displayableBytes(stp->st_size).c_str(), fn.c_str()));
|
||||||
|
|
||||||
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
|
|
||||||
if (!interner.ok()) {
|
|
||||||
// no indexing whatsoever in this case. This typically means that
|
|
||||||
// indexallfilenames is not set
|
|
||||||
return FsTreeWalker::FtwOk;
|
|
||||||
}
|
|
||||||
interner.setMissingStore(m_missing);
|
|
||||||
|
|
||||||
string utf8fn = compute_utf8fn(config, fn);
|
string utf8fn = compute_utf8fn(config, fn);
|
||||||
|
|
||||||
// parent_udi is initially the same as udi, it will be used if there
|
// parent_udi is initially the same as udi, it will be used if there
|
||||||
@ -662,128 +664,152 @@ FsIndexer::processonefile(RclConfig *config,
|
|||||||
char ascdate[30];
|
char ascdate[30];
|
||||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||||
|
|
||||||
FileInterner::Status fis = FileInterner::FIAgain;
|
|
||||||
bool hadNullIpath = false;
|
bool hadNullIpath = false;
|
||||||
bool hadNonNullIpath = false;
|
string mimetype;
|
||||||
while (fis == FileInterner::FIAgain) {
|
|
||||||
doc.erase();
|
|
||||||
try {
|
|
||||||
fis = interner.internfile(doc);
|
|
||||||
} catch (CancelExcept) {
|
|
||||||
LOGERR(("fsIndexer::processone: interrupted\n"));
|
|
||||||
return FsTreeWalker::FtwStop;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We index at least the file name even if there was an error.
|
if (!xattronly) {
|
||||||
// We'll change the signature to ensure that the indexing will
|
FileInterner interner(fn, stp, config, FileInterner::FIF_none);
|
||||||
// be retried every time.
|
if (!interner.ok()) {
|
||||||
|
// no indexing whatsoever in this case. This typically means that
|
||||||
|
// indexallfilenames is not set
|
||||||
|
return FsTreeWalker::FtwOk;
|
||||||
|
}
|
||||||
|
mimetype = interner.getMimetype();
|
||||||
|
|
||||||
// Internal access path for multi-document files. If empty, this is
|
interner.setMissingStore(m_missing);
|
||||||
// for the main file.
|
FileInterner::Status fis = FileInterner::FIAgain;
|
||||||
if (doc.ipath.empty()) {
|
bool hadNonNullIpath = false;
|
||||||
hadNullIpath = true;
|
while (fis == FileInterner::FIAgain) {
|
||||||
if (hadNonNullIpath) {
|
doc.erase();
|
||||||
// Note that only the filters can reliably compute
|
try {
|
||||||
// this. What we do is dependant of the doc order (if
|
fis = interner.internfile(doc);
|
||||||
// we see the top doc first, we won't set the flag)
|
} catch (CancelExcept) {
|
||||||
doc.haschildren = true;
|
LOGERR(("fsIndexer::processone: interrupted\n"));
|
||||||
|
return FsTreeWalker::FtwStop;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
hadNonNullIpath = true;
|
|
||||||
make_udi(fn, doc.ipath, udi);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set file name, mod time and url if not done by filter
|
// We index at least the file name even if there was an error.
|
||||||
if (doc.fmtime.empty())
|
// We'll change the signature to ensure that the indexing will
|
||||||
doc.fmtime = ascdate;
|
// be retried every time.
|
||||||
if (doc.url.empty())
|
|
||||||
doc.url = cstr_fileu + fn;
|
|
||||||
const string *fnp = 0;
|
|
||||||
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
|
|
||||||
doc.meta[Rcl::Doc::keyfn] = utf8fn;
|
|
||||||
|
|
||||||
char cbuf[100];
|
// Internal access path for multi-document files. If empty, this is
|
||||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
// for the main file.
|
||||||
doc.pcbytes = cbuf;
|
if (doc.ipath.empty()) {
|
||||||
// Document signature for up to date checks. All subdocs inherit the
|
hadNullIpath = true;
|
||||||
// file's.
|
if (hadNonNullIpath) {
|
||||||
doc.sig = sig;
|
// Note that only the filters can reliably compute
|
||||||
|
// this. What we do is dependant of the doc order (if
|
||||||
|
// we see the top doc first, we won't set the flag)
|
||||||
|
doc.haschildren = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
hadNonNullIpath = true;
|
||||||
|
make_udi(fn, doc.ipath, udi);
|
||||||
|
}
|
||||||
|
|
||||||
// If there was an error, ensure indexing will be
|
// Set file name, mod time and url if not done by filter
|
||||||
// retried. This is for the once missing, later installed
|
if (doc.fmtime.empty())
|
||||||
// filter case. It can make indexing much slower (if there are
|
doc.fmtime = ascdate;
|
||||||
// myriads of such files, the ext script is executed for them
|
if (doc.url.empty())
|
||||||
// and fails every time)
|
doc.url = cstr_fileu + fn;
|
||||||
if (fis == FileInterner::FIError) {
|
const string *fnp = 0;
|
||||||
doc.sig += cstr_plus;
|
if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty())
|
||||||
}
|
doc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||||
|
|
||||||
// Possibly add fields from local config
|
char cbuf[100];
|
||||||
if (m_havelocalfields)
|
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||||
setlocalfields(localfields, doc);
|
doc.pcbytes = cbuf;
|
||||||
|
// Document signature for up to date checks. All subdocs inherit the
|
||||||
|
// file's.
|
||||||
|
doc.sig = sig;
|
||||||
|
|
||||||
// Add document to database. If there is an ipath, add it as a children
|
// If there was an error, ensure indexing will be
|
||||||
// of the file document.
|
// retried. This is for the once missing, later installed
|
||||||
|
// filter case. It can make indexing much slower (if there are
|
||||||
|
// myriads of such files, the ext script is executed for them
|
||||||
|
// and fails every time)
|
||||||
|
if (fis == FileInterner::FIError) {
|
||||||
|
doc.sig += cstr_plus;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Possibly add fields from local config
|
||||||
|
if (m_havelocalfields)
|
||||||
|
setlocalfields(localfields, doc);
|
||||||
|
|
||||||
|
// Add document to database. If there is an ipath, add it
|
||||||
|
// as a child of the file document.
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
if (m_haveSplitQ) {
|
if (m_haveSplitQ) {
|
||||||
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
|
DbUpdTask *tp = new DbUpdTask(udi, doc.ipath.empty() ?
|
||||||
cstr_null : parent_udi, doc);
|
cstr_null : parent_udi, doc);
|
||||||
if (!m_dwqueue.put(tp)) {
|
if (!m_dwqueue.put(tp)) {
|
||||||
LOGERR(("processonefile: wqueue.put failed\n"));
|
LOGERR(("processonefile: wqueue.put failed\n"));
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#endif
|
#endif
|
||||||
if (!m_db->addOrUpdate(udi, doc.ipath.empty() ?
|
if (!m_db->addOrUpdate(udi, doc.ipath.empty() ?
|
||||||
cstr_null : parent_udi, doc)) {
|
cstr_null : parent_udi, doc)) {
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
}
|
}
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Tell what we are doing and check for interrupt request
|
// Tell what we are doing and check for interrupt request
|
||||||
if (m_updater) {
|
if (m_updater) {
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
PTMutexLocker locker(m_updater->m_mutex);
|
PTMutexLocker locker(m_updater->m_mutex);
|
||||||
#endif
|
#endif
|
||||||
++(m_updater->status.docsdone);
|
++(m_updater->status.docsdone);
|
||||||
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
|
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
|
||||||
m_updater->status.dbtotdocs = m_updater->status.docsdone;
|
m_updater->status.dbtotdocs = m_updater->status.docsdone;
|
||||||
m_updater->status.fn = fn;
|
m_updater->status.fn = fn;
|
||||||
if (!doc.ipath.empty())
|
if (!doc.ipath.empty())
|
||||||
m_updater->status.fn += "|" + doc.ipath;
|
m_updater->status.fn += "|" + doc.ipath;
|
||||||
if (!m_updater->update()) {
|
if (!m_updater->update()) {
|
||||||
return FsTreeWalker::FtwStop;
|
return FsTreeWalker::FtwStop;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// If this doc existed and it's a container, recording for
|
// If this doc existed and it's a container, recording for
|
||||||
// possible subdoc purge (this will be used only if we don't do a
|
// possible subdoc purge (this will be used only if we don't do a
|
||||||
// db-wide purge, e.g. if we're called from indexfiles()).
|
// db-wide purge, e.g. if we're called from indexfiles()).
|
||||||
LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
|
LOGDEB2(("processOnefile: existingDoc %d hadNonNullIpath %d\n",
|
||||||
existingDoc, hadNonNullIpath));
|
existingDoc, hadNonNullIpath));
|
||||||
if (existingDoc && hadNonNullIpath) {
|
if (existingDoc && hadNonNullIpath) {
|
||||||
m_purgeCandidates.record(parent_udi);
|
m_purgeCandidates.record(parent_udi);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we had no instance with a null ipath, we create an empty
|
// If we had no instance with a null ipath, we create an empty
|
||||||
// document to stand for the file itself, to be used mainly for up
|
// document to stand for the file itself, to be used mainly for up
|
||||||
// to date checks. Typically this happens for an mbox file.
|
// to date checks. Typically this happens for an mbox file.
|
||||||
if (hadNullIpath == false) {
|
//
|
||||||
LOGDEB1(("Creating empty doc for file\n"));
|
// If xattronly is set, ONLY the extattr metadata is valid and will be used
|
||||||
|
// by the following step.
|
||||||
|
if (xattronly || hadNullIpath == false) {
|
||||||
|
LOGDEB(("Creating empty doc for file or pure xattr update\n"));
|
||||||
Rcl::Doc fileDoc;
|
Rcl::Doc fileDoc;
|
||||||
fileDoc.fmtime = ascdate;
|
if (xattronly) {
|
||||||
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
|
map<string, string> xfields;
|
||||||
fileDoc.haschildren = true;
|
reapXAttrs(config, fn, xfields);
|
||||||
fileDoc.mimetype = interner.getMimetype();
|
docFieldsFromXattrs(config, xfields, fileDoc);
|
||||||
fileDoc.url = cstr_fileu + fn;
|
fileDoc.onlyxattr = true;
|
||||||
if (m_havelocalfields)
|
} else {
|
||||||
setlocalfields(localfields, fileDoc);
|
fileDoc.fmtime = ascdate;
|
||||||
char cbuf[100];
|
fileDoc.meta[Rcl::Doc::keyfn] = utf8fn;
|
||||||
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
fileDoc.haschildren = true;
|
||||||
fileDoc.pcbytes = cbuf;
|
fileDoc.mimetype = mimetype;
|
||||||
|
fileDoc.url = cstr_fileu + fn;
|
||||||
|
if (m_havelocalfields)
|
||||||
|
setlocalfields(localfields, fileDoc);
|
||||||
|
char cbuf[100];
|
||||||
|
sprintf(cbuf, "%lld", (long long)stp->st_size);
|
||||||
|
fileDoc.pcbytes = cbuf;
|
||||||
|
}
|
||||||
|
|
||||||
fileDoc.sig = sig;
|
fileDoc.sig = sig;
|
||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
|
|||||||
@ -132,6 +132,10 @@ class FsIndexer : public FsTreeWalkerCB {
|
|||||||
string m_slocalfields;
|
string m_slocalfields;
|
||||||
map<string, string> m_localfields;
|
map<string, string> m_localfields;
|
||||||
|
|
||||||
|
// Activate detection of xattr-only document updates. Experimental, so
|
||||||
|
// needs a config option
|
||||||
|
bool m_detectxattronly;
|
||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
friend void *FsIndexerDbUpdWorker(void*);
|
friend void *FsIndexerDbUpdWorker(void*);
|
||||||
friend void *FsIndexerInternfileWorker(void*);
|
friend void *FsIndexerInternfileWorker(void*);
|
||||||
|
|||||||
@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
|
|||||||
code &= ~(IN_ISDIR|IN_ONESHOT);
|
code &= ~(IN_ISDIR|IN_ONESHOT);
|
||||||
switch (code) {
|
switch (code) {
|
||||||
case IN_ACCESS: return "IN_ACCESS";
|
case IN_ACCESS: return "IN_ACCESS";
|
||||||
|
case IN_MODIFY: return "IN_MODIFY";
|
||||||
case IN_ATTRIB: return "IN_ATTRIB";
|
case IN_ATTRIB: return "IN_ATTRIB";
|
||||||
case IN_CLOSE: return "IN_CLOSE";
|
|
||||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
|
||||||
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
||||||
|
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||||
|
case IN_CLOSE: return "IN_CLOSE";
|
||||||
|
case IN_OPEN: return "IN_OPEN";
|
||||||
|
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||||
|
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||||
|
case IN_MOVE: return "IN_MOVE";
|
||||||
case IN_CREATE: return "IN_CREATE";
|
case IN_CREATE: return "IN_CREATE";
|
||||||
case IN_DELETE: return "IN_DELETE";
|
case IN_DELETE: return "IN_DELETE";
|
||||||
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
||||||
case IN_IGNORED: return "IN_IGNORED";
|
|
||||||
case IN_MODIFY: return "IN_MODIFY";
|
|
||||||
case IN_MOVE: return "IN_MOVE";
|
|
||||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
|
||||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
|
||||||
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
||||||
case IN_OPEN: return "IN_OPEN";
|
|
||||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
|
||||||
case IN_UNMOUNT: return "IN_UNMOUNT";
|
case IN_UNMOUNT: return "IN_UNMOUNT";
|
||||||
|
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||||
|
case IN_IGNORED: return "IN_IGNORED";
|
||||||
default: {
|
default: {
|
||||||
static char msg[50];
|
static char msg[50];
|
||||||
sprintf(msg, "Unknown event 0x%x", code);
|
sprintf(msg, "Unknown event 0x%x", code);
|
||||||
@ -600,10 +600,10 @@ bool RclIntf::addWatch(const string& path, bool)
|
|||||||
uint32_t mask = IN_MODIFY | IN_CREATE
|
uint32_t mask = IN_MODIFY | IN_CREATE
|
||||||
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
||||||
#ifdef RCL_USE_XATTR
|
#ifdef RCL_USE_XATTR
|
||||||
// It seems that IN_ATTRIB is not needed to receive extattr
|
// IN_ATTRIB used to be not needed to receive extattr
|
||||||
// modification events, which is a bit weird because only ctime is
|
// modification events, which was a bit weird because only ctime is
|
||||||
// set.
|
// set, and now it is...
|
||||||
// | IN_ATTRIB
|
| IN_ATTRIB
|
||||||
#endif
|
#endif
|
||||||
#ifdef IN_DONT_FOLLOW
|
#ifdef IN_DONT_FOLLOW
|
||||||
| IN_DONT_FOLLOW
|
| IN_DONT_FOLLOW
|
||||||
@ -698,8 +698,8 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
|
|||||||
eraseWatchSubTree(m_idtopath, ev.m_path);
|
eraseWatchSubTree(m_idtopath, ev.m_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
// IN_ATTRIB apparently not needed, see comment above
|
// IN_ATTRIB used to be not needed, but now it is
|
||||||
if (evp->mask & (IN_MODIFY)) {
|
if (evp->mask & (IN_MODIFY|IN_ATTRIB)) {
|
||||||
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
||||||
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
||||||
ev.m_etyp = RclMonEvent::RCLEVT_DELETE;
|
ev.m_etyp = RclMonEvent::RCLEVT_DELETE;
|
||||||
|
|||||||
@ -263,6 +263,110 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clear term from document if its frequency is 0. This should
|
||||||
|
// probably be done by Xapian when the freq goes to 0 when removing a
|
||||||
|
// posting, but we have to do it ourselves
|
||||||
|
bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
|
||||||
|
{
|
||||||
|
LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
|
||||||
|
|
||||||
|
// Find the term
|
||||||
|
Xapian::TermIterator xit;
|
||||||
|
XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
|
||||||
|
xrdb, m_rcldb->m_reason);
|
||||||
|
if (!m_rcldb->m_reason.empty()) {
|
||||||
|
LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n",
|
||||||
|
term.c_str(), m_rcldb->m_reason.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (xit == xdoc.termlist_end() || term.compare(*xit)) {
|
||||||
|
LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n",
|
||||||
|
term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear the term if its frequency is 0
|
||||||
|
if (xit.get_wdf() == 0) {
|
||||||
|
LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
|
||||||
|
XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
|
||||||
|
if (!m_rcldb->m_reason.empty()) {
|
||||||
|
LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n",
|
||||||
|
term.c_str(), m_rcldb->m_reason.c_str()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Holder for term + pos
|
||||||
|
struct DocPosting {
|
||||||
|
DocPosting(string t, Xapian::termpos ps)
|
||||||
|
: term(t), pos(ps) {}
|
||||||
|
string term;
|
||||||
|
Xapian::termpos pos;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Clear all terms for given field for given document.
|
||||||
|
// The terms to be cleared are all those with the appropriate
|
||||||
|
// prefix. We also remove the postings for the unprefixed terms (that
|
||||||
|
// is, we undo what we did when indexing).
|
||||||
|
bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
|
||||||
|
Xapian::termcount wdfdec)
|
||||||
|
{
|
||||||
|
LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
|
||||||
|
pfx.c_str(), unsigned(xdoc.get_docid())));
|
||||||
|
|
||||||
|
vector<DocPosting> eraselist;
|
||||||
|
|
||||||
|
string wrapd = wrap_prefix(pfx);
|
||||||
|
|
||||||
|
m_rcldb->m_reason.clear();
|
||||||
|
for (int tries = 0; tries < 2; tries++) {
|
||||||
|
try {
|
||||||
|
Xapian::TermIterator xit;
|
||||||
|
xit = xdoc.termlist_begin();
|
||||||
|
xit.skip_to(wrapd);
|
||||||
|
while (xit != xdoc.termlist_end() &&
|
||||||
|
!(*xit).compare(0, wrapd.size(), wrapd)) {
|
||||||
|
LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str()));
|
||||||
|
Xapian::PositionIterator posit;
|
||||||
|
for (posit = xit.positionlist_begin();
|
||||||
|
posit != xit.positionlist_end(); posit++) {
|
||||||
|
eraselist.push_back(DocPosting(*xit, *posit));
|
||||||
|
eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
|
||||||
|
}
|
||||||
|
xit++;
|
||||||
|
}
|
||||||
|
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||||
|
m_rcldb->m_reason = e.get_msg();
|
||||||
|
xrdb.reopen();
|
||||||
|
continue;
|
||||||
|
} XCATCHERROR(m_rcldb->m_reason);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!m_rcldb->m_reason.empty()) {
|
||||||
|
LOGERR(("Db::clearField: failed building erase list: %s\n",
|
||||||
|
m_rcldb->m_reason.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now remove the found positions, and the terms if the wdf is 0
|
||||||
|
for (vector<DocPosting>::const_iterator it = eraselist.begin();
|
||||||
|
it != eraselist.end(); it++) {
|
||||||
|
LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n",
|
||||||
|
it->term.c_str(), int(it->pos)));
|
||||||
|
XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);,
|
||||||
|
xwdb,m_rcldb->m_reason);
|
||||||
|
if (!m_rcldb->m_reason.empty()) {
|
||||||
|
// Not that this normally fails for non-prefixed XXST and
|
||||||
|
// ND, don't make a fuss
|
||||||
|
LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n",
|
||||||
|
it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str()));
|
||||||
|
}
|
||||||
|
clearDocTermIfWdf0(xdoc, it->term);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Check if doc given by udi is indexed by term
|
// Check if doc given by udi is indexed by term
|
||||||
bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
|
bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
|
||||||
{
|
{
|
||||||
@ -460,11 +564,7 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
|||||||
{
|
{
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
Chrono chron;
|
Chrono chron;
|
||||||
// In the case where there is a separate (single) db update
|
PTMutexLocker lock(m_mutex);
|
||||||
// thread, we only need to protect the update map update below
|
|
||||||
// (against interaction with threads calling needUpdate()). Else,
|
|
||||||
// all threads from above need to synchronize here
|
|
||||||
PTMutexLocker lock(m_mutex, m_havewriteq);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
||||||
@ -491,11 +591,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
|||||||
try {
|
try {
|
||||||
Xapian::docid did =
|
Xapian::docid did =
|
||||||
xwdb.replace_document(uniterm, newdocument);
|
xwdb.replace_document(uniterm, newdocument);
|
||||||
#ifdef IDX_THREADS
|
|
||||||
// Need to protect against interaction with the up-to-date checks
|
|
||||||
// which also update the existence map
|
|
||||||
PTMutexLocker lock(m_mutex, !m_havewriteq);
|
|
||||||
#endif
|
|
||||||
if (did < m_rcldb->updated.size()) {
|
if (did < m_rcldb->updated.size()) {
|
||||||
m_rcldb->updated[did] = true;
|
m_rcldb->updated[did] = true;
|
||||||
LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
|
LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
|
||||||
@ -934,7 +1029,6 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// The splitter breaks text into words and adds postings to the Xapian
|
// The splitter breaks text into words and adds postings to the Xapian
|
||||||
// document. We use a single object to split all of the document
|
// document. We use a single object to split all of the document
|
||||||
// fields and position jumps to separate fields
|
// fields and position jumps to separate fields
|
||||||
@ -1165,276 +1259,287 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
TextSplitDb splitter(newdocument, nxt);
|
TextSplitDb splitter(newdocument, nxt);
|
||||||
tpidx.setTSD(&splitter);
|
tpidx.setTSD(&splitter);
|
||||||
|
|
||||||
// If the ipath is like a path, index the last element. This is
|
|
||||||
// for compound documents like zip and chm for which the filter
|
|
||||||
// uses the file path as ipath.
|
|
||||||
if (!doc.ipath.empty() &&
|
|
||||||
doc.ipath.find_first_not_of("0123456789") != string::npos) {
|
|
||||||
string utf8ipathlast;
|
|
||||||
// There is no way in hell we could have an idea of the
|
|
||||||
// charset here, so let's hope it's ascii or utf-8. We call
|
|
||||||
// transcode to strip the bad chars and pray
|
|
||||||
if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
|
|
||||||
"UTF-8", "UTF-8")) {
|
|
||||||
splitter.text_to_words(utf8ipathlast);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Split and index the path from the url for path-based filtering
|
|
||||||
{
|
|
||||||
string path = url_gpath(doc.url);
|
|
||||||
vector<string> vpath;
|
|
||||||
stringToTokens(path, vpath, "/");
|
|
||||||
// If vpath is not /, the last elt is the file/dir name, not a
|
|
||||||
// part of the path.
|
|
||||||
if (vpath.size())
|
|
||||||
vpath.resize(vpath.size()-1);
|
|
||||||
splitter.curpos = 0;
|
|
||||||
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
|
||||||
splitter.basepos + splitter.curpos++);
|
|
||||||
for (vector<string>::iterator it = vpath.begin();
|
|
||||||
it != vpath.end(); it++){
|
|
||||||
if (it->length() > 230) {
|
|
||||||
// Just truncate it. May still be useful because of wildcards
|
|
||||||
*it = it->substr(0, 230);
|
|
||||||
}
|
|
||||||
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
|
||||||
splitter.basepos + splitter.curpos++);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Index textual metadata. These are all indexed as text with
|
|
||||||
// positions, as we may want to do phrase searches with them (this
|
|
||||||
// makes no sense for keywords by the way).
|
|
||||||
//
|
|
||||||
// The order has no importance, and we set a position gap of 100
|
|
||||||
// between fields to avoid false proximity matches.
|
|
||||||
map<string, string>::iterator meta_it;
|
|
||||||
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
|
||||||
if (!meta_it->second.empty()) {
|
|
||||||
const FieldTraits *ftp;
|
|
||||||
// We don't test for an empty prefix here. Some fields are part
|
|
||||||
// of the internal conf with an empty prefix (ie: abstract).
|
|
||||||
if (!fieldToTraits(meta_it->first, &ftp)) {
|
|
||||||
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
|
|
||||||
meta_it->first.c_str()));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
|
|
||||||
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
|
|
||||||
meta_it->second.c_str()));
|
|
||||||
splitter.setprefix(ftp->pfx);
|
|
||||||
splitter.setwdfinc(ftp->wdfinc);
|
|
||||||
if (!splitter.text_to_words(meta_it->second))
|
|
||||||
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
|
||||||
meta_it->first.c_str()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
splitter.setprefix(string());
|
|
||||||
splitter.setwdfinc(1);
|
|
||||||
|
|
||||||
if (splitter.curpos < baseTextPosition)
|
|
||||||
splitter.basepos = baseTextPosition;
|
|
||||||
|
|
||||||
// Split and index body text
|
|
||||||
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
|
|
||||||
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
|
||||||
splitter.resetStats();
|
|
||||||
#endif
|
|
||||||
if (!splitter.text_to_words(doc.text))
|
|
||||||
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
|
||||||
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
|
||||||
// Reject bad data. unrecognized base64 text is characterized by
|
|
||||||
// high avg word length and high variation (because there are
|
|
||||||
// word-splitters like +/ inside the data).
|
|
||||||
TextSplit::Stats::Values v = splitter.getStats();
|
|
||||||
// v.avglen > 15 && v.sigma > 12
|
|
||||||
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
|
|
||||||
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
|
|
||||||
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
|
|
||||||
v.count, v.avglen, v.sigma, doc.url.c_str(),
|
|
||||||
doc.ipath.c_str(), doc.text.c_str()));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
////// Special terms for other metadata. No positions for these.
|
|
||||||
// Mime type
|
|
||||||
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
|
||||||
|
|
||||||
// Simple file name indexed unsplit for specific "file name"
|
|
||||||
// searches. This is not the same as a filename: clause inside the
|
|
||||||
// query language.
|
|
||||||
// We also add a term for the filename extension if any.
|
|
||||||
string utf8fn;
|
|
||||||
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
|
||||||
string fn;
|
|
||||||
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
|
||||||
// We should truncate after extracting the extension, but this is
|
|
||||||
// a pathological case anyway
|
|
||||||
if (fn.size() > 230)
|
|
||||||
utf8truncate(fn, 230);
|
|
||||||
string::size_type pos = fn.rfind('.');
|
|
||||||
if (pos != string::npos && pos != fn.length() - 1) {
|
|
||||||
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
|
|
||||||
fn.substr(pos + 1));
|
|
||||||
}
|
|
||||||
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Udi unique term: this is used for file existence/uptodate
|
// Udi unique term: this is used for file existence/uptodate
|
||||||
// checks, and unique id for the replace_document() call.
|
// checks, and unique id for the replace_document() call.
|
||||||
string uniterm = make_uniterm(udi);
|
string uniterm = make_uniterm(udi);
|
||||||
newdocument.add_boolean_term(uniterm);
|
|
||||||
// Parent term. This is used to find all descendents, mostly to delete them
|
|
||||||
// when the parent goes away
|
|
||||||
if (!parent_udi.empty()) {
|
|
||||||
newdocument.add_boolean_term(make_parentterm(parent_udi));
|
|
||||||
}
|
|
||||||
// Dates etc.
|
|
||||||
time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
|
||||||
doc.dmtime.c_str());
|
|
||||||
struct tm *tm = localtime(&mtime);
|
|
||||||
char buf[9];
|
|
||||||
snprintf(buf, 9, "%04d%02d%02d",
|
|
||||||
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
|
||||||
// Date (YYYYMMDD)
|
|
||||||
newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
|
|
||||||
// Month (YYYYMM)
|
|
||||||
buf[6] = '\0';
|
|
||||||
newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
|
||||||
// Year (YYYY)
|
|
||||||
buf[4] = '\0';
|
|
||||||
newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
|
|
||||||
|
|
||||||
|
if (doc.onlyxattr) {
|
||||||
//////////////////////////////////////////////////////////////////
|
// Only updating an existing doc with new extended attributes
|
||||||
// Document data record. omindex has the following nl separated fields:
|
// data. Need to read the old doc and its data record
|
||||||
// - url
|
// first. This is so different from the normal processing that
|
||||||
// - sample
|
// it uses a fully separate code path (with some duplication
|
||||||
// - caption (title limited to 100 chars)
|
// unfortunately)
|
||||||
// - mime type
|
if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
|
||||||
//
|
return false;
|
||||||
// The title, author, abstract and keywords fields are special,
|
|
||||||
// they always get stored in the document data
|
|
||||||
// record. Configurable other fields can be, too.
|
|
||||||
//
|
|
||||||
// We truncate stored fields abstract, title and keywords to
|
|
||||||
// reasonable lengths and suppress newlines (so that the data
|
|
||||||
// record can keep a simple syntax)
|
|
||||||
|
|
||||||
string record;
|
|
||||||
RECORD_APPEND(record, Doc::keyurl, doc.url);
|
|
||||||
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
|
|
||||||
// We left-zero-pad the times so that they are lexico-sortable
|
|
||||||
leftzeropad(doc.fmtime, 11);
|
|
||||||
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
|
|
||||||
if (!doc.dmtime.empty()) {
|
|
||||||
leftzeropad(doc.dmtime, 11);
|
|
||||||
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
|
|
||||||
}
|
|
||||||
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
|
|
||||||
|
|
||||||
if (doc.fbytes.empty())
|
|
||||||
doc.fbytes = doc.pcbytes;
|
|
||||||
|
|
||||||
if (!doc.fbytes.empty()) {
|
|
||||||
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
|
|
||||||
leftzeropad(doc.fbytes, 12);
|
|
||||||
newdocument.add_value(VALUE_SIZE, doc.fbytes);
|
|
||||||
}
|
|
||||||
if (doc.haschildren) {
|
|
||||||
newdocument.add_boolean_term(has_children_term);
|
|
||||||
}
|
|
||||||
if (!doc.pcbytes.empty())
|
|
||||||
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
|
|
||||||
char sizebuf[30];
|
|
||||||
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
|
||||||
RECORD_APPEND(record, Doc::keyds, sizebuf);
|
|
||||||
|
|
||||||
// Note that we add the signature both as a value and in the data record
|
|
||||||
if (!doc.sig.empty()) {
|
|
||||||
RECORD_APPEND(record, Doc::keysig, doc.sig);
|
|
||||||
newdocument.add_value(VALUE_SIG, doc.sig);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!doc.ipath.empty())
|
|
||||||
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
|
|
||||||
|
|
||||||
doc.meta[Doc::keytt] =
|
|
||||||
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
|
|
||||||
if (!doc.meta[Doc::keytt].empty())
|
|
||||||
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
|
|
||||||
|
|
||||||
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
|
||||||
doc.meta[Doc::keykw] =
|
|
||||||
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
|
||||||
// No need to explicitly append the keywords, this will be done by
|
|
||||||
// the "stored" loop
|
|
||||||
|
|
||||||
// If abstract is empty, we make up one with the beginning of the
|
|
||||||
// document. This is then not indexed, but part of the doc data so
|
|
||||||
// that we can return it to a query without having to decode the
|
|
||||||
// original file.
|
|
||||||
bool syntabs = false;
|
|
||||||
// Note that the map accesses by operator[] create empty entries if they
|
|
||||||
// don't exist yet.
|
|
||||||
trimstring(doc.meta[Doc::keyabs], " \t\r\n");
|
|
||||||
if (doc.meta[Doc::keyabs].empty()) {
|
|
||||||
syntabs = true;
|
|
||||||
if (!doc.text.empty())
|
|
||||||
doc.meta[Doc::keyabs] = cstr_syntAbs +
|
|
||||||
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
|
|
||||||
} else {
|
} else {
|
||||||
doc.meta[Doc::keyabs] =
|
|
||||||
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
|
||||||
cstr_nc);
|
|
||||||
}
|
|
||||||
|
|
||||||
const set<string>& stored = m_config->getStoredFields();
|
// If the ipath is like a path, index the last element. This is
|
||||||
for (set<string>::const_iterator it = stored.begin();
|
// for compound documents like zip and chm for which the filter
|
||||||
it != stored.end(); it++) {
|
// uses the file path as ipath.
|
||||||
string nm = m_config->fieldCanon(*it);
|
if (!doc.ipath.empty() &&
|
||||||
if (!doc.meta[nm].empty()) {
|
doc.ipath.find_first_not_of("0123456789") != string::npos) {
|
||||||
string value =
|
string utf8ipathlast;
|
||||||
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
|
// There is no way in hell we could have an idea of the
|
||||||
RECORD_APPEND(record, nm, value);
|
// charset here, so let's hope it's ascii or utf-8. We call
|
||||||
|
// transcode to strip the bad chars and pray
|
||||||
|
if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
|
||||||
|
"UTF-8", "UTF-8")) {
|
||||||
|
splitter.text_to_words(utf8ipathlast);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// If empty pages (multiple break at same pos) were recorded, save
|
// Split and index the path from the url for path-based filtering
|
||||||
// them (this is because we have no way to record them in the
|
{
|
||||||
// Xapian list
|
string path = url_gpath(doc.url);
|
||||||
if (!tpidx.m_pageincrvec.empty()) {
|
vector<string> vpath;
|
||||||
ostringstream multibreaks;
|
stringToTokens(path, vpath, "/");
|
||||||
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
|
// If vpath is not /, the last elt is the file/dir name, not a
|
||||||
if (i != 0)
|
// part of the path.
|
||||||
multibreaks << ",";
|
if (vpath.size())
|
||||||
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
|
vpath.resize(vpath.size()-1);
|
||||||
tpidx.m_pageincrvec[i].second;
|
splitter.curpos = 0;
|
||||||
|
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
||||||
|
splitter.basepos + splitter.curpos++);
|
||||||
|
for (vector<string>::iterator it = vpath.begin();
|
||||||
|
it != vpath.end(); it++){
|
||||||
|
if (it->length() > 230) {
|
||||||
|
// Just truncate it. May still be useful because of wildcards
|
||||||
|
*it = it->substr(0, 230);
|
||||||
|
}
|
||||||
|
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
||||||
|
splitter.basepos + splitter.curpos++);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
|
|
||||||
|
// Index textual metadata. These are all indexed as text with
|
||||||
|
// positions, as we may want to do phrase searches with them (this
|
||||||
|
// makes no sense for keywords by the way).
|
||||||
|
//
|
||||||
|
// The order has no importance, and we set a position gap of 100
|
||||||
|
// between fields to avoid false proximity matches.
|
||||||
|
map<string, string>::iterator meta_it;
|
||||||
|
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
||||||
|
if (!meta_it->second.empty()) {
|
||||||
|
const FieldTraits *ftp;
|
||||||
|
// We don't test for an empty prefix here. Some fields are part
|
||||||
|
// of the internal conf with an empty prefix (ie: abstract).
|
||||||
|
if (!fieldToTraits(meta_it->first, &ftp)) {
|
||||||
|
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
|
||||||
|
meta_it->first.c_str()));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
|
||||||
|
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
|
||||||
|
meta_it->second.c_str()));
|
||||||
|
splitter.setprefix(ftp->pfx);
|
||||||
|
splitter.setwdfinc(ftp->wdfinc);
|
||||||
|
if (!splitter.text_to_words(meta_it->second))
|
||||||
|
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
||||||
|
meta_it->first.c_str()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
splitter.setprefix(string());
|
||||||
|
splitter.setwdfinc(1);
|
||||||
|
|
||||||
|
if (splitter.curpos < baseTextPosition)
|
||||||
|
splitter.basepos = baseTextPosition;
|
||||||
|
|
||||||
|
// Split and index body text
|
||||||
|
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
|
||||||
|
|
||||||
|
#ifdef TEXTSPLIT_STATS
|
||||||
|
splitter.resetStats();
|
||||||
|
#endif
|
||||||
|
if (!splitter.text_to_words(doc.text))
|
||||||
|
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
||||||
|
|
||||||
|
#ifdef TEXTSPLIT_STATS
|
||||||
|
// Reject bad data. unrecognized base64 text is characterized by
|
||||||
|
// high avg word length and high variation (because there are
|
||||||
|
// word-splitters like +/ inside the data).
|
||||||
|
TextSplit::Stats::Values v = splitter.getStats();
|
||||||
|
// v.avglen > 15 && v.sigma > 12
|
||||||
|
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
|
||||||
|
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
|
||||||
|
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
|
||||||
|
v.count, v.avglen, v.sigma, doc.url.c_str(),
|
||||||
|
doc.ipath.c_str(), doc.text.c_str()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////// Special terms for other metadata. No positions for these.
|
||||||
|
// Mime type
|
||||||
|
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
||||||
|
|
||||||
|
// Simple file name indexed unsplit for specific "file name"
|
||||||
|
// searches. This is not the same as a filename: clause inside the
|
||||||
|
// query language.
|
||||||
|
// We also add a term for the filename extension if any.
|
||||||
|
string utf8fn;
|
||||||
|
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
||||||
|
string fn;
|
||||||
|
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
|
// We should truncate after extracting the extension, but this is
|
||||||
|
// a pathological case anyway
|
||||||
|
if (fn.size() > 230)
|
||||||
|
utf8truncate(fn, 230);
|
||||||
|
string::size_type pos = fn.rfind('.');
|
||||||
|
if (pos != string::npos && pos != fn.length() - 1) {
|
||||||
|
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
|
||||||
|
fn.substr(pos + 1));
|
||||||
|
}
|
||||||
|
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
newdocument.add_boolean_term(uniterm);
|
||||||
|
// Parent term. This is used to find all descendents, mostly
|
||||||
|
// to delete them when the parent goes away
|
||||||
|
if (!parent_udi.empty()) {
|
||||||
|
newdocument.add_boolean_term(make_parentterm(parent_udi));
|
||||||
|
}
|
||||||
|
// Dates etc.
|
||||||
|
time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||||
|
doc.dmtime.c_str());
|
||||||
|
struct tm *tm = localtime(&mtime);
|
||||||
|
char buf[9];
|
||||||
|
snprintf(buf, 9, "%04d%02d%02d",
|
||||||
|
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||||
|
// Date (YYYYMMDD)
|
||||||
|
newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
|
||||||
|
// Month (YYYYMM)
|
||||||
|
buf[6] = '\0';
|
||||||
|
newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
||||||
|
// Year (YYYY)
|
||||||
|
buf[4] = '\0';
|
||||||
|
newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
// Document data record. omindex has the following nl separated fields:
|
||||||
|
// - url
|
||||||
|
// - sample
|
||||||
|
// - caption (title limited to 100 chars)
|
||||||
|
// - mime type
|
||||||
|
//
|
||||||
|
// The title, author, abstract and keywords fields are special,
|
||||||
|
// they always get stored in the document data
|
||||||
|
// record. Configurable other fields can be, too.
|
||||||
|
//
|
||||||
|
// We truncate stored fields abstract, title and keywords to
|
||||||
|
// reasonable lengths and suppress newlines (so that the data
|
||||||
|
// record can keep a simple syntax)
|
||||||
|
|
||||||
|
string record;
|
||||||
|
RECORD_APPEND(record, Doc::keyurl, doc.url);
|
||||||
|
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
|
||||||
|
// We left-zero-pad the times so that they are lexico-sortable
|
||||||
|
leftzeropad(doc.fmtime, 11);
|
||||||
|
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
|
||||||
|
if (!doc.dmtime.empty()) {
|
||||||
|
leftzeropad(doc.dmtime, 11);
|
||||||
|
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
|
||||||
|
}
|
||||||
|
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
|
||||||
|
|
||||||
|
if (doc.fbytes.empty())
|
||||||
|
doc.fbytes = doc.pcbytes;
|
||||||
|
|
||||||
|
if (!doc.fbytes.empty()) {
|
||||||
|
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
|
||||||
|
leftzeropad(doc.fbytes, 12);
|
||||||
|
newdocument.add_value(VALUE_SIZE, doc.fbytes);
|
||||||
|
}
|
||||||
|
if (doc.haschildren) {
|
||||||
|
newdocument.add_boolean_term(has_children_term);
|
||||||
|
}
|
||||||
|
if (!doc.pcbytes.empty())
|
||||||
|
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
|
||||||
|
char sizebuf[30];
|
||||||
|
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||||
|
RECORD_APPEND(record, Doc::keyds, sizebuf);
|
||||||
|
|
||||||
|
// Note that we add the signature both as a value and in the data record
|
||||||
|
if (!doc.sig.empty()) {
|
||||||
|
RECORD_APPEND(record, Doc::keysig, doc.sig);
|
||||||
|
newdocument.add_value(VALUE_SIG, doc.sig);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!doc.ipath.empty())
|
||||||
|
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
|
||||||
|
|
||||||
|
doc.meta[Doc::keytt] =
|
||||||
|
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
|
||||||
|
if (!doc.meta[Doc::keytt].empty())
|
||||||
|
RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
|
||||||
|
|
||||||
|
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
||||||
|
doc.meta[Doc::keykw] =
|
||||||
|
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
||||||
|
// No need to explicitly append the keywords, this will be done by
|
||||||
|
// the "stored" loop
|
||||||
|
|
||||||
|
// If abstract is empty, we make up one with the beginning of the
|
||||||
|
// document. This is then not indexed, but part of the doc data so
|
||||||
|
// that we can return it to a query without having to decode the
|
||||||
|
// original file.
|
||||||
|
bool syntabs = false;
|
||||||
|
// Note that the map accesses by operator[] create empty entries if they
|
||||||
|
// don't exist yet.
|
||||||
|
trimstring(doc.meta[Doc::keyabs], " \t\r\n");
|
||||||
|
if (doc.meta[Doc::keyabs].empty()) {
|
||||||
|
syntabs = true;
|
||||||
|
if (!doc.text.empty())
|
||||||
|
doc.meta[Doc::keyabs] = cstr_syntAbs +
|
||||||
|
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
|
||||||
|
} else {
|
||||||
|
doc.meta[Doc::keyabs] =
|
||||||
|
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
||||||
|
cstr_nc);
|
||||||
|
}
|
||||||
|
|
||||||
|
const set<string>& stored = m_config->getStoredFields();
|
||||||
|
for (set<string>::const_iterator it = stored.begin();
|
||||||
|
it != stored.end(); it++) {
|
||||||
|
string nm = m_config->fieldCanon(*it);
|
||||||
|
if (!doc.meta[nm].empty()) {
|
||||||
|
string value =
|
||||||
|
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
|
||||||
|
RECORD_APPEND(record, nm, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If empty pages (multiple break at same pos) were recorded, save
|
||||||
|
// them (this is because we have no way to record them in the
|
||||||
|
// Xapian list
|
||||||
|
if (!tpidx.m_pageincrvec.empty()) {
|
||||||
|
ostringstream multibreaks;
|
||||||
|
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
|
||||||
|
if (i != 0)
|
||||||
|
multibreaks << ",";
|
||||||
|
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
|
||||||
|
tpidx.m_pageincrvec[i].second;
|
||||||
|
}
|
||||||
|
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the file's md5 was computed, add value and term.
|
||||||
|
// The value is optionally used for query result duplicate elimination,
|
||||||
|
// and the term to find the duplicates.
|
||||||
|
// We don't do this for empty docs.
|
||||||
|
const string *md5;
|
||||||
|
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
|
||||||
|
md5->compare(cstr_md5empty)) {
|
||||||
|
string digest;
|
||||||
|
MD5HexScan(*md5, digest);
|
||||||
|
newdocument.add_value(VALUE_MD5, digest);
|
||||||
|
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
|
||||||
|
}
|
||||||
|
|
||||||
|
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
|
||||||
|
newdocument.set_data(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the file's md5 was computed, add value and term.
|
|
||||||
// The value is optionally used for query result duplicate elimination,
|
|
||||||
// and the term to find the duplicates.
|
|
||||||
// We don't do this for empty docs.
|
|
||||||
const string *md5;
|
|
||||||
if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
|
|
||||||
md5->compare(cstr_md5empty)) {
|
|
||||||
string digest;
|
|
||||||
MD5HexScan(*md5, digest);
|
|
||||||
newdocument.add_value(VALUE_MD5, digest);
|
|
||||||
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
|
|
||||||
newdocument.set_data(record);
|
|
||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
if (m_ndb->m_havewriteq) {
|
if (m_ndb->m_havewriteq) {
|
||||||
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
||||||
@ -1452,6 +1557,81 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
doc.text.length());
|
doc.text.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Update an existing Xapian document when only metadata fields changed
 * (the incoming Doc carries no new body text).
 *
 * The stored document for @p udi is read into @p xdoc. For every metadata
 * field in @p doc which has a non-empty index prefix, the terms previously
 * indexed for that prefix are cleared and the new value is split and
 * indexed through @p splitter. The signature value is then replaced, and
 * the data record is rebuilt from the old one, with the "stored" fields
 * present in @p doc and the signature overwritten.
 *
 * @param splitter text splitter used to index the new field values.
 * @param udi unique document identifier of the document to update.
 * @param doc incoming document; only its metadata and signature are used.
 * @param xdoc output: the existing Xapian document, updated in place.
 * @return false if the existing document or its data record cannot be
 *   read, or if the data record cannot be parsed; true otherwise.
 */
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
                                    Doc &doc, Xapian::Document& xdoc)
{
    LOGDEB0(("Db::docToXdocXattrOnly\n"));
    PTMutexLocker lock(m_mutex);

    // Read existing document and its data record
    if (getDoc(udi, 0, xdoc) == 0) {
        LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
        return false;
    }
    string data;
    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
        return false;
    }

    // Clear the term lists for the incoming fields and index the new values
    for (map<string, string>::iterator meta_it = doc.meta.begin();
         meta_it != doc.meta.end(); ++meta_it) {
        const FieldTraits *ftp;
        // Unlike the full indexing path, fields with an empty prefix are
        // skipped here: there would be no way to tell their terms apart
        // from the rest of the document when clearing.
        if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) ||
            ftp->pfx.empty()) {
            LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
                     meta_it->first.c_str()));
            continue;
        }
        // Clear the previous terms for the field
        clearField(xdoc, ftp->pfx, ftp->wdfinc);
        LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n",
                 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
                 meta_it->second.c_str()));
        splitter->setprefix(ftp->pfx);
        splitter->setwdfinc(ftp->wdfinc);
        if (!splitter->text_to_words(meta_it->second))
            LOGDEB(("Db::xattrOnly: split failed for %s\n",
                    meta_it->first.c_str()));
    }
    xdoc.add_value(VALUE_SIG, doc.sig);

    // Parse current data record into a dict for ease of processing
    ConfSimple datadic(data);
    if (!datadic.ok()) {
        LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
        return false;
    }

    // For each "stored" field, check if set in doc metadata and
    // update the value if it is
    const set<string>& stored = m_rcldb->m_config->getStoredFields();
    for (set<string>::const_iterator it = stored.begin();
         it != stored.end(); ++it) {
        string nm = m_rcldb->m_config->fieldCanon(*it);
        if (doc.getmeta(nm, 0)) {
            string value =
                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
            datadic.set(nm, value, "");
        }
    }

    // Recreate the record. We want to do this with the local RECORD_APPEND
    // method for consistency in format, instead of using ConfSimple print
    vector<string> names = datadic.getNames("");
    data.clear();
    for (vector<string>::const_iterator it = names.begin();
         it != names.end(); ++it) {
        // The signature of the old record is stale: do not copy it, the
        // new one is appended below. Copying it would leave two entries
        // for the same key in the record.
        if (*it == Doc::keysig)
            continue;
        string value;
        datadic.get(*it, value, "");
        RECORD_APPEND(data, *it, value);
    }
    RECORD_APPEND(data, Doc::keysig, doc.sig);
    xdoc.set_data(data);
    return true;
}
|
||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
void Db::waitUpdIdle()
|
void Db::waitUpdIdle()
|
||||||
{
|
{
|
||||||
|
|||||||
@ -237,6 +237,10 @@ class Db {
|
|||||||
*/
|
*/
|
||||||
bool needUpdate(const string &udi, const string& sig, bool *existed=0);
|
bool needUpdate(const string &udi, const string& sig, bool *existed=0);
|
||||||
|
|
||||||
|
/** Tell whether a systematic reindex is in progress: true when
|
||||||
|
 * o_inPlaceReset is set or m_mode is DbTrunc. This complements the
 * value returned by needUpdate(). */
|
||||||
|
bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
|
||||||
|
|
||||||
/** Add or update document identified by unique identifier.
|
/** Add or update document identified by unique identifier.
|
||||||
* @param config Config object to use. Can be the same as the member config
|
* @param config Config object to use. Can be the same as the member config
|
||||||
* or a clone, to avoid sharing when called in multithread context.
|
* or a clone, to avoid sharing when called in multithread context.
|
||||||
|
|||||||
@ -66,6 +66,8 @@ public:
|
|||||||
};
|
};
|
||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
|
|
||||||
|
class TextSplitDb;
|
||||||
|
|
||||||
// A class for data and methods that would have to expose
|
// A class for data and methods that would have to expose
|
||||||
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
||||||
// 2 different ones for indexing or query as there is not much in
|
// 2 different ones for indexing or query as there is not much in
|
||||||
@ -141,6 +143,16 @@ class Db::Native {
|
|||||||
/** Check if doc is indexed by term */
|
/** Check if doc is indexed by term */
|
||||||
bool hasTerm(const string& udi, int idxi, const string& term);
|
bool hasTerm(const string& udi, int idxi, const string& term);
|
||||||
|
|
||||||
|
/** Update existing Xapian document for pure extended attrs change */
|
||||||
|
bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
||||||
|
Doc &doc, Xapian::Document& xdoc);
|
||||||
|
/** Remove all terms currently indexed for field defined by idx prefix */
|
||||||
|
bool clearField(Xapian::Document& xdoc, const string& pfx,
|
||||||
|
Xapian::termcount wdfdec);
|
||||||
|
|
||||||
|
/** Check if term wdf is 0 and remove term if so */
|
||||||
|
bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
|
||||||
|
|
||||||
/** Compute list of subdocuments for a given udi. We look for documents
|
/** Compute list of subdocuments for a given udi. We look for documents
|
||||||
* indexed by a parent term matching the udi, the posting list for the
|
* indexed by a parent term matching the udi, the posting list for the
|
||||||
* parentterm(udi) (As suggested by James Aylett)
|
* parentterm(udi) (As suggested by James Aylett)
|
||||||
|
|||||||
@ -131,6 +131,10 @@ class Doc {
|
|||||||
// ipath descendants.
|
// ipath descendants.
|
||||||
bool haschildren;
|
bool haschildren;
|
||||||
|
|
||||||
|
// During indexing: only fields from extended attributes were set, no
|
||||||
|
// doc content. Allows for faster reindexing of existing doc
|
||||||
|
bool onlyxattr;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void erase() {
|
void erase() {
|
||||||
@ -154,10 +158,11 @@ class Doc {
|
|||||||
idxi = 0;
|
idxi = 0;
|
||||||
haspages = false;
|
haspages = false;
|
||||||
haschildren = false;
|
haschildren = false;
|
||||||
|
onlyxattr = false;
|
||||||
}
|
}
|
||||||
Doc()
|
Doc()
|
||||||
: idxi(0), syntabs(false), pc(0), xdocid(0),
|
: idxi(0), syntabs(false), pc(0), xdocid(0),
|
||||||
haspages(false), haschildren(false)
|
haspages(false), haschildren(false), onlyxattr(false)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
/** Get value for named field. If value pointer is 0, just test existence */
|
/** Get value for named field. If value pointer is 0, just test existence */
|
||||||
|
|||||||
@ -13,7 +13,7 @@
|
|||||||
#####################################################
|
#####################################################
|
||||||
# This section defines what prefix the terms inside named fields will be
|
# This section defines what prefix the terms inside named fields will be
|
||||||
# indexed with (in addition to prefix-less indexing for general search)
|
# indexed with (in addition to prefix-less indexing for general search)
|
||||||
# ALL prefixes MUST be all UPPERCASE.
|
# ALL prefixes MUST be all ASCII UPPERCASE (NO DIGITS)
|
||||||
#
|
#
|
||||||
# The field names should be the canonic ones, not the aliases defined in
|
# The field names should be the canonic ones, not the aliases defined in
|
||||||
# the following section. Don't change those which are predefined here,
|
# the following section. Don't change those which are predefined here,
|
||||||
|
|||||||
@ -5,6 +5,7 @@ daemloglevel = 6
|
|||||||
daemlogfilename = /tmp/rclmontrace
|
daemlogfilename = /tmp/rclmontrace
|
||||||
|
|
||||||
indexStripChars = 1
|
indexStripChars = 1
|
||||||
|
detectxattronly = 1
|
||||||
|
|
||||||
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
||||||
|
|
||||||
|
|||||||
@ -1,2 +1,2 @@
|
|||||||
1 results
|
1 results
|
||||||
application/x-fsdirectory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
|
inode/directory [file:///home/dockes/projets/fulltext/testrecoll/emptyUniqueTerm] [emptyUniqueTerm] 4096 bytes
|
||||||
|
|||||||
@ -11,6 +11,37 @@ if test ! x$reroot = x ; then
|
|||||||
rerootResults
|
rerootResults
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Check that a command is available.
#   $1: command name, or a path if it contains a '/'
# Returns 0 if an executable non-directory file was found, else 1.
# On success, the location of the command is left in $iscmdresult.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: no PATH search. Set iscmdresult here too, so
        # that callers never print a value left over from a previous call.
        if test -x "$cmd" && test ! -d "$cmd" ; then
            iscmdresult=$cmd
            return 0
        fi
        return 1 ;;
    *)
        # Split PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps each directory intact even if it contains blanks
        for d in "$@"; do
            # An empty PATH element means the current directory
            d=${d:-.}
            if test -x "$d/$cmd" && test ! -d "$d/$cmd" ; then
                iscmdresult=$d/$cmd
                return 0
            fi
        done
        return 1 ;;
    esac
}
|
||||||
|
|
||||||
|
# Check that all the commands given as arguments are available, printing
# one line for each: either its location or a "not found" message.
# Returns 0 if all were found, 1 if at least one is missing.
checkcmds()
{
    result=0
    for cmd in $*
    do
        # Found: report the location and go on to the next command
        iscmd $cmd && { echo $cmd is $iscmdresult; continue; }
        echo $cmd not found
        result=1
    done
    return $result
}
|
||||||
|
|
||||||
|
checkcmds recollq recollindex pxattr xadump || exit 1
|
||||||
|
|
||||||
makeindex() {
|
makeindex() {
|
||||||
echo "Zeroing Index"
|
echo "Zeroing Index"
|
||||||
rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws
|
rm -rf $RECOLL_CONFDIR/xapiandb $RECOLL_CONFDIR/aspdict.*.rws
|
||||||
|
|||||||
4
tests/xattr/fields
Normal file
4
tests/xattr/fields
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
[prefixes]
|
||||||
|
myattr = XYXATA
|
||||||
|
[stored]
|
||||||
|
myattr =
|
||||||
85
tests/xattr/xattr.sh
Executable file
85
tests/xattr/xattr.sh
Executable file
@ -0,0 +1,85 @@
|
|||||||
|
#!/bin/sh

# Test extended attributes indexing. This should work both with
# "detectxattronly" set or unset in the config, but should be run with
# the variable set, because we test its function by exploiting a bug
# (see comments further)
#
# We use the RECOLL_CONFTOP variable to add our own fields configuration

thisdir=`dirname $0`
topdir=$thisdir/..
. $topdir/shared.sh

initvariables $0

RECOLL_CONFTOP=$thisdir
export RECOLL_CONFTOP

# Echo a command line, then run it, so that the commands appear in the
# output which gets compared to the reference.
xrun()
{
    echo $*
    $*
}

tstfile=${tstdata}/xattrs/tstxattrs.txt
rm -f $tstfile

(
    # Create the file with an extended attribute, index, and query it
    # by content and field
    echo xattruniqueinfile > $tstfile
    xrun pxattr -n myattr -v xattrunique1 $tstfile
    xrun recollindex -Zi $tstfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile
    echo "1 result expected"
    xrun recollq myattr:xattrunique1

    sleep 1

    # Change the value for the field, check that the old value is gone
    # and the new works
    xrun pxattr -n myattr -v xattrunique2 $tstfile
    xrun recollindex -i $tstfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile
    echo "0 result expected:"
    xrun recollq myattr:xattrunique1
    echo "1 result expected:"
    xrun recollq myattr:xattrunique2

    # Change the contents then the xattr. With detectxattronly set,
    # recoll should miss the contents change and index only the
    # xattr. That's a bug but we use it to check that pure xattr
    # update indexing works
    echo xattruniqueinfile1 > $tstfile
    sleep 2
    xrun pxattr -n myattr -v xattrunique3 $tstfile
    xrun recollindex -i $tstfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile
    echo "0 result expected"
    xrun recollq xattruniqueinfile1
    echo "0 result expected:"
    xrun recollq myattr:xattrunique1
    echo "0 result expected:"
    xrun recollq myattr:xattrunique2
    echo "1 result expected:"
    xrun recollq myattr:xattrunique3

    # Reset the index and check that the contents were seen all right
    xrun recollindex -Zi $tstfile
    echo "0 result expected"
    xrun recollq xattruniqueinfile
    echo "1 result expected"
    xrun recollq xattruniqueinfile1
    echo "0 result expected:"
    xrun recollq myattr:xattrunique2
    echo "1 result expected:"
    xrun recollq myattr:xattrunique3

# grep -E rather than egrep: the latter is obsolescent and recent GNU
# grep versions print a warning when it is used
) 2> $mystderr | grep -E -v '^Recoll query: ' > $mystdout

diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
checkresult
|
||||||
57
tests/xattr/xattr.txt
Normal file
57
tests/xattr/xattr.txt
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
pxattr -n myattr -v xattrunique1 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
1 result expected
|
||||||
|
recollq xattruniqueinfile
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||||
|
1 result expected
|
||||||
|
recollq myattr:xattrunique1
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||||
|
pxattr -n myattr -v xattrunique2 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
1 result expected
|
||||||
|
recollq xattruniqueinfile
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||||
|
0 result expected:
|
||||||
|
recollq myattr:xattrunique1
|
||||||
|
0 results
|
||||||
|
1 result expected:
|
||||||
|
recollq myattr:xattrunique2
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||||
|
pxattr -n myattr -v xattrunique3 /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
recollindex -i /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
1 result expected
|
||||||
|
recollq xattruniqueinfile
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||||
|
0 result expected
|
||||||
|
recollq xattruniqueinfile1
|
||||||
|
0 results
|
||||||
|
0 result expected:
|
||||||
|
recollq myattr:xattrunique1
|
||||||
|
0 results
|
||||||
|
0 result expected:
|
||||||
|
recollq myattr:xattrunique2
|
||||||
|
0 results
|
||||||
|
1 result expected:
|
||||||
|
recollq myattr:xattrunique3
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 18 bytes
|
||||||
|
recollindex -Zi /home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt
|
||||||
|
0 result expected
|
||||||
|
recollq xattruniqueinfile
|
||||||
|
0 results
|
||||||
|
1 result expected
|
||||||
|
recollq xattruniqueinfile1
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
|
||||||
|
0 result expected:
|
||||||
|
recollq myattr:xattrunique2
|
||||||
|
0 results
|
||||||
|
1 result expected:
|
||||||
|
recollq myattr:xattrunique3
|
||||||
|
1 results
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/xattrs/tstxattrs.txt] [tstxattrs.txt] 19 bytes
|
||||||
Loading…
x
Reference in New Issue
Block a user