Abstracts: storing raw doc text in user metadata records

This commit is contained in:
Jean-Francois Dockes 2018-01-06 11:38:24 +01:00
parent 57d9ece876
commit 2c76a70c0e
4 changed files with 107 additions and 37 deletions

View File

@ -350,8 +350,9 @@ int Query::Native::abstractFromText(
rawtext = doc.meta["RAWTEXT"];
}
#endif
#ifdef RAWTEXT_IN_VALUE
XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason);
#ifdef RAWTEXT_IN_METADATA
XAPTRY(rawtext = ndb->xrdb.get_metadata(ndb->rawtextMetaKey(docid)),
ndb->xrdb, reason);
if (!reason.empty()) {
LOGERR("abstractFromText: could not get value: " << reason << endl);
return ABSRES_ERROR;
@ -367,7 +368,7 @@ int Query::Native::abstractFromText(
}
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
(defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE))
(defined(RAWTEXT_IN_DATA))
// Tryout the Xapian internal method.
string snippet = xmset.snippet(rawtext);
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");

View File

@ -191,7 +191,11 @@ void *DbUpdWorker(void* vdbp)
case DbUpdTask::AddOrUpdate:
LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm,
tsk->doc, tsk->txtlen);
tsk->doc, tsk->txtlen
#ifdef RAWTEXT_IN_METADATA
, tsk->rawztext
#endif
);
break;
case DbUpdTask::Delete:
LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
@ -585,13 +589,17 @@ int Db::Native::getPageNumberForPosition(const vector<int>& pbreaks, int pos)
// to delete it before returning.
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document *newdocument_ptr,
size_t textlen)
size_t textlen
#ifdef RAWTEXT_IN_METADATA
, const string& rawztext
#endif
)
{
#ifdef IDX_THREADS
Chrono chron;
std::unique_lock<std::mutex> lock(m_mutex);
#endif
std::shared_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
// Check file system full every mbyte of indexed text. It's a bit wasteful
// to do this after having prepared the document, but it needs to be in
@ -614,9 +622,9 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
string ermsg;
// Add db entry or update existing entry:
Xapian::docid did = 0;
try {
Xapian::docid did =
xwdb.replace_document(uniterm, *newdocument_ptr);
did = xwdb.replace_document(uniterm, *newdocument_ptr);
if (did < m_rcldb->updated.size()) {
// This is necessary because only the file-level docs are tested
// by needUpdate(), so the subdocs existence flags are only set
@ -627,7 +635,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
ermsg.erase();
@ -643,6 +650,16 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
}
}
#ifdef RAWTEXT_IN_METADATA
XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
xwdb, m_rcldb->m_reason);
if (!m_rcldb->m_reason.empty()) {
LOGERR("Db::addOrUpdate: set_metadata error: " <<
m_rcldb->m_reason << "\n");
// This only affects snippets, so let's say not fatal
}
#endif
// Test if we're over the flush threshold (limit memory usage):
bool ret = m_rcldb->maybeflush(textlen);
#ifdef IDX_THREADS
@ -682,7 +699,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
}
} else {
LOGDEB("purgeFile: delete docid " << *docid << "\n");
xwdb.delete_document(*docid);
deleteDocument(*docid);
}
vector<Xapian::docid> docids;
subDocs(udi, 0, docids);
@ -705,7 +722,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
if (!orphansOnly || sig != subdocsig) {
LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
xwdb.delete_document(*it);
deleteDocument(*it);
}
}
return true;
@ -1365,6 +1382,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
// Udi unique term: this is used for file existence/uptodate
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(udi);
#if defined(RAWTEXT_IN_METADATA)
string rawztext; // Doc compressed text
#endif
if (doc.onlyxattr) {
// Only updating an existing doc with new extended attributes
@ -1468,13 +1488,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
if (!splitter.text_to_words(doc.text)) {
LOGDEB("Db::addOrUpdate: split failed for main text\n");
} else {
#ifdef RAWTEXT_IN_VALUE
#if defined(RAWTEXT_IN_METADATA)
if (o_index_storedoctext) {
ZLibUtBuf buf;
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
string tt;
tt.assign(buf.getBuf(), buf.getCnt());
newdocument.add_value(VALUE_RAWTEXT, tt);
rawztext.assign(buf.getBuf(), buf.getCnt());
}
#endif
}
@ -1700,7 +1718,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
#ifdef IDX_THREADS
if (m_ndb->m_havewriteq) {
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
newdocument_ptr, doc.text.length());
newdocument_ptr, doc.text.length()
#ifdef RAWTEXT_IN_METADATA
, rawztext
#endif
);
if (!m_ndb->m_wqueue.put(tp)) {
LOGERR("Db::addOrUpdate:Cant queue task\n");
delete newdocument_ptr;
@ -1712,7 +1734,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
#endif
return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
doc.text.length());
doc.text.length()
#ifdef RAWTEXT_IN_METADATA
, rawztext
#endif
);
}
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
@ -2076,7 +2102,7 @@ bool Db::purge()
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
maybeflush(trms * 5);
}
m_ndb->xwdb.delete_document(docid);
m_ndb->deleteDocument(docid);
LOGDEB("Db::purge: deleted document #" << docid << "\n");
} catch (const Xapian::DocNotFoundError &) {
LOGDEB0("Db::purge: document #" << docid << " not found\n");
@ -2137,8 +2163,13 @@ bool Db::purgeFile(const string &udi, bool *existed)
#ifdef IDX_THREADS
if (m_ndb->m_havewriteq) {
string rztxt;
DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
0, (size_t)-1);
0, (size_t)-1,
#if defined(RAWTEXT_IN_METADATA)
rztxt
#endif
);
if (!m_ndb->m_wqueue.put(tp)) {
LOGERR("Db::purgeFile:Cant queue task\n");
return false;
@ -2164,8 +2195,13 @@ bool Db::purgeOrphans(const string &udi)
#ifdef IDX_THREADS
if (m_ndb->m_havewriteq) {
string rztxt;
DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
0, (size_t)-1);
0, (size_t)-1,
#ifdef RAWTEXT_IN_METADATA
rztxt
#endif
);
if (!m_ndb->m_wqueue.put(tp)) {
LOGERR("Db::purgeFile:Cant queue task\n");
return false;

View File

@ -70,11 +70,6 @@ enum value_slot {
////////// Recoll only:
// Doc sig as chosen by app (ex: mtime+size
VALUE_SIG = 10,
// Doc extracted text, with punctuation: splitter input. Used for
// generating snippets. This is only used if RAWTEXT_IN_VALUE is
// defined (else the text goes to the data record), but reserve
// the value in any case.
VALUE_RAWTEXT= 11,
};
class SearchData;

View File

@ -30,6 +30,10 @@
#endif // IDX_THREADS
#include "xmacros.h"
// Store raw doc text in data record or metadata ?
#undef RAWTEXT_IN_DATA
#define RAWTEXT_IN_METADATA
namespace Rcl {
class Query;
@ -51,10 +55,16 @@ public:
// available on the caller site.
// Take some care to avoid sharing string data (if string impl is cow)
DbUpdTask(Op _op, const string& ud, const string& un,
Xapian::Document *d, size_t tl)
: op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()),
doc(d), txtlen(tl)
{}
Xapian::Document *d, size_t tl
#ifdef RAWTEXT_IN_METADATA
, string& rztxt
#endif
) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()),
doc(d), txtlen(tl) {
#ifdef RAWTEXT_IN_METADATA
rawztext.swap(rztxt);
#endif
}
// Udi and uniterm equivalently designate the doc
Op op;
string udi;
@ -64,6 +74,9 @@ public:
// purge because we actually don't know it, and the code fakes a
// text length based on the term count.
size_t txtlen;
#ifdef RAWTEXT_IN_METADATA
string rawztext; // Compressed doc text
#endif
};
#endif // IDX_THREADS
@ -101,7 +114,11 @@ class Db::Native {
// Final steps of doc update, part which need to be single-threaded
bool addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document *doc, size_t txtlen);
Xapian::Document *doc, size_t txtlen
#ifdef RAWTEXT_IN_METADATA
, const string& rawztext
#endif
);
/** Delete all documents which are contained in the input document,
* which must be a file-level one.
@ -171,18 +188,39 @@ class Db::Native {
/** Check if a page position list is defined */
bool hasPages(Xapian::docid id);
#ifdef RAWTEXT_IN_METADATA
std::string rawtextMetaKey(Xapian::docid did) {
// Xapian's Olly Betts avises to use a key which will
// sort the same as the docid (which we do), and to
// use Xapian's pack_uint_preserving_sort() which is
// efficient but hard to read. I'd wager that this
// does not make much of a difference. 10 ascii bytes
// gives us 10 billion docs, which is enough (says I).
char buf[30];
sprintf(buf, "%010d", did);
return buf;
}
#endif
void deleteDocument(Xapian::docid docid) {
#ifdef RAWTEXT_IN_METADATA
string metareason;
XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), string()),
xwdb, metareason);
if (!metareason.empty()) {
LOGERR("deleteDocument: set_metadata error: " <<
metareason << "\n");
// not fatal
}
#endif
xwdb.delete_document(docid);
}
};
// This is the word position offset at which we index the body text
// (abstract, keywords, etc.. are stored before this)
static const unsigned int baseTextPosition = 100000;
// Store raw doc text in data record or value slot ?
#if 0
#define RAWTEXT_IN_DATA 1
#elif 1
#define RAWTEXT_IN_VALUE 1
#endif
}
#endif /* _rcldb_p_h_included_ */