Abstracts: storing raw doc text in user metadata records
This commit is contained in:
parent
57d9ece876
commit
2c76a70c0e
@ -350,8 +350,9 @@ int Query::Native::abstractFromText(
|
|||||||
rawtext = doc.meta["RAWTEXT"];
|
rawtext = doc.meta["RAWTEXT"];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#ifdef RAWTEXT_IN_VALUE
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason);
|
XAPTRY(rawtext = ndb->xrdb.get_metadata(ndb->rawtextMetaKey(docid)),
|
||||||
|
ndb->xrdb, reason);
|
||||||
if (!reason.empty()) {
|
if (!reason.empty()) {
|
||||||
LOGERR("abstractFromText: could not get value: " << reason << endl);
|
LOGERR("abstractFromText: could not get value: " << reason << endl);
|
||||||
return ABSRES_ERROR;
|
return ABSRES_ERROR;
|
||||||
@ -367,7 +368,7 @@ int Query::Native::abstractFromText(
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
|
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
|
||||||
(defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE))
|
(defined(RAWTEXT_IN_DATA))
|
||||||
// Tryout the Xapian internal method.
|
// Tryout the Xapian internal method.
|
||||||
string snippet = xmset.snippet(rawtext);
|
string snippet = xmset.snippet(rawtext);
|
||||||
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
|
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
|
||||||
|
|||||||
@ -191,7 +191,11 @@ void *DbUpdWorker(void* vdbp)
|
|||||||
case DbUpdTask::AddOrUpdate:
|
case DbUpdTask::AddOrUpdate:
|
||||||
LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
|
LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
|
||||||
status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm,
|
status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm,
|
||||||
tsk->doc, tsk->txtlen);
|
tsk->doc, tsk->txtlen
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
, tsk->rawztext
|
||||||
|
#endif
|
||||||
|
);
|
||||||
break;
|
break;
|
||||||
case DbUpdTask::Delete:
|
case DbUpdTask::Delete:
|
||||||
LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
|
LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
|
||||||
@ -585,13 +589,17 @@ int Db::Native::getPageNumberForPosition(const vector<int>& pbreaks, int pos)
|
|||||||
// to delete it before returning.
|
// to delete it before returning.
|
||||||
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||||
Xapian::Document *newdocument_ptr,
|
Xapian::Document *newdocument_ptr,
|
||||||
size_t textlen)
|
size_t textlen
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
, const string& rawztext
|
||||||
|
#endif
|
||||||
|
)
|
||||||
{
|
{
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
Chrono chron;
|
Chrono chron;
|
||||||
std::unique_lock<std::mutex> lock(m_mutex);
|
std::unique_lock<std::mutex> lock(m_mutex);
|
||||||
#endif
|
#endif
|
||||||
std::shared_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
|
std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
|
||||||
|
|
||||||
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
||||||
// to do this after having prepared the document, but it needs to be in
|
// to do this after having prepared the document, but it needs to be in
|
||||||
@ -614,9 +622,9 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
|||||||
string ermsg;
|
string ermsg;
|
||||||
|
|
||||||
// Add db entry or update existing entry:
|
// Add db entry or update existing entry:
|
||||||
|
Xapian::docid did = 0;
|
||||||
try {
|
try {
|
||||||
Xapian::docid did =
|
did = xwdb.replace_document(uniterm, *newdocument_ptr);
|
||||||
xwdb.replace_document(uniterm, *newdocument_ptr);
|
|
||||||
if (did < m_rcldb->updated.size()) {
|
if (did < m_rcldb->updated.size()) {
|
||||||
// This is necessary because only the file-level docs are tested
|
// This is necessary because only the file-level docs are tested
|
||||||
// by needUpdate(), so the subdocs existence flags are only set
|
// by needUpdate(), so the subdocs existence flags are only set
|
||||||
@ -627,7 +635,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
|||||||
LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
|
LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
|
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
|
LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
|
||||||
ermsg.erase();
|
ermsg.erase();
|
||||||
@ -643,6 +650,16 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
|
||||||
|
xwdb, m_rcldb->m_reason);
|
||||||
|
if (!m_rcldb->m_reason.empty()) {
|
||||||
|
LOGERR("Db::addOrUpdate: set_metadata error: " <<
|
||||||
|
m_rcldb->m_reason << "\n");
|
||||||
|
// This only affects snippets, so let's say not fatal
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Test if we're over the flush threshold (limit memory usage):
|
// Test if we're over the flush threshold (limit memory usage):
|
||||||
bool ret = m_rcldb->maybeflush(textlen);
|
bool ret = m_rcldb->maybeflush(textlen);
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
@ -682,7 +699,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOGDEB("purgeFile: delete docid " << *docid << "\n");
|
LOGDEB("purgeFile: delete docid " << *docid << "\n");
|
||||||
xwdb.delete_document(*docid);
|
deleteDocument(*docid);
|
||||||
}
|
}
|
||||||
vector<Xapian::docid> docids;
|
vector<Xapian::docid> docids;
|
||||||
subDocs(udi, 0, docids);
|
subDocs(udi, 0, docids);
|
||||||
@ -705,7 +722,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
|
|||||||
|
|
||||||
if (!orphansOnly || sig != subdocsig) {
|
if (!orphansOnly || sig != subdocsig) {
|
||||||
LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
|
LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
|
||||||
xwdb.delete_document(*it);
|
deleteDocument(*it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -1365,6 +1382,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
// Udi unique term: this is used for file existence/uptodate
|
// Udi unique term: this is used for file existence/uptodate
|
||||||
// checks, and unique id for the replace_document() call.
|
// checks, and unique id for the replace_document() call.
|
||||||
string uniterm = make_uniterm(udi);
|
string uniterm = make_uniterm(udi);
|
||||||
|
#if defined(RAWTEXT_IN_METADATA)
|
||||||
|
string rawztext; // Doc compressed text
|
||||||
|
#endif
|
||||||
|
|
||||||
if (doc.onlyxattr) {
|
if (doc.onlyxattr) {
|
||||||
// Only updating an existing doc with new extended attributes
|
// Only updating an existing doc with new extended attributes
|
||||||
@ -1468,13 +1488,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
if (!splitter.text_to_words(doc.text)) {
|
if (!splitter.text_to_words(doc.text)) {
|
||||||
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
||||||
} else {
|
} else {
|
||||||
#ifdef RAWTEXT_IN_VALUE
|
#if defined(RAWTEXT_IN_METADATA)
|
||||||
if (o_index_storedoctext) {
|
if (o_index_storedoctext) {
|
||||||
ZLibUtBuf buf;
|
ZLibUtBuf buf;
|
||||||
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
|
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
|
||||||
string tt;
|
rawztext.assign(buf.getBuf(), buf.getCnt());
|
||||||
tt.assign(buf.getBuf(), buf.getCnt());
|
|
||||||
newdocument.add_value(VALUE_RAWTEXT, tt);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -1700,7 +1718,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
if (m_ndb->m_havewriteq) {
|
if (m_ndb->m_havewriteq) {
|
||||||
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
||||||
newdocument_ptr, doc.text.length());
|
newdocument_ptr, doc.text.length()
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
, rawztext
|
||||||
|
#endif
|
||||||
|
);
|
||||||
if (!m_ndb->m_wqueue.put(tp)) {
|
if (!m_ndb->m_wqueue.put(tp)) {
|
||||||
LOGERR("Db::addOrUpdate:Cant queue task\n");
|
LOGERR("Db::addOrUpdate:Cant queue task\n");
|
||||||
delete newdocument_ptr;
|
delete newdocument_ptr;
|
||||||
@ -1712,7 +1734,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
|
return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
|
||||||
doc.text.length());
|
doc.text.length()
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
, rawztext
|
||||||
|
#endif
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
||||||
@ -2076,7 +2102,7 @@ bool Db::purge()
|
|||||||
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
|
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
|
||||||
maybeflush(trms * 5);
|
maybeflush(trms * 5);
|
||||||
}
|
}
|
||||||
m_ndb->xwdb.delete_document(docid);
|
m_ndb->deleteDocument(docid);
|
||||||
LOGDEB("Db::purge: deleted document #" << docid << "\n");
|
LOGDEB("Db::purge: deleted document #" << docid << "\n");
|
||||||
} catch (const Xapian::DocNotFoundError &) {
|
} catch (const Xapian::DocNotFoundError &) {
|
||||||
LOGDEB0("Db::purge: document #" << docid << " not found\n");
|
LOGDEB0("Db::purge: document #" << docid << " not found\n");
|
||||||
@ -2137,8 +2163,13 @@ bool Db::purgeFile(const string &udi, bool *existed)
|
|||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
if (m_ndb->m_havewriteq) {
|
if (m_ndb->m_havewriteq) {
|
||||||
|
string rztxt;
|
||||||
DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
|
DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
|
||||||
0, (size_t)-1);
|
0, (size_t)-1,
|
||||||
|
#if defined(RAWTEXT_IN_METADATA)
|
||||||
|
rztxt
|
||||||
|
#endif
|
||||||
|
);
|
||||||
if (!m_ndb->m_wqueue.put(tp)) {
|
if (!m_ndb->m_wqueue.put(tp)) {
|
||||||
LOGERR("Db::purgeFile:Cant queue task\n");
|
LOGERR("Db::purgeFile:Cant queue task\n");
|
||||||
return false;
|
return false;
|
||||||
@ -2164,8 +2195,13 @@ bool Db::purgeOrphans(const string &udi)
|
|||||||
|
|
||||||
#ifdef IDX_THREADS
|
#ifdef IDX_THREADS
|
||||||
if (m_ndb->m_havewriteq) {
|
if (m_ndb->m_havewriteq) {
|
||||||
|
string rztxt;
|
||||||
DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
|
DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
|
||||||
0, (size_t)-1);
|
0, (size_t)-1,
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
rztxt
|
||||||
|
#endif
|
||||||
|
);
|
||||||
if (!m_ndb->m_wqueue.put(tp)) {
|
if (!m_ndb->m_wqueue.put(tp)) {
|
||||||
LOGERR("Db::purgeFile:Cant queue task\n");
|
LOGERR("Db::purgeFile:Cant queue task\n");
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -70,11 +70,6 @@ enum value_slot {
|
|||||||
////////// Recoll only:
|
////////// Recoll only:
|
||||||
// Doc sig as chosen by app (ex: mtime+size
|
// Doc sig as chosen by app (ex: mtime+size
|
||||||
VALUE_SIG = 10,
|
VALUE_SIG = 10,
|
||||||
// Doc extracted text, with punctuation: splitter input. Used for
|
|
||||||
// generating snippets. This is only used if RAWTEXT_IN_VALUE is
|
|
||||||
// defined (else the text goes to the data record), but reserve
|
|
||||||
// the value in any case.
|
|
||||||
VALUE_RAWTEXT= 11,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class SearchData;
|
class SearchData;
|
||||||
|
|||||||
@ -30,6 +30,10 @@
|
|||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
#include "xmacros.h"
|
#include "xmacros.h"
|
||||||
|
|
||||||
|
// Store raw doc text in data record or metadata ?
|
||||||
|
#undef RAWTEXT_IN_DATA
|
||||||
|
#define RAWTEXT_IN_METADATA
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
class Query;
|
class Query;
|
||||||
@ -51,10 +55,16 @@ public:
|
|||||||
// available on the caller site.
|
// available on the caller site.
|
||||||
// Take some care to avoid sharing string data (if string impl is cow)
|
// Take some care to avoid sharing string data (if string impl is cow)
|
||||||
DbUpdTask(Op _op, const string& ud, const string& un,
|
DbUpdTask(Op _op, const string& ud, const string& un,
|
||||||
Xapian::Document *d, size_t tl)
|
Xapian::Document *d, size_t tl
|
||||||
: op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()),
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
doc(d), txtlen(tl)
|
, string& rztxt
|
||||||
{}
|
#endif
|
||||||
|
) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()),
|
||||||
|
doc(d), txtlen(tl) {
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
rawztext.swap(rztxt);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
// Udi and uniterm equivalently designate the doc
|
// Udi and uniterm equivalently designate the doc
|
||||||
Op op;
|
Op op;
|
||||||
string udi;
|
string udi;
|
||||||
@ -64,6 +74,9 @@ public:
|
|||||||
// purge because we actually don't know it, and the code fakes a
|
// purge because we actually don't know it, and the code fakes a
|
||||||
// text length based on the term count.
|
// text length based on the term count.
|
||||||
size_t txtlen;
|
size_t txtlen;
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
string rawztext; // Compressed doc text
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
#endif // IDX_THREADS
|
#endif // IDX_THREADS
|
||||||
|
|
||||||
@ -101,7 +114,11 @@ class Db::Native {
|
|||||||
|
|
||||||
// Final steps of doc update, part which need to be single-threaded
|
// Final steps of doc update, part which need to be single-threaded
|
||||||
bool addOrUpdateWrite(const string& udi, const string& uniterm,
|
bool addOrUpdateWrite(const string& udi, const string& uniterm,
|
||||||
Xapian::Document *doc, size_t txtlen);
|
Xapian::Document *doc, size_t txtlen
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
, const string& rawztext
|
||||||
|
#endif
|
||||||
|
);
|
||||||
|
|
||||||
/** Delete all documents which are contained in the input document,
|
/** Delete all documents which are contained in the input document,
|
||||||
* which must be a file-level one.
|
* which must be a file-level one.
|
||||||
@ -171,18 +188,39 @@ class Db::Native {
|
|||||||
|
|
||||||
/** Check if a page position list is defined */
|
/** Check if a page position list is defined */
|
||||||
bool hasPages(Xapian::docid id);
|
bool hasPages(Xapian::docid id);
|
||||||
|
|
||||||
|
#ifdef RAWTEXT_IN_METADATA
|
||||||
|
/** Build the user-metadata key under which the raw text of document
 *  @p did is stored.
 *
 *  The key is the docid printed as a zero-padded, 10-digit decimal
 *  number, so that keys compare (as strings) in the same order as the
 *  docids themselves.
 *
 *  @param did  Xapian document id.
 *  @return the metadata key for this document.
 */
std::string rawtextMetaKey(Xapian::docid did) {
    // Xapian's Olly Betts advises to use a key which will
    // sort the same as the docid (which we do), and to
    // use Xapian's pack_uint_preserving_sort() which is
    // efficient but hard to read. I'd wager that this
    // does not make much of a difference. 10 ascii bytes
    // gives us 10 billion docs, which is enough (says I).
    char buf[30];
    // The docid is an unsigned quantity: format it as such. With "%d" a
    // value above INT_MAX would come out negative, which breaks both the
    // fixed width and the sort order. snprintf() bounds the write to the
    // buffer whatever the platform's integer width.
    // NOTE(review): assumes Xapian::docid fits in unsigned int (it is
    // documented as a plain "unsigned" typedef) -- confirm.
    snprintf(buf, sizeof(buf), "%010u", static_cast<unsigned int>(did));
    return buf;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/** Delete document @p docid from the writable index.
 *
 *  When RAWTEXT_IN_METADATA is defined, the metadata record keyed by
 *  rawtextMetaKey(docid) is first set to an empty string. A failure of
 *  that step is logged and otherwise ignored; the document deletion is
 *  attempted in all cases.
 *
 *  @param docid  Xapian id of the document to delete.
 */
void deleteDocument(Xapian::docid docid) {
#ifdef RAWTEXT_IN_METADATA
    {
        // NOTE(review): Xapian documents that setting a metadata value
        // to the empty string removes the key -- confirm.
        const string emptyvalue;
        string seterr;
        XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), emptyvalue),
               xwdb, seterr);
        if (seterr.empty() == false) {
            // Not fatal: go on and delete the document anyway.
            LOGERR("deleteDocument: set_metadata error: " <<
                   seterr << "\n");
        }
    }
#endif
    xwdb.delete_document(docid);
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// This is the word position offset at which we index the body text
|
// This is the word position offset at which we index the body text
|
||||||
// (abstract, keywords, etc.. are stored before this)
|
// (abstract, keywords, etc.. are stored before this)
|
||||||
static const unsigned int baseTextPosition = 100000;
|
static const unsigned int baseTextPosition = 100000;
|
||||||
|
|
||||||
// Store raw doc text in data record or value slot ?
|
|
||||||
#if 0
|
|
||||||
#define RAWTEXT_IN_DATA 1
|
|
||||||
#elif 1
|
|
||||||
#define RAWTEXT_IN_VALUE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* _rcldb_p_h_included_ */
|
#endif /* _rcldb_p_h_included_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user