diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp index c17e76ab..164a31f0 100644 --- a/src/rcldb/rclabsfromtext.cpp +++ b/src/rcldb/rclabsfromtext.cpp @@ -350,8 +350,9 @@ int Query::Native::abstractFromText( rawtext = doc.meta["RAWTEXT"]; } #endif -#ifdef RAWTEXT_IN_VALUE - XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason); +#ifdef RAWTEXT_IN_METADATA + XAPTRY(rawtext = ndb->xrdb.get_metadata(ndb->rawtextMetaKey(docid)), + ndb->xrdb, reason); if (!reason.empty()) { - LOGERR("abstractFromText: could not get value: " << reason << endl); + LOGERR("abstractFromText: could not get metadata: " << reason << endl); return ABSRES_ERROR; @@ -367,7 +368,7 @@ int Query::Native::abstractFromText( } #if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \ - (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE)) + (defined(RAWTEXT_IN_DATA)) // Tryout the Xapian internal method. string snippet = xmset.snippet(rawtext); LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n"); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 7874d0b3..0efa1eea 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -191,7 +191,11 @@ void *DbUpdWorker(void* vdbp) case DbUpdTask::AddOrUpdate: LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n"); status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm, - tsk->doc, tsk->txtlen); + tsk->doc, tsk->txtlen +#ifdef RAWTEXT_IN_METADATA + , tsk->rawztext +#endif + ); break; case DbUpdTask::Delete: LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n"); @@ -585,13 +589,17 @@ int Db::Native::getPageNumberForPosition(const vector& pbreaks, int pos) // to delete it before returning. 
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr, - size_t textlen) + size_t textlen +#ifdef RAWTEXT_IN_METADATA + , const string& rawztext +#endif + ) { #ifdef IDX_THREADS Chrono chron; std::unique_lock lock(m_mutex); #endif - std::shared_ptr doc_cleaner(newdocument_ptr); + std::unique_ptr doc_cleaner(newdocument_ptr); // Check file system full every mbyte of indexed text. It's a bit wasteful // to do this after having prepared the document, but it needs to be in @@ -614,9 +622,9 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, string ermsg; // Add db entry or update existing entry: + Xapian::docid did = 0; try { - Xapian::docid did = - xwdb.replace_document(uniterm, *newdocument_ptr); + did = xwdb.replace_document(uniterm, *newdocument_ptr); if (did < m_rcldb->updated.size()) { // This is necessary because only the file-level docs are tested // by needUpdate(), so the subdocs existence flags are only set @@ -627,7 +635,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n"); } } XCATCHERROR(ermsg); - if (!ermsg.empty()) { LOGERR("Db::add: replace_document failed: " << ermsg << "\n"); ermsg.erase(); @@ -643,6 +650,16 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, } } +#ifdef RAWTEXT_IN_METADATA + XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext), + xwdb, m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + LOGERR("Db::addOrUpdate: set_metadata error: " << + m_rcldb->m_reason << "\n"); + // This only affects snippets, so let's say not fatal + } +#endif + // Test if we're over the flush threshold (limit memory usage): bool ret = m_rcldb->maybeflush(textlen); #ifdef IDX_THREADS @@ -682,7 +699,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, } } else { LOGDEB("purgeFile: delete docid " << *docid << "\n"); - 
xwdb.delete_document(*docid); + deleteDocument(*docid); } vector docids; subDocs(udi, 0, docids); @@ -705,7 +722,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, if (!orphansOnly || sig != subdocsig) { LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n"); - xwdb.delete_document(*it); + deleteDocument(*it); } } return true; @@ -1365,6 +1382,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) // Udi unique term: this is used for file existence/uptodate // checks, and unique id for the replace_document() call. string uniterm = make_uniterm(udi); +#if defined(RAWTEXT_IN_METADATA) + string rawztext; // Deflated doc text, handed to addOrUpdateWrite() for storage as index metadata +#endif if (doc.onlyxattr) { // Only updating an existing doc with new extended attributes @@ -1468,13 +1488,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) if (!splitter.text_to_words(doc.text)) { LOGDEB("Db::addOrUpdate: split failed for main text\n"); } else { -#ifdef RAWTEXT_IN_VALUE +#if defined(RAWTEXT_IN_METADATA) if (o_index_storedoctext) { ZLibUtBuf buf; deflateToBuf(doc.text.c_str(), doc.text.size(), buf); - string tt; - tt.assign(buf.getBuf(), buf.getCnt()); - newdocument.add_value(VALUE_RAWTEXT, tt); + rawztext.assign(buf.getBuf(), buf.getCnt()); } #endif } @@ -1700,7 +1718,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, - newdocument_ptr, doc.text.length()); + newdocument_ptr, doc.text.length() +#ifdef RAWTEXT_IN_METADATA + , rawztext +#endif + ); if (!m_ndb->m_wqueue.put(tp)) { LOGERR("Db::addOrUpdate:Cant queue task\n"); delete newdocument_ptr; @@ -1712,7 +1734,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) #endif return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr, - doc.text.length()); + doc.text.length() +#ifdef RAWTEXT_IN_METADATA + , rawztext +#endif + ); 
} bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, @@ -2076,7 +2102,7 @@ bool Db::purge() Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid); maybeflush(trms * 5); } - m_ndb->xwdb.delete_document(docid); + m_ndb->deleteDocument(docid); LOGDEB("Db::purge: deleted document #" << docid << "\n"); } catch (const Xapian::DocNotFoundError &) { LOGDEB0("Db::purge: document #" << docid << " not found\n"); @@ -2137,8 +2163,13 @@ bool Db::purgeFile(const string &udi, bool *existed) #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { + string rztxt; DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, - 0, (size_t)-1); + 0, (size_t)-1 +#if defined(RAWTEXT_IN_METADATA) + , rztxt +#endif + ); if (!m_ndb->m_wqueue.put(tp)) { LOGERR("Db::purgeFile:Cant queue task\n"); return false; @@ -2164,8 +2195,13 @@ bool Db::purgeOrphans(const string &udi) #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { + string rztxt; DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, - 0, (size_t)-1); + 0, (size_t)-1 +#ifdef RAWTEXT_IN_METADATA + , rztxt +#endif + ); if (!m_ndb->m_wqueue.put(tp)) { LOGERR("Db::purgeFile:Cant queue task\n"); return false; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 0c8cca61..dc124c2f 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -70,11 +70,6 @@ enum value_slot { ////////// Recoll only: // Doc sig as chosen by app (ex: mtime+size VALUE_SIG = 10, - // Doc extracted text, with punctuation: splitter input. Used for - // generating snippets. This is only used if RAWTEXT_IN_VALUE is - // defined (else the text goes to the data record), but reserve - // the value in any case. - VALUE_RAWTEXT= 11, }; class SearchData; diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index 12302869..4ea97f41 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -30,6 +30,10 @@ #endif // IDX_THREADS #include "xmacros.h" +// Store raw doc text in data record or metadata ? 
+#undef RAWTEXT_IN_DATA +#define RAWTEXT_IN_METADATA + namespace Rcl { class Query; @@ -51,10 +55,16 @@ public: // available on the caller site. // Take some care to avoid sharing string data (if string impl is cow) DbUpdTask(Op _op, const string& ud, const string& un, - Xapian::Document *d, size_t tl) - : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()), - doc(d), txtlen(tl) - {} + Xapian::Document *d, size_t tl +#ifdef RAWTEXT_IN_METADATA + , string& rztxt +#endif + ) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()), + doc(d), txtlen(tl) { +#ifdef RAWTEXT_IN_METADATA + rawztext.swap(rztxt); +#endif + } // Udi and uniterm equivalently designate the doc Op op; string udi; @@ -64,6 +74,9 @@ public: // purge because we actually don't know it, and the code fakes a // text length based on the term count. size_t txtlen; +#ifdef RAWTEXT_IN_METADATA + string rawztext; // Compressed doc text +#endif }; #endif // IDX_THREADS @@ -101,7 +114,11 @@ class Db::Native { // Final steps of doc update, part which need to be single-threaded bool addOrUpdateWrite(const string& udi, const string& uniterm, - Xapian::Document *doc, size_t txtlen); + Xapian::Document *doc, size_t txtlen +#ifdef RAWTEXT_IN_METADATA + , const string& rawztext +#endif + ); /** Delete all documents which are contained in the input document, * which must be a file-level one. @@ -171,18 +188,39 @@ class Db::Native { /** Check if a page position list is defined */ bool hasPages(Xapian::docid id); + +#ifdef RAWTEXT_IN_METADATA + std::string rawtextMetaKey(Xapian::docid did) { + // Xapian's Olly Betts advises to use a key which will + // sort the same as the docid (which we do), and to + // use Xapian's pack_uint_preserving_sort() which is + // efficient but hard to read. I'd wager that this + // does not make much of a difference. 10 ascii bytes + // gives us 10 billion docs, which is enough (says I). 
+ char buf[30]; + snprintf(buf, sizeof(buf), "%010u", did); + return buf; + } +#endif + + void deleteDocument(Xapian::docid docid) { +#ifdef RAWTEXT_IN_METADATA + string metareason; + XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), string()), + xwdb, metareason); + if (!metareason.empty()) { + LOGERR("deleteDocument: set_metadata error: " << + metareason << "\n"); + // Not fatal: at worst a stale raw text entry remains + } +#endif + xwdb.delete_document(docid); + } }; // This is the word position offset at which we index the body text // (abstract, keywords, etc.. are stored before this) static const unsigned int baseTextPosition = 100000; -// Store raw doc text in data record or value slot ? -#if 0 -#define RAWTEXT_IN_DATA 1 -#elif 1 -#define RAWTEXT_IN_VALUE 1 -#endif - } #endif /* _rcldb_p_h_included_ */