Abstracts: storing raw doc text in user metadata records

2018-01-06 11:38:24 +01:00 · 2018-01-06 11:38:24 +01:00 · 2c76a70c0e
commit 2c76a70c0e
parent 57d9ece876
4 changed files with 107 additions and 37 deletions
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@ -350,8 +350,9 @@ int Query::Native::abstractFromText(
        rawtext = doc.meta["RAWTEXT"];
    }
 #endif
-#ifdef RAWTEXT_IN_VALUE
+#ifdef RAWTEXT_IN_METADATA
-    XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason);
+    XAPTRY(rawtext = ndb->xrdb.get_metadata(ndb->rawtextMetaKey(docid)),
           ndb->xrdb, reason);
    if (!reason.empty()) {
        LOGERR("abstractFromText: could not get value: " << reason << endl);
        return ABSRES_ERROR;
@ -367,7 +368,7 @@ int Query::Native::abstractFromText(
    }
 #if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2)  && \
-    (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE))
+    (defined(RAWTEXT_IN_DATA))
    // Tryout the Xapian internal method.
    string snippet = xmset.snippet(rawtext);
    LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -191,7 +191,11 @@ void *DbUpdWorker(void* vdbp)
 	case DbUpdTask::AddOrUpdate:
 	    LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
 	    status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm, 
-					    tsk->doc, tsk->txtlen);
+					    tsk->doc, tsk->txtlen
 #ifdef RAWTEXT_IN_METADATA
                          , tsk->rawztext
 #endif
                );
 	    break;
 	case DbUpdTask::Delete:
 	    LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
@ -585,13 +589,17 @@ int Db::Native::getPageNumberForPosition(const vector<int>& pbreaks, int pos)
 // to delete it before returning.
 bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, 
 				  Xapian::Document *newdocument_ptr, 
-                                  size_t textlen)
+                                  size_t textlen
 #ifdef RAWTEXT_IN_METADATA
                          , const string& rawztext
 #endif
    )
 {
 #ifdef IDX_THREADS
    Chrono chron;
    std::unique_lock<std::mutex> lock(m_mutex);
 #endif
-    std::shared_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
+    std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
    // Check file system full every mbyte of indexed text. It's a bit wasteful
    // to do this after having prepared the document, but it needs to be in
@ -614,9 +622,9 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
    string ermsg;
    // Add db entry or update existing entry:
    Xapian::docid did = 0;
    try {
-	Xapian::docid did = 
+	did = xwdb.replace_document(uniterm, *newdocument_ptr);
 	    xwdb.replace_document(uniterm, *newdocument_ptr);
 	if (did < m_rcldb->updated.size()) {
            // This is necessary because only the file-level docs are tested
            // by needUpdate(), so the subdocs existence flags are only set
@ -627,7 +635,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
 	    LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
 	}
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
 	LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
 	ermsg.erase();
@ -643,6 +650,16 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
 	}
    }
 #ifdef RAWTEXT_IN_METADATA
    XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
           xwdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR("Db::addOrUpdate: set_metadata error: " <<
               m_rcldb->m_reason << "\n");
        // This only affects snippets, so let's say not fatal
    }
 #endif
    // Test if we're over the flush threshold (limit memory usage):
    bool ret = m_rcldb->maybeflush(textlen);
 #ifdef IDX_THREADS
@ -682,7 +699,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
 	    }
 	} else {
 	    LOGDEB("purgeFile: delete docid " << *docid << "\n");
-	    xwdb.delete_document(*docid);
+            deleteDocument(*docid);
 	}
 	vector<Xapian::docid> docids;
 	subDocs(udi, 0, docids);
@ -705,7 +722,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
 	    if (!orphansOnly || sig != subdocsig) {
 		LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
-		xwdb.delete_document(*it);
+		deleteDocument(*it);
 	    }
 	}
 	return true;
@ -1365,6 +1382,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
    // Udi unique term: this is used for file existence/uptodate
    // checks, and unique id for the replace_document() call.
    string uniterm = make_uniterm(udi);
 #if defined(RAWTEXT_IN_METADATA)
        string rawztext; // Doc compressed text
 #endif
    if (doc.onlyxattr) {
 	// Only updating an existing doc with new extended attributes
@ -1468,13 +1488,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 	if (!splitter.text_to_words(doc.text)) {
 	    LOGDEB("Db::addOrUpdate: split failed for main text\n");
        } else {
-#ifdef RAWTEXT_IN_VALUE
+#if defined(RAWTEXT_IN_METADATA)
            if (o_index_storedoctext) {
                ZLibUtBuf buf;
                deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
-                string tt;
+                rawztext.assign(buf.getBuf(), buf.getCnt());
                tt.assign(buf.getBuf(), buf.getCnt());
                newdocument.add_value(VALUE_RAWTEXT, tt);
            }
 #endif
        }
@ -1700,7 +1718,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 #ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
 	DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, 
-				      newdocument_ptr, doc.text.length());
+				      newdocument_ptr, doc.text.length()
 #ifdef RAWTEXT_IN_METADATA
                                      , rawztext
 #endif
            );
 	if (!m_ndb->m_wqueue.put(tp)) {
 	    LOGERR("Db::addOrUpdate:Cant queue task\n");
            delete newdocument_ptr;
@ -1712,7 +1734,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 #endif
    return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
-				   doc.text.length());
+				   doc.text.length()
 #ifdef RAWTEXT_IN_METADATA
                                   , rawztext
 #endif
        );
 }
 bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
@ -2076,7 +2102,7 @@ bool Db::purge()
 		    Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
 		    maybeflush(trms * 5);
 		}
-		m_ndb->xwdb.delete_document(docid);
+		m_ndb->deleteDocument(docid);
 		LOGDEB("Db::purge: deleted document #" << docid << "\n");
 	    } catch (const Xapian::DocNotFoundError &) {
 		LOGDEB0("Db::purge: document #" << docid << " not found\n");
@ -2137,8 +2163,13 @@ bool Db::purgeFile(const string &udi, bool *existed)
 #ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
        string rztxt;
 	DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, 
-				      0, (size_t)-1);
+				      0, (size_t)-1,
 #if defined(RAWTEXT_IN_METADATA)
                                      rztxt
 #endif
            );
 	if (!m_ndb->m_wqueue.put(tp)) {
 	    LOGERR("Db::purgeFile:Cant queue task\n");
 	    return false;
@ -2164,8 +2195,13 @@ bool Db::purgeOrphans(const string &udi)
 #ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
        string rztxt;
 	DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, 
-				      0, (size_t)-1);
+				      0, (size_t)-1,
 #ifdef RAWTEXT_IN_METADATA
                                      rztxt
 #endif
            );
 	if (!m_ndb->m_wqueue.put(tp)) {
 	    LOGERR("Db::purgeFile:Cant queue task\n");
 	    return false;
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -70,11 +70,6 @@ enum value_slot {
    ////////// Recoll only:
    // Doc sig as chosen by app (ex: mtime+size)
    VALUE_SIG = 10,
    // Doc extracted text, with punctuation: splitter input. Used for
    // generating snippets. This is only used if RAWTEXT_IN_VALUE is
    // defined (else the text goes to the data record), but reserve
    // the value in any case.
    VALUE_RAWTEXT= 11,  
 };
 class SearchData;
--- a/src/rcldb/rcldb_p.h
+++ b/src/rcldb/rcldb_p.h
@ -30,6 +30,10 @@
 #endif // IDX_THREADS
 #include "xmacros.h"
 // Store raw doc text in data record or metadata ?
 #undef RAWTEXT_IN_DATA
 #define RAWTEXT_IN_METADATA
 namespace Rcl {
 class Query;
@ -51,10 +55,16 @@ public:
    // available on the caller site.
    // Take some care to avoid sharing string data (if string impl is cow)
    DbUpdTask(Op _op, const string& ud, const string& un, 
-	      Xapian::Document *d, size_t tl)
+	      Xapian::Document *d, size_t tl
-	: op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()), 
+#ifdef RAWTEXT_IN_METADATA
-          doc(d), txtlen(tl)
+              , string& rztxt
-    {}
+#endif
        ) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()), 
            doc(d), txtlen(tl) {
 #ifdef RAWTEXT_IN_METADATA
        rawztext.swap(rztxt);
 #endif
    }
    // Udi and uniterm equivalently designate the doc
    Op op;
    string udi;
@ -64,6 +74,9 @@ public:
    // purge because we actually don't know it, and the code fakes a
    // text length based on the term count.
    size_t txtlen;
 #ifdef RAWTEXT_IN_METADATA
    string rawztext; // Compressed doc text
 #endif
 };
 #endif // IDX_THREADS
@ -101,7 +114,11 @@ class Db::Native {
    // Final steps of doc update, part which need to be single-threaded
    bool addOrUpdateWrite(const string& udi, const string& uniterm, 
-			  Xapian::Document *doc, size_t txtlen);
+			  Xapian::Document *doc, size_t txtlen
 #ifdef RAWTEXT_IN_METADATA
                          , const string& rawztext
 #endif
        );
    /** Delete all documents which are contained in the input document, 
     * which must be a file-level one.
@ -171,18 +188,39 @@ class Db::Native {
    /** Check if a page position list is defined */
    bool hasPages(Xapian::docid id);
 #ifdef RAWTEXT_IN_METADATA
    /** Build the metadata key under which a document's raw text is stored.
     *
     * The key is the docid rendered as a zero-padded, 10-character
     * decimal string, so that keys sort in the same order as docids.
     *
     * @param did Xapian document id.
     * @return the metadata key for this docid.
     */
    std::string rawtextMetaKey(Xapian::docid did) {
        // Xapian's Olly Betts advises to use a key which will
        // sort the same as the docid (which we do), and to
        // use Xapian's pack_uint_preserving_sort() which is
        // efficient but hard to read. I'd wager that this
        // does not make much of a difference. 10 ascii bytes
        // gives us 10 billion docs, which is enough (says I).
        char buf[30];
        // docid is an unsigned type: format it as unsigned so that
        // large ids do not come out as negative numbers (which would
        // break the sort order), and bound the write to the buffer.
        snprintf(buf, sizeof(buf), "%010u", static_cast<unsigned int>(did));
        return buf;
    }
 #endif
    /** Delete a document from the writable index (xwdb).
     *
     * When RAWTEXT_IN_METADATA is defined, the metadata entry keyed by
     * rawtextMetaKey(docid) is first set to an empty string, so that the
     * raw text stored for the document does not outlive it. A failure
     * of that step is logged and otherwise ignored.
     *
     * NOTE(review): delete_document() is called outside of XAPTRY, so any
     * exception it raises presumably propagates to the caller -- confirm
     * that all call sites are prepared for this.
     *
     * @param docid Xapian id of the document to delete.
     */
    void deleteDocument(Xapian::docid docid) {
 #ifdef RAWTEXT_IN_METADATA
        // Collects the error message from the metadata update, if any.
        string metareason;
        // Overwrite the per-document raw text entry with an empty value
        // (presumably this removes the key on the Xapian side -- see the
        // set_metadata() documentation).
        XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), string()),
               xwdb, metareason);
        if (!metareason.empty()) {
            LOGERR("deleteDocument: set_metadata error: " <<
                   metareason << "\n");
            // not fatal
        }
 #endif
        // Remove the document itself, whatever the metadata step did.
        xwdb.delete_document(docid);
    }
 };
 // This is the word position offset at which we index the body text
 // (abstract, keywords, etc.. are stored before this)
 static const unsigned int baseTextPosition = 100000;
 // Store raw doc text in data record or value slot ?
 #if 0
 #define RAWTEXT_IN_DATA 1
 #elif 1
 #define RAWTEXT_IN_VALUE 1
 #endif
 }
 #endif /* _rcldb_p_h_included_ */