From bb810f9ceb3f9ecdddba7c1e2b4553a9a47e4796 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Tue, 2 Jan 2018 19:23:12 +0100
Subject: [PATCH] Changed new param name storerawtext->storedoctext. + comments

---
 src/common/rclconfig.cpp     | 11 +++++------
 src/common/rclconfig.h       |  2 +-
 src/rcldb/rclabsfromtext.cpp |  2 +-
 src/rcldb/rclabstract.cpp    |  2 +-
 src/rcldb/rcldb.cpp          | 10 ++++++----
 src/sampleconf/recoll.conf   |  4 ++--
 src/utils/hldata.h           |  8 ++++++--
 7 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index e7a0234c..87651030 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -64,11 +64,10 @@ bool o_index_stripchars = true;
 // instead of building them from index position data. Has become
 // necessary for versions of Xapian 1.6, which have dropped support
 // for the chert index format, and adopted a setup which renders our
-// use of positions list unacceptably slow in cases. 'raw' text here
-// means that the text is not stripped of upper-case, diacritics, or
-// punctuation signs. It is still translated from its original format
-// to UTF-8 plain text.
-bool o_index_storerawtext = false;
+// use of positions list unacceptably slow in cases. The text is just
+// translated from its original format to UTF-8 plain text, and is not
+// stripped of upper-case, diacritics, or punctuation signs.
+bool o_index_storedoctext = false;
 
 bool o_uptodate_test_use_mtime = false;
 
@@ -401,7 +400,7 @@ bool RclConfig::updateMainConfig()
     static int m_index_stripchars_init = 0;
     if (!m_index_stripchars_init) {
 	getConfParam("indexStripChars", &o_index_stripchars);
-        getConfParam("indexStoreRawText", &o_index_storerawtext);
+        getConfParam("indexStoreDocText", &o_index_storedoctext);
         getConfParam("testmodifusemtime", &o_uptodate_test_use_mtime);
 	m_index_stripchars_init = 1;
     }
diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h
index 19a9381a..744fbb93 100644
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@@ -446,7 +446,7 @@ extern bool o_index_stripchars;
 // means that the text is not stripped of upper-case, diacritics, or
 // punctuation signs. It is still translated from its original format
 // to UTF-8 plain text.
-extern bool o_index_storerawtext;
+extern bool o_index_storedoctext;
 
 // This global variable defines if we use mtime instead of ctime for
 // up-to-date tests. This is mostly incompatible with xattr indexing,
diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp
index 81c1cf9c..b65eab51 100644
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@@ -242,9 +242,9 @@ int Query::Native::abstractFromText(
         return ABSRES_ERROR;
     }
 
-    // tryout the xapian internal method.
 #if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2)  && \
     (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE))
+    // Try out the Xapian internal method.
     string snippet = xmset.snippet(rawtext);
     LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
 #endif
diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp
index a24eb5b9..8b62a4b8 100644
--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@@ -649,7 +649,7 @@ int Query::Native::makeAbstract(Xapian::docid docid,
     LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
            maxtotaloccs << " ctxwords " << ctxwords << "\n");
 
-    if (o_index_storerawtext) {
+    if (o_index_storedoctext) {
         return abstractFromText(ndb, docid, matchedTerms, byQ,
                                 totalweight, ctxwords, maxtotaloccs, vabs,
                                 chron);
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 8f278447..6ba9f8aa 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -794,8 +794,10 @@ bool Db::open(OpenMode mode, OpenError *error)
 	    {
 		int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
 		    Xapian::DB_CREATE_OR_OVERWRITE;
-                if (::access(dir.c_str(), 0) != 0) {
-                    // New index. use a stub to force using Chert
+                if (!o_index_storedoctext && ::access(dir.c_str(), 0) != 0) {
+                    // New index. Use a stub to force using Chert. No
+                    // sense in doing this if we are storing the text
+                    // anyway.
                     string stub = path_cat(m_config->getConfDir(),
                                            "xapian.stub");
                     FILE *fp = fopen(stub.c_str(), "w");
@@ -1463,7 +1465,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 	    LOGDEB("Db::addOrUpdate: split failed for main text\n");
         } else {
 #ifdef RAWTEXT_IN_VALUE
-            if (o_index_storerawtext) {
+            if (o_index_storedoctext) {
                 ZLibUtBuf buf;
                 deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
                 string tt;
@@ -1683,7 +1685,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 	}
 
 #ifdef RAWTEXT_IN_DATA
-        if (o_index_storerawtext) {
+        if (o_index_storedoctext) {
             RECORD_APPEND(record, string("RAWTEXT"),
                           neutchars(doc.text, cstr_nc));
         }
diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf
index a14424e0..6816c078 100644
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@@ -231,7 +231,7 @@ membermaxkbs = 50000
 # implies an index reset.</descr></var>
 indexStripChars = 1
 
-# <var name="indexStoreRawText" type="bool"><brief>Decide if we store the
+# <var name="indexStoreDocText" type="bool"><brief>Decide if we store the
 # documents' text content in the index.</brief><descr>Storing the text
 # allows extracting snippets from it at query time, 
 # instead of building them from index position data. This Has become
@@ -244,7 +244,7 @@ indexStripChars = 1
 # but also allows for nicer snippets, so it may be worth enabling it even
 # if not strictly needed for performance if you can afford the space.
 # </desc></var>
-indexStoreRawText = 0
+indexStoreDocText = 0
 
 # <var name="nonumbers" type="bool"><brief>Decides if terms will be
 # generated for numbers.</brief><descr>For example "123", "1.5e6",
diff --git a/src/utils/hldata.h b/src/utils/hldata.h
index 93766d8a..d6886c34 100644
--- a/src/utils/hldata.h
+++ b/src/utils/hldata.h
@@ -33,7 +33,9 @@ struct HighlightData {
     std::vector<std::vector<std::string> > ugroups;
 
     /** Processed/expanded terms and groups. Used for looking for
-     * regions to highlight. Terms are just groups with 1 entry. All
+     * regions to highlight. A group can be a PHRASE or NEAR entry (we
+     * process everything as NEAR to keep things reasonably
+     * simple). Terms are just groups with 1 entry. All
      * terms are transformed to be compatible with index content
      * (unaccented and lowercased as needed depending on
      * configuration), and the list may include values
@@ -45,7 +47,9 @@ struct HighlightData {
 
     /** Index into ugroups for each group. Parallel to groups. As a
      * user term or group may generate many processed/expanded terms
-     * or groups, this is how we relate an expansion to its source.
+     * or groups, this is how we relate an expansion to its source
+     * (used, e.g. for generating anchors for walking search matches
+     * in the preview window).
      */
     std::vector<size_t> grpsugidx;