From bb810f9ceb3f9ecdddba7c1e2b4553a9a47e4796 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 2 Jan 2018 19:23:12 +0100 Subject: [PATCH] Changed new param name storerawtext->storedoctext. + comments --- src/common/rclconfig.cpp | 11 +++++------ src/common/rclconfig.h | 2 +- src/rcldb/rclabsfromtext.cpp | 2 +- src/rcldb/rclabstract.cpp | 2 +- src/rcldb/rcldb.cpp | 10 ++++++---- src/sampleconf/recoll.conf | 4 ++-- src/utils/hldata.h | 8 ++++++-- 7 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index e7a0234c..87651030 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -64,11 +64,10 @@ bool o_index_stripchars = true; // instead of building them from index position data. Has become // necessary for versions of Xapian 1.6, which have dropped support // for the chert index format, and adopted a setup which renders our -// use of positions list unacceptably slow in cases. 'raw' text here -// means that the text is not stripped of upper-case, diacritics, or -// punctuation signs. It is still translated from its original format -// to UTF-8 plain text. -bool o_index_storerawtext = false; +// use of positions list unacceptably slow in cases. The text is just +// translated from its original format to UTF-8 plain text, and is not +// stripped of upper-case, diacritics, or punctuation signs. 
+bool o_index_storedoctext = false; bool o_uptodate_test_use_mtime = false; @@ -401,7 +400,7 @@ bool RclConfig::updateMainConfig() static int m_index_stripchars_init = 0; if (!m_index_stripchars_init) { getConfParam("indexStripChars", &o_index_stripchars); - getConfParam("indexStoreRawText", &o_index_storerawtext); + getConfParam("indexStoreDocText", &o_index_storedoctext); getConfParam("testmodifusemtime", &o_uptodate_test_use_mtime); m_index_stripchars_init = 1; } diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 19a9381a..744fbb93 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -446,7 +446,7 @@ extern bool o_index_stripchars; // means that the text is not stripped of upper-case, diacritics, or // punctuation signs. It is still translated from its original format // to UTF-8 plain text. -extern bool o_index_storerawtext; +extern bool o_index_storedoctext; // This global variable defines if we use mtime instead of ctime for // up-to-date tests. This is mostly incompatible with xattr indexing, diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp index 81c1cf9c..b65eab51 100644 --- a/src/rcldb/rclabsfromtext.cpp +++ b/src/rcldb/rclabsfromtext.cpp @@ -242,9 +242,9 @@ int Query::Native::abstractFromText( return ABSRES_ERROR; } - // tryout the xapian internal method. #if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \ (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE)) + // Try out the Xapian internal method. 
string snippet = xmset.snippet(rawtext); LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n"); #endif diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index a24eb5b9..8b62a4b8 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -649,7 +649,7 @@ int Query::Native::makeAbstract(Xapian::docid docid, LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " << maxtotaloccs << " ctxwords " << ctxwords << "\n"); - if (o_index_storerawtext) { + if (o_index_storedoctext) { return abstractFromText(ndb, docid, matchedTerms, byQ, totalweight, ctxwords, maxtotaloccs, vabs, chron); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 8f278447..6ba9f8aa 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -794,8 +794,10 @@ bool Db::open(OpenMode mode, OpenError *error) { int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN : Xapian::DB_CREATE_OR_OVERWRITE; - if (::access(dir.c_str(), 0) != 0) { - // New index. use a stub to force using Chert + if (!o_index_storedoctext && ::access(dir.c_str(), 0) != 0) { + // New index. Use a stub to force using Chert. No + // sense in doing this if we are storing the text + // anyway. 
string stub = path_cat(m_config->getConfDir(), "xapian.stub"); FILE *fp = fopen(stub.c_str(), "w"); @@ -1463,7 +1465,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) LOGDEB("Db::addOrUpdate: split failed for main text\n"); } else { #ifdef RAWTEXT_IN_VALUE - if (o_index_storerawtext) { + if (o_index_storedoctext) { ZLibUtBuf buf; deflateToBuf(doc.text.c_str(), doc.text.size(), buf); string tt; @@ -1683,7 +1685,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) } #ifdef RAWTEXT_IN_DATA - if (o_index_storerawtext) { + if (o_index_storedoctext) { RECORD_APPEND(record, string("RAWTEXT"), neutchars(doc.text, cstr_nc)); } diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index a14424e0..6816c078 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -231,7 +231,7 @@ membermaxkbs = 50000 # implies an index reset. indexStripChars = 1 -# Decide if we store the +# Decide if we store the # documents' text content in the index.Storing the text # allows extracting snippets from it at query time, # instead of building them from index position data. This Has become @@ -244,7 +244,7 @@ indexStripChars = 1 # but also allows for nicer snippets, so it may be worth enabling it even # if not strictly needed for performance if you can afford the space. # -indexStoreRawText = 0 +indexStoreDocText = 0 # Decides if terms will be # generated for numbers.For example "123", "1.5e6", diff --git a/src/utils/hldata.h b/src/utils/hldata.h index 93766d8a..d6886c34 100644 --- a/src/utils/hldata.h +++ b/src/utils/hldata.h @@ -33,7 +33,9 @@ struct HighlightData { std::vector > ugroups; /** Processed/expanded terms and groups. Used for looking for - * regions to highlight. Terms are just groups with 1 entry. All + * regions to highlight. A group can be a PHRASE or NEAR entry (we + * process everything as NEAR to keep things reasonably + * simple). Terms are just groups with 1 entry. 
All * terms are transformed to be compatible with index content * (unaccented and lowercased as needed depending on * configuration), and the list may include values @@ -45,7 +47,9 @@ struct HighlightData { /** Index into ugroups for each group. Parallel to groups. As a * user term or group may generate many processed/expanded terms - * or groups, this is how we relate an expansion to its source. + * or groups, this is how we relate an expansion to its source + * (used, e.g. for generating anchors for walking search matches + * in the preview window). */ std::vector grpsugidx;