Snippets generation: add method for generating from doc stored text. Still needs refining, esp. for phrase/near

2017-12-30 08:43:14 +01:00 · 2017-12-30 08:43:14 +01:00 · b4493ed9e1
commit b4493ed9e1
parent 040f62f1d2
10 changed files with 762 additions and 303 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -154,6 +154,7 @@ rcldb/daterange.h \
 rcldb/expansiondbs.cpp \
 rcldb/expansiondbs.h \
 rcldb/rclabstract.cpp \
+rcldb/rclabsfromtext.cpp \
 rcldb/rcldb.cpp \
 rcldb/rcldb.h \
 rcldb/rcldb_p.h \
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -60,6 +60,16 @@ using namespace std;
 // We default to a case- and diacritics-less index for now
 bool o_index_stripchars = true;

+// Store document text in index. Allows extracting snippets from text
+// instead of building them from index position data. Has become
+// necessary for versions of Xapian 1.6, which have dropped support
+// for the chert index format, and adopted a setup which renders our
+// use of position lists unacceptably slow in some cases. 'raw' text here
+// means that the text is not stripped of upper-case, diacritics, or
+// punctuation signs. It is still translated from its original format
+// to UTF-8 plain text.
+bool o_index_storerawtext = false;
+
 bool o_uptodate_test_use_mtime = false;

 string RclConfig::o_localecharset; 
@ -391,6 +401,7 @@ bool RclConfig::updateMainConfig()
    static int m_index_stripchars_init = 0;
    if (!m_index_stripchars_init) {
 	getConfParam("indexStripChars", &o_index_stripchars);
+        getConfParam("indexStoreRawText", &o_index_storerawtext);
        getConfParam("testmodifusemtime", &o_uptodate_test_use_mtime);
 	m_index_stripchars_init = 1;
    }
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -438,6 +438,16 @@ class RclConfig {
 // reset. When using multiple indexes, all must have the same value
 extern bool o_index_stripchars;

+// Store document text in index. Allows extracting snippets from text
+// instead of building them from index position data. Has become
+// necessary for versions of Xapian 1.6, which have dropped support
+// for the chert index format, and adopted a setup which renders our
+// use of position lists unacceptably slow in some cases. 'raw' text here
+// means that the text is not stripped of upper-case, diacritics, or
+// punctuation signs. It is still translated from its original format
+// to UTF-8 plain text.
+extern bool o_index_storerawtext;
+
 // This global variable defines if we use mtime instead of ctime for
 // up-to-date tests. This is mostly incompatible with xattr indexing,
 // in addition to other issues. See recoll.conf comments. 
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@ -0,0 +1,298 @@
+/* Copyright (C) 2004-2017 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#include "autoconfig.h"
+
+#include <math.h>
+
+#include <map>
+#include <unordered_map>
+#include <deque>
+#include <algorithm>
+
+#include "log.h"
+#include "rcldb.h"
+#include "rcldb_p.h"
+#include "rclquery.h"
+#include "rclquery_p.h"
+#include "textsplit.h"
+#include "hldata.h"
+#include "chrono.h"
+#include "unacpp.h"
+#include "zlibut.h"
+
+using namespace std;
+
+
+namespace Rcl {
+
+#warning NEAR and PHRASE
+
+// Text splitter for finding the match terms in the doc text.
+class TextSplitABS : public TextSplit {
+public:
+    // One candidate snippet: a stretch of text around one or more hits.
+    struct MatchEntry {
+        // Start/End byte offsets of fragment in the document text
+        int start;
+        int stop;
+        double coef; // Sum of the coefs of the terms matched in the fragment
+        // Position of the first matched term.
+        unsigned int hitpos;
+        // "best term" for this match: the matched term with the highest coef
+        string term;
+        // Highlight areas (each is one or several contiguous match terms).
+        vector<pair<int,int>> hlzones;
+        // Note: the constructor empties hl and trm (contents are swapped out)
+        MatchEntry(int sta, int sto, double c, vector<pair<int,int>>& hl,
+                   unsigned int pos, string& trm) 
+            : start(sta), stop(sto), coef(c), hitpos(pos) {
+            hlzones.swap(hl);
+            term.swap(trm);
+        }
+    };
+
+    // ctxwords: number of words of context kept before and after a hit.
+    TextSplitABS(const vector<string>& matchTerms,
+                 unordered_map<string, double>& wordcoefs,
+                 unsigned int ctxwords,
+                 Flags flags = TXTS_NONE)
+        :  TextSplit(flags),  m_terms(matchTerms.begin(), matchTerms.end()),
+           m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
+        LOGDEB("TextSPlitABS: ctxwords " << ctxwords << endl);
+    }
+
+    // Accept a word and its position. If the word is a matched term,
+    // add/update fragment definition.
+    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
+        LOGDEB2("takeword: " << term << endl);
+
+        // Recent past: byte offsets of the last m_ctxwords+1 words
+        m_prevterms.push_back(pair<int,int>(bts,bte));
+        if (m_prevterms.size() > m_ctxwords+1) {
+            m_prevterms.pop_front();
+        }
+
+        string dumb; // Term in the form used for matching against m_terms
+        if (o_index_stripchars) {
+            if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
+                LOGINFO("abstract: unac failed for [" << term << "]\n");
+                return true;
+            }
+        } else {
+            dumb = term;
+        }
+
+        if (m_terms.find(dumb) != m_terms.end()) {
+            // This word is a search term. Extend or create fragment
+            LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
+                   ", " << m_curfrag.second << " remain " <<
+                   m_remainingWords << endl);
+            double coef = m_wordcoefs[dumb]; // (0.0 is inserted if absent)
+            if (!m_remainingWords) {
+                // No current fragment
+                m_curhitpos = baseTextPosition + pos;
+                m_curfrag.first = m_prevterms.front().first;
+                m_curfrag.second = m_prevterms.back().second;
+                m_curhlzones.push_back(pair<int,int>(bts, bte));
+                m_curterm = term;
+                m_curtermcoef = coef;
+            } else {
+                LOGDEB2("Extending current fragment: " << m_remainingWords <<
+                       " -> " << m_ctxwords << endl);
+                m_extcount++;
+                if (m_prevwordhit) {
+                    m_curhlzones.back().second = bte;
+                } else {
+                    m_curhlzones.push_back(pair<int,int>(bts, bte));
+                }
+                if (coef > m_curtermcoef) {
+                    m_curterm = term;
+                    m_curtermcoef = coef;
+                }
+            }
+            m_prevwordhit = true;
+            m_curfragcoef += coef;
+            m_remainingWords = m_ctxwords + 1;
+            if (m_extcount > 3) {
+                // Limit expansion of contiguous fragments (this is to
+                // avoid common terms in search causing long
+                // heavyweight meaningless fragments. Also, limit length).
+                m_remainingWords = 1;
+                m_extcount = 0;
+            }
+        } else {
+            m_prevwordhit = false;
+        }
+       
+        
+        if (m_remainingWords) {
+            // Fragment currently open. Time to close ?
+            m_remainingWords--;
+            m_curfrag.second = bte;
+            if (m_remainingWords == 0) {
+                if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
+                    // Don't push bad fragments if we have a lot already
+                    m_fragments.push_back(MatchEntry(m_curfrag.first,
+                                                     m_curfrag.second,
+                                                     m_curfragcoef,
+                                                     m_curhlzones,
+                                                     m_curhitpos,
+                                                     m_curterm
+                                              ));
+                }
+                m_totalcoef += m_curfragcoef;
+                m_curfragcoef = 0.0;
+                m_curtermcoef = 0.0;
+            }
+        }
+        return true;
+    }
+    const vector<MatchEntry>& getFragments() {
+        return m_fragments;
+    }
+    // NOTE(review): a fragment still open at end of text is never pushed.
+private:
+    // Past terms because we need to go back for context before a hit
+    deque<pair<int,int>>  m_prevterms;
+    // Data about the fragment we are building
+    pair<int,int> m_curfrag{0,0};
+    double m_curfragcoef{0.0};
+    unsigned int m_remainingWords{0}; // 0: no fragment currently open
+    unsigned int m_extcount{0}; // Count of extensions by further hits
+    vector<pair<int,int>> m_curhlzones;
+    bool m_prevwordhit{false};
+    // Current sum of fragment weights
+    double m_totalcoef{0.0};
+    // Position of 1st term match (for page number computations)
+    unsigned int m_curhitpos{0};
+    // "best" term
+    string m_curterm;
+    double m_curtermcoef{0.0};
+    
+    // Input
+    set<string> m_terms;
+    unordered_map<string, double>& m_wordcoefs;
+    unsigned int m_ctxwords;
+
+    // Result: begin and end byte positions of query terms/groups in text
+    vector<MatchEntry> m_fragments;  
+};
+
+// Build the snippets from the document text stored in the index, instead of
+// from the position lists. Appends to vabs. Returns ABSRES_OK or ABSRES_ERROR.
+int Query::Native::abstractFromText(
+    Rcl::Db::Native *ndb,
+    Xapian::docid docid,
+    const vector<string>& matchTerms,
+    const multimap<double, vector<string>> byQ, // quality coef -> term group
+    double totalweight,        // unused here
+    int ctxwords,              // words of context around each hit
+    unsigned int maxtotaloccs, // max number of snippets to output
+    vector<Snippet>& vabs,
+    Chrono&                    // unused here
+    )
+{
+    Xapian::Database& xrdb(ndb->xrdb);
+    Xapian::Document xdoc;
+
+    string reason;
+    XAPTRY(xdoc = xrdb.get_document(docid), xrdb, reason);
+    if (!reason.empty()) {
+        LOGERR("abstractFromText: could not get doc: " << reason << endl);
+        return ABSRES_ERROR;
+    }
+
+    string rawtext, data;
+#ifdef RAWTEXT_IN_DATA
+    XAPTRY(data = xdoc.get_data(), xrdb, reason);
+    if (!reason.empty()) {
+        LOGERR("abstractFromText: could not get data: " << reason << endl);
+        return ABSRES_ERROR;
+    }
+    Doc doc;
+    if (ndb->dbDataToRclDoc(docid, data, doc)) {
+        rawtext = doc.meta["RAWTEXT"];
+    }
+#endif
+#ifdef RAWTEXT_IN_VALUE
+    XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason);
+    if (!reason.empty()) {
+        LOGERR("abstractFromText: could not get value: " << reason << endl);
+        return ABSRES_ERROR;
+    }
+    ZLibUtBuf cbuf;
+    inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf);
+    rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
+#endif
+
+    if (rawtext.empty()) {
+        LOGDEB0("abstractFromText: no text\n");
+        return ABSRES_ERROR;
+    }
+
+    // tryout the xapian internal method.
+#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2)  && \
+    (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE))
+    string snippet = xmset.snippet(rawtext);
+    LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
+#endif
+
+    // We need the q coefs for individual terms
+    unordered_map<string, double> wordcoefs;
+    for (const auto& mment : byQ) {
+        for (const auto& word : mment.second) {
+            wordcoefs[word] = mment.first;
+        }
+    }
+    TextSplitABS splitter(matchTerms, wordcoefs, ctxwords,
+                          TextSplit::TXTS_ONLYSPANS);
+    splitter.text_to_words(rawtext);
+    vector<TextSplitABS::MatchEntry> result(splitter.getFragments());
+    std::sort(result.begin(), result.end(), // Best (highest coef) first
+              [](const TextSplitABS::MatchEntry& a,
+                 const TextSplitABS::MatchEntry& b) -> bool {
+                  return a.coef > b.coef;
+              });
+
+    static const string cstr_nc("\n\r\x0c\\");
+    vector<int> vpbreaks;
+    ndb->getPagePositions(docid, vpbreaks);
+    unsigned int count = 0; // Number of snippets output so far
+    for (const auto& entry : result) {
+        string frag = neutchars(
+            rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);
+#if 0
+        static const string starthit("<span style='color: blue;'>");
+        static const string endhit("</span>");
+        size_t inslen = 0;
+        for (const auto& hlzone: entry.hlzones) {
+            frag.replace(hlzone.first - entry.start + inslen, 0, starthit);
+            inslen += starthit.size();
+            frag.replace(hlzone.second - entry.start + inslen, 0, endhit);
+            inslen += endhit.size();
+        }
+#endif
+        LOGDEB("=== FRAGMENT: Coef: " << entry.coef << ": " << frag << endl);
+        int page = ndb->getPageNumberForPosition(vpbreaks, entry.hitpos);
+        vabs.push_back(Snippet(page, frag).setTerm(entry.term));
+        if (++count >= maxtotaloccs) // Stop at (not one past) the maximum
+            break;
+    }
+    return ABSRES_OK;
+}
+
+}
--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes
+/* Copyright (C) 2004-2017 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -19,6 +19,9 @@
 #include <math.h>

 #include <map>
+#include <unordered_map>
+#include <deque>
+#include <algorithm>

 #include "log.h"
 #include "rcldb.h"
@ -33,30 +36,22 @@

 using namespace std;

+
 namespace Rcl {
+
 // This is used as a marker inside the abstract frag lists, but
 // normally doesn't remain in final output (which is built with a
 // custom sep. by our caller).
 static const string cstr_ellipsis("...");
+static const string emptys;
 // This is used to mark positions overlapped by a multi-word match term
 static const string occupiedmarker("?");

-#undef DEBUGABSTRACT  
+#define DEBUGABSTRACT  
 #ifdef DEBUGABSTRACT
 #define LOGABS LOGDEB
-static void listList(const string& what, const vector<string>&l)
-{
-    string a;
-    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
-        a = a + *it + " ";
-    }
-    LOGDEB("" << what << ": " << a << "\n");
-}
 #else
 #define LOGABS LOGDEB2
-static void listList(const string&, const vector<string>&)
-{
-}
 #endif

 // Unprefix terms. Actually it's not completely clear if we should
@ -66,13 +61,12 @@ static void listList(const string&, const vector<string>&)
 static const bool prune_prefixed_terms = true; 
 static void noPrefixList(const vector<string>& in, vector<string>& out) 
 {
-    for (vector<string>::const_iterator qit = in.begin(); 
-         qit != in.end(); qit++) {
+    for (const auto& term : in) {
        if (prune_prefixed_terms) {
-            if (has_prefix(*qit))
+            if (has_prefix(term))
                continue;
        }
-        out.push_back(strip_prefix(*qit));
+        out.push_back(strip_prefix(term));
    }
    sort(out.begin(), out.end());
    vector<string>::iterator it = unique(out.begin(), out.end());
@ -117,18 +111,17 @@ void Query::Native::setDbWideQTermsFreqs()
        m_q->getQueryTerms(iqterms);
        noPrefixList(iqterms, qterms);
    }
-    // listList("Query terms: ", qterms);
+    LOGDEB("Query terms: " << stringsToString(qterms) << endl);
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;

    double doccnt = xrdb.get_doccount();
    if (doccnt == 0) 
        doccnt = 1;

-    for (vector<string>::const_iterator qit = qterms.begin(); 
-         qit != qterms.end(); qit++) {
-        termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
-        LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
-               termfreqs[*qit] << "\n");
+    for (const auto& term : qterms) {
+        termfreqs[term] = xrdb.get_termfreq(term) / doccnt;
+        LOGABS("setDbWideQTermFreqs: [" << term << "] db freq " <<
+               termfreqs[term] << "\n");
    }
 }

@ -162,36 +155,29 @@ double Query::Native::qualityTerms(Xapian::docid docid,
        m_q->m_sd->getTerms(hld);
    }

-#ifdef DEBUGABSTRACT
-    {
-        string deb;
-        hld.toString(deb);
-        LOGABS("qualityTerms: hld: " << deb << "\n");
-    }
-#endif
-
-    // Group the input terms by the user term they were possibly expanded from
+    // Group the input terms by the user term they were possibly
+    // expanded from (by stemming)
    map<string, vector<string> > byRoot;
-    for (vector<string>::const_iterator qit = terms.begin(); 
-         qit != terms.end(); qit++) {
-        map<string, string>::const_iterator eit = hld.terms.find(*qit);
+    for (const auto& term: terms) {
+        map<string, string>::const_iterator eit = hld.terms.find(term);
        if (eit != hld.terms.end()) {
-            byRoot[eit->second].push_back(*qit);
+            byRoot[eit->second].push_back(term);
        } else {
-            LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
-            byRoot[*qit].push_back(*qit);
+            LOGDEB0("qualityTerms: [" << term << "] not found in hld\n");
+            byRoot[term].push_back(term);
        }
    }

 #ifdef DEBUGABSTRACT
    {
+        string deb;
+        hld.toString(deb);
+        LOGABS("qualityTerms: hld: " << deb << "\n");
        string byRootstr;
-        for (map<string, vector<string> >::const_iterator debit = 
-                 byRoot.begin();  debit != byRoot.end(); debit++) {
-            byRootstr.append("[").append(debit->first).append("]->");
-            for (vector<string>::const_iterator it = debit->second.begin();
-                 it != debit->second.end(); it++) {
-                byRootstr.append("[").append(*it).append("] ");
+        for (const auto& entry : byRoot) {
+            byRootstr.append("[").append(entry.first).append("]->");
+            for (const auto& term : entry.second) {
+                byRootstr.append("[").append(term).append("] ");
            }
            byRootstr.append("\n");
        }
@ -202,28 +188,25 @@ double Query::Native::qualityTerms(Xapian::docid docid,
    // Compute in-document and global frequencies for the groups.
    map<string, double> grpwdfs;
    map<string, double> grptfreqs;
-    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
-         git != byRoot.end(); git++) {
-        for (vector<string>::const_iterator qit = git->second.begin(); 
-             qit != git->second.end(); qit++) {
-            Xapian::TermIterator term = xrdb.termlist_begin(docid);
-            term.skip_to(*qit);
-            if (term != xrdb.termlist_end(docid) && *term == *qit) {
-                if (grpwdfs.find(git->first) != grpwdfs.end()) {
-                    grpwdfs[git->first] = term.get_wdf() / doclen;
-                    grptfreqs[git->first] = termfreqs[*qit];
+    for (const auto& group : byRoot) {
+        for (const auto& term : group.second) {
+            Xapian::TermIterator xtermit = xrdb.termlist_begin(docid);
+            xtermit.skip_to(term);
+            if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) {
+                if (grpwdfs.find(group.first) != grpwdfs.end()) {
+                    grpwdfs[group.first] = xtermit.get_wdf() / doclen;
+                    grptfreqs[group.first] = termfreqs[term];
                } else {
-                    grpwdfs[git->first] += term.get_wdf() / doclen;
-                    grptfreqs[git->first] += termfreqs[*qit];
+                    grpwdfs[group.first] += xtermit.get_wdf() / doclen;
+                    grptfreqs[group.first] += termfreqs[term];
                }
            }
        }    
    }

    // Build a sorted by quality container for the groups
-    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
-         git != byRoot.end(); git++) {
-        double q = (grpwdfs[git->first]) * grptfreqs[git->first];
+    for (const auto& group : byRoot) {
+        double q = (grpwdfs[group.first]) * grptfreqs[group.first];
        q = -log10(q);
        if (q < 3) {
            q = 0.05;
@ -237,22 +220,19 @@ double Query::Native::qualityTerms(Xapian::docid docid,
            q = 1;
        }
        totalweight += q;
-        byQ.insert(pair<double, vector<string> >(q, git->second));
+        byQ.insert(pair<double, vector<string> >(q, group.second));
    }

 #ifdef DEBUGABSTRACT
-    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
-         mit != byQ.rend(); mit++) {
-        LOGABS("qualityTerms: group\n");
-        for (vector<string>::const_iterator qit = mit->second.begin();
-             qit != mit->second.end(); qit++) {
-            LOGABS("" << mit->first << "->[" << *qit << "]\n");
-        }
+    for (auto mit= byQ.rbegin(); mit != byQ.rend(); mit++) {
+        LOGABS("qualityTerms: coef: " << mit->first << " group: " <<
+               stringsToString(mit->second) << endl);
    }
 #endif
    return totalweight;
 }

+
 // Return page number for first match of "significant" term.
 int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
 {
@ -283,8 +263,7 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
    multimap<double, vector<string> > byQ;
    qualityTerms(docid, terms, byQ);

-    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
-         mit != byQ.rend(); mit++) {
+    for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {
            string qterm = *qit;
@ -307,55 +286,211 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
    return -1;
 }

-// Build a document abstract by extracting text chunks around the query terms
-// This uses the db termlists, not the original document.
-//
-// DatabaseModified and other general exceptions are catched and
-// possibly retried by our caller
-int Query::Native::makeAbstract(Xapian::docid docid,
-                                vector<Snippet>& vabs, 
-                                int imaxoccs, int ictxwords)
+// Creating the abstract from index position data: populate the sparse
+// array with the positions for a given query term, and mark the
+// neighboring positions.
+void Query::Native::abstractPopulateQTerm(
+    Xapian::Database& xrdb,
+    Xapian::docid docid,
+    const string& qterm,
+    int qtrmwrdcnt, // number of word positions covered by qterm
+    int ctxwords,
+    unsigned int maxgrpoccs,
+    unsigned int maxtotaloccs,
+    map<unsigned int, string>& sparseDoc,
+    unordered_set<unsigned int>& searchTermPositions,
+    unsigned int& maxpos, // in/out: highest query term position seen
+    unsigned int& totaloccs, // in/out: incremented for each position used
+    unsigned int& grpoccs, // in/out: incremented for each position used
+    int& ret // ABSRES_TRUNC is or'ed in when a maximum is reached
+    )
 {
-    Chrono chron;
-    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
-           imaxoccs << " ictxwords " << ictxwords << "\n");
+    Xapian::PositionIterator pos;

-    // The (unprefixed) terms matched by this document
-    vector<string> matchedTerms;
-    getMatchTerms(docid, matchedTerms);
-    if (matchedTerms.empty()) {
-        LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
-        return ABSRES_ERROR;
+    // Walk the position list for this term.
+    for (pos = xrdb.positionlist_begin(docid, qterm);
+         pos != xrdb.positionlist_end(docid, qterm); pos++) {
+        int ipos = *pos;
+        if (ipos < int(baseTextPosition)) // Not in text body
+            continue;
+        LOGABS("makeAbstract: [" << qterm << "] at pos " <<
+               ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
+               maxgrpoccs << "\n");
+
+        totaloccs++;
+        grpoccs++;
+
+        // Add adjacent slots to the set to populate at next
+        // step by inserting empty strings. Special provisions
+        // for adding ellipsis and for positions overlapped by
+        // the match term.
+        unsigned int sta = MAX(int(baseTextPosition), 
+                               ipos - ctxwords);
+        unsigned int sto = ipos + qtrmwrdcnt-1 + 
+            m_q->m_db->getAbsCtxLen(); // NOTE(review): not ctxwords like sta?
+        for (unsigned int ii = sta; ii <= sto;  ii++) {
+            if (ii == (unsigned int)ipos) {
+                sparseDoc[ii] = qterm;
+                searchTermPositions.insert(ii);
+                if (ii > maxpos)
+                    maxpos = ii;
+            } else if (ii > (unsigned int)ipos && 
+                       ii < (unsigned int)ipos + qtrmwrdcnt) {
+                // Position for another word of the multi-word term
+                sparseDoc[ii] = occupiedmarker;
+            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
+                // For an empty slot, the test above has a side
+                // effect of inserting an empty string which
+                // is what we want. Do it also if it was an ellipsis
+                sparseDoc[ii] = emptys;
+            }
+        }
+        // Add ellipsis at the end. This may be replaced later by
+        // an overlapping extract. Take care not to replace an
+        // empty string here, we really want an empty slot,
+        // use find()
+        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
+            sparseDoc[sto+1] = cstr_ellipsis;
+        }
+
+        // Group done ?
+        if (grpoccs >= maxgrpoccs) {
+            ret |= ABSRES_TRUNC;
+            LOGABS("Db::makeAbstract: max group occs cutoff\n");
+            break;
+        }
+        // Global done ?
+        if (totaloccs >= maxtotaloccs) {
+            ret |= ABSRES_TRUNC;
+            LOGABS("Db::makeAbstract: max occurrences cutoff\n");
+            break;
+        }
     }
+}

-    listList("Match terms: ", matchedTerms);
+// Creating the abstract from index position data: after the query
+// terms have been inserted at their place in the sparse array, and
+// the neighboring positions marked, populate the neighbours: for each
+// term in the document, walk its position list and populate slots
+// around the query terms. We arbitrarily truncate the list to avoid
+// taking forever. If we do cut off, the abstract may be inconsistent
+// (missing words, potentially altering meaning), which is bad.
+void Query::Native::abstractPopulateContextTerms(
+    Xapian::Database& xrdb,
+    Xapian::docid docid,
+    unsigned int maxpos, // positions beyond this one are not looked at
+    map<unsigned int, string>& sparseDoc,
+    int& ret // ABSRES_TERMMISS is or'ed in if the walk is cut off
+    )
+{
+    Xapian::TermIterator term;
+    int cutoff = m_q->m_snipMaxPosWalk; // Decremented per term AND position
+    for (term = xrdb.termlist_begin(docid);
+         term != xrdb.termlist_end(docid); term++) {
+        // Ignore prefixed terms
+        if (has_prefix(*term))
+            continue;
+        if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
+            ret |= ABSRES_TERMMISS;
+            LOGDEB0("makeAbstract: max term count cutoff " <<
+                    m_q->m_snipMaxPosWalk << "\n");
+            break;
+        }

-    // Retrieve the term frequencies for the query terms. This is
-    // actually computed only once for a query, and for all terms in
-    // the query (not only the matches for this doc)
-    setDbWideQTermsFreqs();
-
-    // Build a sorted by quality container for the match terms We are
-    // going to try and show text around the less common search terms.
-    // Terms issued from an original one by stem expansion are
-    // aggregated by the qualityTerms() routine.
-    multimap<double, vector<string> > byQ;
-    double totalweight = qualityTerms(docid, matchedTerms, byQ);
-    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
-    // This can't happen, but would crash us
-    if (totalweight == 0.0) {
-        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
-        return ABSRES_ERROR;
+        map<unsigned int, string>::iterator vit;
+        Xapian::PositionIterator pos;
+        for (pos = xrdb.positionlist_begin(docid, *term);
+             pos != xrdb.positionlist_end(docid, *term); pos++) {
+            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
+                ret |= ABSRES_TERMMISS;
+                LOGDEB0("makeAbstract: max term count cutoff " <<
+                        m_q->m_snipMaxPosWalk << "\n");
+                break;
+            }
+            // If we are beyond the max possible position, stop
+            // for this term
+            if (*pos > maxpos) {
+                break;
+            }
+            if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
+                // Don't replace a term: the terms list is in
+                // alphabetic order, and we may have several terms
+                // at the same position, we want to keep only the
+                // first one (ie: dockes and dockes@wanadoo.fr)
+                if (vit->second.empty()) {
+                    LOGDEB2("makeAbstract: populating: [" << *term <<
+                            "] at " << *pos << "\n");
+                    sparseDoc[*pos] = *term;
+                }
+            }
+        }
     }
+}

-    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
+// Creating the abstract from position data: final phase: extract the
+// snippets from the sparse array.
+void Query::Native::abstractCreateSnippetsVector(
+    Rcl::Db::Native *ndb,
+    map<unsigned int, string>& sparseDoc,
+    unordered_set<unsigned int>& searchTermPositions,
+    vector<int>& vpbreaks,
+    vector<Snippet>& vabs)
+{
+    vabs.clear(); // The output vector is rebuilt from scratch
+    string chunk; // Text of the snippet being assembled
+    bool incjk = false; // True if the previous slot began with a CJK char
+    int page = 0;
+    string term;
+
+    for (const auto& ent : sparseDoc) {
+        LOGDEB2("Abtract:output "<< ent.first <<" -> [" <<ent.second <<"]\n");
+        if (!occupiedmarker.compare(ent.second)) {
+            LOGDEB("Abstract: qtrm position not filled ??\n");
+            continue;
+        }
+        if (chunk.empty() && !vpbreaks.empty()) {
+            page =  ndb->getPageNumberForPosition(vpbreaks, ent.first);
+            if (page < 0) 
+                page = 0;
+            term.clear();
+        }
+        Utf8Iter uit(ent.second);
+        bool newcjk = false;
+        if (TextSplit::isCJK(*uit))
+            newcjk = true;
+        if (!incjk || (incjk && !newcjk))
+            chunk += " "; // No separator between two consecutive CJK slots
+        incjk = newcjk;
+        if (searchTermPositions.find(ent.first) != searchTermPositions.end())
+            term = ent.second; // Remember the query term for this snippet
+        if (ent.second == cstr_ellipsis) { // An ellipsis ends the snippet
+            vabs.push_back(Snippet(page, chunk).setTerm(term));
+            chunk.clear();
+        } else {
+            if (ent.second.compare(end_of_field_term) && 
+                ent.second.compare(start_of_field_term))
+                chunk += ent.second; // Field boundary markers are not output
+        }
+    }
+    if (!chunk.empty())
+        vabs.push_back(Snippet(page, chunk).setTerm(term));
+}
+
+// Creating the abstract from index position data: top level routine
+int Query::Native::abstractFromIndex(
+    Rcl::Db::Native *ndb,
+    Xapian::docid docid,
+    const vector<string>& matchTerms,
+    const multimap<double, vector<string>> byQ,
+    double totalweight,
+    int ctxwords,
+    unsigned int maxtotaloccs,
+    vector<Snippet>& vabs,
+    Chrono& chron
+    )
+{
    Xapian::Database& xrdb(ndb->xrdb);
-
-    ///////////////////
-    // For each of the query terms, ask xapian for its positions list
-    // in the document. For each position entry, insert it and its
-    // neighbours in the set of 'interesting' positions
-
+    int ret = ABSRES_OK;
    // The terms 'array' that we partially populate with the document
    // terms, at their positions around the search terms positions:
    map<unsigned int, string> sparseDoc;
@ -370,22 +505,12 @@ int Query::Native::makeAbstract(Xapian::docid docid,
    // Total number of occurences for all terms. We stop when we have too much
    unsigned int totaloccs = 0;

-    // Total number of slots we populate. The 7 is taken as
-    // average word size. It was a mistake to have the user max
-    // abstract size parameter in characters, we basically only deal
-    // with words. We used to limit the character size at the end, but
-    // this damaged our careful selection of terms
-    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
-        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
-    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
-    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
-           maxtotaloccs << " ctxwords " << ctxwords << "\n");
-
-    int ret = ABSRES_OK;
-
-    // Let's go populate
-    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
-         mit != byQ.rend(); mit++) {
+    // First pass to populate the sparse document: we walk the term
+    // groups, beginning with the better ones, and insert each term at
+    // its position. We also insert empty strings at the surrounding
+    // positions. These are markers showing where we should insert
+    // data during the next pass.
+    for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
        unsigned int maxgrpoccs;
        double q;
        if (byQ.size() == 1) {
@ -398,87 +523,30 @@ int Query::Native::makeAbstract(Xapian::docid docid,
        }
        unsigned int grpoccs = 0;

-        for (vector<string>::const_iterator qit = mit->second.begin();
-             qit != mit->second.end(); qit++) {
-
-            // Group done ?
+        // For each term in user term expansion group
+        for (const auto& qterm : mit->second) {
+            // Enough for this group ?
            if (grpoccs >= maxgrpoccs) 
                break;

-            string qterm = *qit;
-
            LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
                   " max grp occs (coef " << q << ")\n");

-            // The match term may span several words
+            // The match term may span several words (more than one position)
            int qtrmwrdcnt = 
                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

-            Xapian::PositionIterator pos;
+            // Populate positions for this query term.
            // There may be query terms not in this doc. This raises an
            // exception when requesting the position list, we catch it ??
            // Not clear how this can happen because we are walking the
            // match list returned by Xapian. Maybe something with the
            // fields?
-            string emptys;
            try {
-                for (pos = xrdb.positionlist_begin(docid, qterm);
-                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
-                    int ipos = *pos;
-                    if (ipos < int(baseTextPosition)) // Not in text body
-                        continue;
-                    LOGABS("makeAbstract: [" << qterm << "] at pos " <<
-                           ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
-                           maxgrpoccs << "\n");
-
-                    totaloccs++;
-                    grpoccs++;
-
-                    // Add adjacent slots to the set to populate at next
-                    // step by inserting empty strings. Special provisions
-                    // for adding ellipsis and for positions overlapped by
-                    // the match term.
-                    unsigned int sta = MAX(int(baseTextPosition), 
-                                           ipos - ctxwords);
-                    unsigned int sto = ipos + qtrmwrdcnt-1 + 
-                        m_q->m_db->getAbsCtxLen();
-                    for (unsigned int ii = sta; ii <= sto;  ii++) {
-                        if (ii == (unsigned int)ipos) {
-                            sparseDoc[ii] = qterm;
-                            searchTermPositions.insert(ii);
-                            if (ii > maxpos)
-                                maxpos = ii;
-                        } else if (ii > (unsigned int)ipos && 
-                                   ii < (unsigned int)ipos + qtrmwrdcnt) {
-                            sparseDoc[ii] = occupiedmarker;
-                        } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
-                            // For an empty slot, the test has a side
-                            // effect of inserting an empty string which
-                            // is what we want.
-                            sparseDoc[ii] = emptys;
-                        }
-                    }
-                    // Add ellipsis at the end. This may be replaced later by
-                    // an overlapping extract. Take care not to replace an
-                    // empty string here, we really want an empty slot,
-                    // use find()
-                    if (sparseDoc.find(sto+1) == sparseDoc.end()) {
-                        sparseDoc[sto+1] = cstr_ellipsis;
-                    }
-
-                    // Group done ?
-                    if (grpoccs >= maxgrpoccs) {
-                        ret |= ABSRES_TRUNC;
-                        LOGABS("Db::makeAbstract: max group occs cutoff\n");
-                        break;
-                    }
-                    // Global done ?
-                    if (totaloccs >= maxtotaloccs) {
-                        ret |= ABSRES_TRUNC;
-                        LOGABS("Db::makeAbstract: max occurrences cutoff\n");
-                        break;
-                    }
-                }
+                abstractPopulateQTerm(xrdb, docid, qterm, qtrmwrdcnt, ctxwords,
+                                      maxgrpoccs,maxtotaloccs, sparseDoc,
+                                      searchTermPositions, maxpos, totaloccs,
+                                      grpoccs, ret);
            } catch (...) {
                // Term does not occur. No problem.
            }
@ -494,6 +562,7 @@ int Query::Native::makeAbstract(Xapian::docid docid,

    LOGABS("makeAbstract:" << chron.millis() <<
           "mS:chosen number of positions " << totaloccs << "\n");
+
    // This can happen if there are term occurences in the keywords
    // etc. but not elsewhere ?
    if (totaloccs == 0) {
@ -501,124 +570,95 @@ int Query::Native::makeAbstract(Xapian::docid docid,
        return ABSRES_OK;
    }

-    // Walk all document's terms position lists and populate slots
-    // around the query terms. We arbitrarily truncate the list to
-    // avoid taking forever. If we do cutoff, the abstract may be
-    // inconsistant (missing words, potentially altering meaning),
-    // which is bad. 
-    { 
-        Xapian::TermIterator term;
-        int cutoff = m_q->m_snipMaxPosWalk;
-        for (term = xrdb.termlist_begin(docid);
-             term != xrdb.termlist_end(docid); term++) {
-            // Ignore prefixed terms
-            if (has_prefix(*term))
-                continue;
-            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
-                ret |= ABSRES_TERMMISS;
-                LOGDEB0("makeAbstract: max term count cutoff " <<
-                        m_q->m_snipMaxPosWalk << "\n");
-                break;
-            }
-
-            map<unsigned int, string>::iterator vit;
-            Xapian::PositionIterator pos;
-            for (pos = xrdb.positionlist_begin(docid, *term);
-                 pos != xrdb.positionlist_end(docid, *term); pos++) {
-                if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
-                    ret |= ABSRES_TERMMISS;
-                    LOGDEB0("makeAbstract: max term count cutoff " <<
-                            m_q->m_snipMaxPosWalk << "\n");
-                    break;
-                }
-                // If we are beyond the max possible position, stop
-                // for this term
-                if (*pos > maxpos) {
-                    break;
-                }
-                if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
-                    // Don't replace a term: the terms list is in
-                    // alphabetic order, and we may have several terms
-                    // at the same position, we want to keep only the
-                    // first one (ie: dockes and dockes@wanadoo.fr)
-                    if (vit->second.empty()) {
-                        LOGDEB2("makeAbstract: populating: [" << *term <<
-                                "] at " << *pos << "\n");
-                        sparseDoc[*pos] = *term;
-                    }
-                }
-            }
-        }
-    }
+    abstractPopulateContextTerms(xrdb, docid, maxpos, sparseDoc, ret);
+    
    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");

-#if 0
-    // Debug only: output the full term[position] vector
-    bool epty = false;
-    int ipos = 0;
-    for (map<unsigned int, string>::iterator it = sparseDoc.begin(); 
-         it != sparseDoc.end();
-         it++, ipos++) {
-        if (it->empty()) {
-            if (!epty)
-                LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
-            epty=true;
-        } else {
-            epty = false;
-            LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
-        }
-    }
-#endif
-
    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);

    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
           vpbreaks.size() << " pages\n");
-    // Finally build the abstract by walking the map (in order of position)
-    vabs.clear();
-    string chunk;
-    bool incjk = false;
-    int page = 0;
-    string term;
-    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
-         it != sparseDoc.end(); it++) {
-        LOGDEB2("Abtract:output " << it->first << " -> [" << it->second <<
-                "]\n");
-        if (!occupiedmarker.compare(it->second)) {
-            LOGDEB("Abstract: qtrm position not filled ??\n");
-            continue;
-        }
-        if (chunk.empty() && !vpbreaks.empty()) {
-            page =  ndb->getPageNumberForPosition(vpbreaks, it->first);
-            if (page < 0) 
-                page = 0;
-            term.clear();
-        }
-        Utf8Iter uit(it->second);
-        bool newcjk = false;
-        if (TextSplit::isCJK(*uit))
-            newcjk = true;
-        if (!incjk || (incjk && !newcjk))
-            chunk += " ";
-        incjk = newcjk;
-        if (searchTermPositions.find(it->first) != searchTermPositions.end())
-            term = it->second;
-        if (it->second == cstr_ellipsis) {
-            vabs.push_back(Snippet(page, chunk).setTerm(term));
-            chunk.clear();
-        } else {
-            if (it->second.compare(end_of_field_term) && 
-                it->second.compare(start_of_field_term))
-                chunk += it->second;
-        }
-    }
-    if (!chunk.empty())
-        vabs.push_back(Snippet(page, chunk).setTerm(term));

+    // Finally build the abstract by walking the map (in order of position)
+    abstractCreateSnippetsVector(ndb, sparseDoc, searchTermPositions,
+                                 vpbreaks, vabs);
+    
    LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
    return ret;
 }


+// Build a document abstract by extracting text chunks around the
+// query terms.  This can either use the index position lists, or the
+// stored document text, with very different implementations.
+//
+// DatabaseModified and other general exceptions are caught and
+// possibly retried by our caller.
+//
+// @param[out] vabs the abstract is returned as a vector of snippets.
+int Query::Native::makeAbstract(Xapian::docid docid,
+                                vector<Snippet>& vabs, 
+                                int imaxoccs, int ictxwords)
+{
+    Chrono chron;
+    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
+           imaxoccs << " ictxwords " << ictxwords << "\n");
+
+    // The (unprefixed) terms matched by this document
+    vector<string> matchedTerms;
+    getMatchTerms(docid, matchedTerms);
+    if (matchedTerms.empty()) {
+        LOGDEB("makeAbstract:" << chron.millis() << "mS:Empty term list\n");
+        return ABSRES_ERROR;
+    }
+
+    LOGDEB("Match terms: " << stringsToString(matchedTerms) << endl);
+
+    // Retrieve the term frequencies for the query terms. This is
+    // actually computed only once for a query, and for all terms in
+    // the query (not only the matches for this doc)
+    setDbWideQTermsFreqs();
+
+    // Build a sorted by quality container for the match terms. We are
+    // going to try and show text around the less common search terms.
+    // Terms issued from an original one by stem expansion are
+    // aggregated by the qualityTerms() routine (this is what we call
+    // 'term groups' in the following: index terms expanded from the
+    // same user term).
+    multimap<double, vector<string>> byQ;
+    double totalweight = qualityTerms(docid, matchedTerms, byQ);
+    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
+        return ABSRES_ERROR;
+    }
+
+    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
+    Xapian::Database& xrdb(ndb->xrdb);
+
+    // Total number of slots we populate. The 7 is taken as
+    // average word size. It was a mistake to have the user max
+    // abstract size parameter in characters, we basically only deal
+    // with words. We used to limit the character size at the end, but
+    // this damaged our careful selection of terms
+    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
+        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
+    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
+    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
+           maxtotaloccs << " ctxwords " << ctxwords << "\n");
+
+    if (o_index_storerawtext) {
+        return abstractFromText(ndb, docid, matchedTerms, byQ,
+                                totalweight, ctxwords, maxtotaloccs, vabs,
+                                chron);
+    } else {
+        return abstractFromIndex(ndb, docid, matchedTerms, byQ,
+                                 totalweight, ctxwords, maxtotaloccs, vabs,
+                                 chron);
+    }
+}
+
+
 }
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -61,6 +61,7 @@ using namespace std;
 #ifdef RCL_USE_ASPELL
 #include "rclaspell.h"
 #endif
+#include "zlibut.h"

 // Recoll index format version is stored in user metadata. When this change,
 // we can't open the db and will have to reindex.
@ -1458,8 +1459,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 #ifdef TEXTSPLIT_STATS
 	splitter.resetStats();
 #endif
-	if (!splitter.text_to_words(doc.text))
+	if (!splitter.text_to_words(doc.text)) {
 	    LOGDEB("Db::addOrUpdate: split failed for main text\n");
+        } else {
+#ifdef RAWTEXT_IN_VALUE
+            if (o_index_storerawtext) {
+                ZLibUtBuf buf;
+                deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
+                string tt;
+                tt.assign(buf.getBuf(), buf.getCnt());
+                newdocument.add_value(VALUE_RAWTEXT, tt);
+            }
+#endif
+        }

 #ifdef TEXTSPLIT_STATS
 	// Reject bad data. unrecognized base64 text is characterized by
@ -1670,6 +1682,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 	    newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
 	}

+#ifdef RAWTEXT_IN_DATA
+        if (o_index_storerawtext) {
+            RECORD_APPEND(record, string("RAWTEXT"),
+                          neutchars(doc.text, cstr_nc));
+        }
+#endif
 	LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
 	newdocument.set_data(record);
    }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -67,8 +67,14 @@ enum value_slot {
    VALUE_MD5 = 1,	// 16 byte MD5 checksum of original document.
    VALUE_SIZE = 2,     // sortable_serialise(<file size in bytes>)

-    // Recoll only:
-    VALUE_SIG = 10      // Doc sig as chosen by app (ex: mtime+size
+    ////////// Recoll only:
+    // Doc sig as chosen by app (ex: mtime+size)
+    VALUE_SIG = 10,
+    // Doc extracted text, with punctuation: splitter input. Used for
+    // generating snippets. This is only used if RAWTEXT_IN_VALUE is
+    // defined (else the text goes to the data record), but reserve
+    // the value in any case.
+    VALUE_RAWTEXT= 11,  
 };

 class SearchData;
--- a/src/rcldb/rcldb_p.h
+++ b/src/rcldb/rcldb_p.h
@ -177,5 +177,12 @@ class Db::Native {
 // (abstract, keywords, etc.. are stored before this)
 static const unsigned int baseTextPosition = 100000;

+// Store raw doc text in data record or value slot ?
+#if 0
+#define RAWTEXT_IN_DATA 1
+#elif 1
+#define RAWTEXT_IN_VALUE 1
+#endif
+
 }
 #endif /* _rcldb_p_h_included_ */
--- a/src/rcldb/rclquery_p.h
+++ b/src/rcldb/rclquery_p.h
@ -20,10 +20,13 @@
 #include <map>
 #include <vector>
 #include <string>
+#include <unordered_set>

 #include <xapian.h>
 #include "rclquery.h"

+class Chrono;
+
 namespace Rcl {

 class Query::Native {
@ -58,6 +61,56 @@ public:
    double qualityTerms(Xapian::docid docid, 
                        const std::vector<std::string>& terms,
                        std::multimap<double, std::vector<std::string> >& byQ);
+    void abstractPopulateQTerm(
+        Xapian::Database& xrdb,
+        Xapian::docid docid,
+        const string& qterm,
+        int qtrmwrdcnt,
+        int ctxwords,
+        unsigned int maxgrpoccs,
+        unsigned int maxtotaloccs,
+        std::map<unsigned int, std::string>& sparseDoc,
+        std::unordered_set<unsigned int>& searchTermPositions,
+        unsigned int& maxpos,
+        unsigned int& totaloccs,
+        unsigned int& grpoccs,
+        int& ret
+        );
+    void abstractPopulateContextTerms(
+        Xapian::Database& xrdb,
+        Xapian::docid docid,
+        unsigned int maxpos,
+        std::map<unsigned int, std::string>& sparseDoc,
+        int& ret
+        );
+    void abstractCreateSnippetsVector(
+        Db::Native *ndb,
+        std::map<unsigned int, std::string>& sparseDoc,
+        std::unordered_set<unsigned int>& searchTermPositions,
+        std::vector<int>& vpbreaks,
+        std::vector<Snippet>& vabs);
+    int abstractFromIndex(
+        Rcl::Db::Native *ndb,
+        Xapian::docid docid,
+        const std::vector<std::string>& matchTerms,
+        const std::multimap<double, std::vector<std::string>> byQ,
+        double totalweight,
+        int ctxwords,
+        unsigned int maxtotaloccs,
+        std::vector<Snippet>& vabs,
+        Chrono& chron
+        );
+    int abstractFromText(
+        Rcl::Db::Native *ndb,
+        Xapian::docid docid,
+        const std::vector<std::string>& matchTerms,
+        const std::multimap<double, std::vector<std::string>> byQ,
+        double totalweight,
+        int ctxwords,
+        unsigned int maxtotaloccs,
+        vector<Snippet>& vabs,
+        Chrono& chron
+        );
 };

 }
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@ -214,7 +214,7 @@ membermaxkbs = 50000


 # <grouptitle id="TERMS">Parameters affecting how we generate
-# terms</grouptitle> 
+# terms and organize the index</grouptitle> 

 # Changing some of these parameters will imply a full
 # reindex. Also, when using multiple indexes, it may not make sense
@ -231,6 +231,21 @@ membermaxkbs = 50000
 # implies an index reset.</descr></var>
 indexStripChars = 1

+# <var name="indexStoreRawText" type="bool"><brief>Decide if we store the
+# documents' text content in the index.</brief><descr>Storing the text
+# allows extracting snippets from it at query time, 
+# instead of building them from index position data. This has become
+# necessary for versions of Xapian 1.6, which have dropped support
+# for the chert index format, and adopted a setup which renders our
+# use of positions list unacceptably slow in cases. 'raw' text here
+# means that the text is not stripped of upper-case, diacritics, or
+# punctuation signs. It is still translated from its original format
+# to UTF-8 plain text. This increases the index size by 10-20% typically,
+# but also allows for nicer snippets, so it may be worth enabling it even
+# if not strictly needed for performance if you can afford the space.
+# </descr></var>
+indexStoreRawText = 0
+
 # <var name="nonumbers" type="bool"><brief>Decides if terms will be
 # generated for numbers.</brief><descr>For example "123", "1.5e6",
 # 192.168.1.4, would not be indexed if nonumbers is set ("value123" would