From aa2f0bfd73b71fbdc32e2383033ee62f01033f0a Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Fri, 15 Jan 2021 12:04:06 +0100
Subject: [PATCH] implemented multi-word terms indexing for phrase/prox search
 on multiword synonyms

---
 src/common/rclconfig.cpp    |   4 +-
 src/common/rclconfig.h      |  70 ++++++++--------
 src/common/syngroups.cpp    | 155 +++++++++++++++++++++++-------------
 src/common/syngroups.h      |  10 ++-
 src/rcldb/rcldb.cpp         |  43 +++++-----
 src/rcldb/rcldb.h           |  51 ++++++------
 src/rcldb/rclterms.cpp      |   2 +-
 src/rcldb/searchdatatox.cpp |  50 ++++++------
 src/rcldb/termproc.h        | 120 +++++++++++++++++-----------
 src/sampleconf/recoll.conf  |   8 ++
 10 files changed, 296 insertions(+), 217 deletions(-)
diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index 30f88255..f2f3f282 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
     return getConfdirPath("stoplistfile", "stoplist.txt");
 }
 
-string RclConfig::getSynGroupsFile() const
+string RclConfig::getIdxSynGroupsFile() const
 {
-    return getConfdirPath("syngroupsfile", "syngroups.txt");
+    return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
 }
 
 // The index status file is fast changing, so it's possible to put it outside
diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h
index 7e7bb22b..cf00d094 100644
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@@ -72,8 +72,8 @@ private:
 
 // Hold the description for an external metadata-gathering command
 struct MDReaper {
-  string fieldname;
-  vector<string> cmdv;
+    string fieldname;
+    vector<string> cmdv;
 };
 
 // Data associated to a indexed field name: 
@@ -90,7 +90,7 @@ struct FieldTraits {
 };
 
 class RclConfig {
- public:
+public:
 
     // Constructor: we normally look for a configuration file, except
     // if this was specified on the command line and passed through
@@ -100,7 +100,7 @@ class RclConfig {
     RclConfig(const RclConfig &r);
 
     ~RclConfig() {
-    freeAll();
+        freeAll();
     }
 
     // Return a writable clone of the main config. This belongs to the
@@ -133,18 +133,16 @@ class RclConfig {
     string getKeyDir() const {return m_keydir;}
 
     /** Get generic configuration parameter according to current keydir */
-    bool getConfParam(const string &name, string &value, 
-                      bool shallow=false) const
-    {
-    if (m_conf == 0)
-        return false;
-    return m_conf->get(name, value, m_keydir, shallow);
+    bool getConfParam(const string& name, string& value, 
+                      bool shallow=false) const {
+            if (m_conf == 0)
+                return false;
+            return m_conf->get(name, value, m_keydir, shallow);
     }
     /** Variant with autoconversion to int */
     bool getConfParam(const string &name, int *value, bool shallow=false) const;
     /** Variant with autoconversion to bool */
-    bool getConfParam(const string &name, bool *value, 
-                      bool shallow=false) const;
+    bool getConfParam(const string &name, bool *value, bool shallow=false) const;
     /** Variant with conversion to vector<string>
      *  (stringToStrings). Can fail if the string is malformed. */
     bool getConfParam(const string &name, vector<string> *value, 
@@ -164,18 +162,15 @@ class RclConfig {
      * Get list of config names under current sk, with possible 
      * wildcard filtering 
      */
-    vector<string> getConfNames(const char *pattern = 0) const
-    {
-    return m_conf->getNames(m_keydir, pattern);
+    vector<string> getConfNames(const char *pattern = 0) const {
+        return m_conf->getNames(m_keydir, pattern);
     }
 
     /** Check if name exists anywhere in config */
-    bool hasNameAnywhere(const string& nm) const
-    {
+    bool hasNameAnywhere(const string& nm) const {
         return m_conf? m_conf->hasNameAnywhere(nm) : false;
     }
 
-
     /** Get default charset for current keydir (was set during setKeydir) 
      * filenames are handled differently */
     const string &getDefCharset(bool filename = false) const;
@@ -198,7 +193,7 @@ class RclConfig {
     /** Get stoplist file name */
     string getStopfile() const;
     /** Get synonym groups file name */
-    string getSynGroupsFile() const;
+    string getIdxSynGroupsFile() const;
     /** Get indexing pid file name */
     string getPidfile() const;
     /** Get indexing status file name */
@@ -207,7 +202,7 @@ class RclConfig {
     /** Do path translation according to the ptrans table */
     void urlrewrite(const string& dbdir, string& url) const;
     ConfSimple *getPTrans() {
-    return m_ptrans;
+        return m_ptrans;
     }
     /** Get Web Queue directory name */
     string getWebQueueDir() const;
@@ -215,13 +210,13 @@ class RclConfig {
     /** Get list of skipped file names for current keydir */
     vector<string>& getSkippedNames();
     /** Get list of file name filters for current keydir (only those
-       names indexed) */
+        names indexed) */
     vector<string>& getOnlyNames();
 
     /** Get list of skipped paths patterns. Doesn't depend on the keydir */
     vector<string> getSkippedPaths() const;
     /** Get list of skipped paths patterns, daemon version (may add some)
-    Doesn't depend on the keydir */
+        Doesn't depend on the keydir */
     vector<string> getDaemSkippedPaths() const;
 
     /** Return list of no content suffixes. Used by confgui, indexing uses
@@ -260,7 +255,7 @@ class RclConfig {
      * @param whole the raw value. No way to escape a semi-colon in there.
      */
     static bool valueSplitAttributes(const string& whole, string& value, 
-                     ConfSimple& attrs) ;
+                                     ConfSimple& attrs) ;
 
     /** Compute difference between 'base' and 'changed', as elements to be
      * added and substracted from base. Input and output strings are in
@@ -288,9 +283,9 @@ class RclConfig {
     bool getGuiFilter(const string& filtername, string& frag) const;
 
     /** fields: get field prefix from field name. Use additional query
-       aliases if isquery is set */
+        aliases if isquery is set */
     bool getFieldTraits(const string& fldname, const FieldTraits **,
-        bool isquery = false) const;
+                        bool isquery = false) const;
 
     const set<string>& getStoredFields() const {return m_storedFields;}
 
@@ -311,11 +306,11 @@ class RclConfig {
      */
     vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
     bool getFieldConfParam(const string &name, const string &sk, string &value)
-    const;
+        const;
 
     /** mimeview: get/set external viewer exec string(s) for mimetype(s) */
     string getMimeViewerDef(const string &mimetype, const string& apptag, 
-                bool useall) const;
+                            bool useall) const;
     set<string> getMimeViewerAllEx() const;
     bool setMimeViewerAllEx(const set<string>& allex);
     bool getMimeViewerDefs(vector<pair<string, string> >&) const;
@@ -358,26 +353,25 @@ class RclConfig {
     string findFilter(const string& cmd) const;
 
     /** Thread config init is not done automatically because not all
-    programs need it and it uses the debug log so that it's better to
-    call it after primary init */
+        programs need it and it uses the debug log so that it's better to
+        call it after primary init */
     void initThrConf();
 
-    const string& getOrigCwd() 
-    {
-    return o_origcwd;
+    const string& getOrigCwd() {
+        return o_origcwd;
     }
 
     RclConfig& operator=(const RclConfig &r) {
-    if (this != &r) {
-        freeAll();
-        initFrom(r);
-    }
-    return *this;
+        if (this != &r) {
+            freeAll();
+            initFrom(r);
+        }
+        return *this;
     }
 
     friend class ParamStale;
 
- private:
+private:
     int m_ok;
     string m_reason;    // Explanation for bad state
     string m_confdir;   // User directory where the customized files are stored
diff --git a/src/common/syngroups.cpp b/src/common/syngroups.cpp
index 5bd33f2b..3b95022a 100644
--- a/src/common/syngroups.cpp
+++ b/src/common/syngroups.cpp
@@ -44,8 +44,7 @@ using namespace std;
 // groups anyway
 class SynGroups::Internal {
 public:
-    Internal() : ok(false) {
-    }
+    Internal() {}
     void setpath(const string& fn) {
         path = path_canon(fn);
         stat(path.c_str(), &st);
@@ -61,16 +60,22 @@ public:
         }
         return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
     }
-    bool ok;
+    bool ok{false};
     // Term to group num 
     std::unordered_map<string, unsigned int> terms;
     // Group num to group
     vector<vector<string> > groups;
+
+    // Aux: set of multiword synonyms used for generating multiword
+    // terms while indexing
+    std::set<std::string> multiwords;
+    size_t multiwords_maxlen{0};
+    
     std::string path;
     struct stat st;
 };
 
-bool SynGroups::ok() 
+bool SynGroups::ok() const
 {
     return m && m->ok;
 }
@@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn)
     if (fn.empty()) {
         delete m;
         m = 0;
-    return true;
+        return true;
     }
 
     if (m->samefile(fn)) {
@@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn)
     ifstream input;
     input.open(fn.c_str(), ios::in);
     if (!input.is_open()) {
-    LOGSYSERR("SynGroups:setfile", "open", fn);
-    return false;
+        LOGSYSERR("SynGroups:setfile", "open", fn);
+        return false;
     }        
 
     string cline;
@@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn)
     string line;
     bool eof = false;
     int lnum = 0;
-
+    m->groups.clear();
+    m->terms.clear();
+    m->multiwords.clear();
+    m->multiwords_maxlen = 0;
     for (;;) {
         cline.clear();
-    getline(input, cline);
-    if (!input.good()) {
-        if (input.bad()) {
+        getline(input, cline);
+        if (!input.good()) {
+            if (input.bad()) {
                 LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
-        return false;
-        }
-        // Must be eof ? But maybe we have a partial line which
-        // must be processed. This happens if the last line before
-        // eof ends with a backslash, or there is no final \n
+                return false;
+            }
+            // Must be eof ? But maybe we have a partial line which
+            // must be processed. This happens if the last line before
+            // eof ends with a backslash, or there is no final \n
             eof = true;
-    }
-    lnum++;
+        }
+        lnum++;
 
         {
             string::size_type pos = cline.find_last_not_of("\n\r");
@@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn)
             }
         }
 
-    if (appending)
-        line += cline;
-    else
-        line = cline;
+        if (appending)
+            line += cline;
+        else
+            line = cline;
 
-    // Note that we trim whitespace before checking for backslash-eol
-    // This avoids invisible whitespace problems.
-    trimstring(line);
-    if (line.empty() || line.at(0) == '#') {
+        // Note that we trim whitespace before checking for backslash-eol
+        // This avoids invisible whitespace problems.
+        trimstring(line);
+        if (line.empty() || line.at(0) == '#') {
             if (eof)
                 break;
-        continue;
-    }
-    if (line[line.length() - 1] == '\\') {
-        line.erase(line.length() - 1);
-        appending = true;
-        continue;
-    }
-    appending = false;
+            continue;
+        }
+        if (line[line.length() - 1] == '\\') {
+            line.erase(line.length() - 1);
+            appending = true;
+            continue;
+        }
+        appending = false;
 
-    vector<string> words;
-    if (!stringToStrings(line, words)) {
-        LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
+        vector<string> words;
+        if (!stringToStrings(line, words)) {
+            LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
                    ": " << line << "\n");
-        continue;
-    }
+            continue;
+        }
 
-    if (words.empty())
-        continue;
-    if (words.size() == 1) {
-        LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
+        if (words.empty())
+            continue;
+        if (words.size() == 1) {
+            LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
                    << lnum << " ??\n");
-        continue;
-    }
+            continue;
+        }
 
-    m->groups.push_back(words);
-    for (const auto& word : words) {
-        m->terms[word] = m->groups.size()-1;
-    }
-    LOGDEB1("SynGroups::setfile: group: [" <<
+        m->groups.push_back(words);
+        for (const auto& word : words) {
+            m->terms[word] = m->groups.size()-1;
+        }
+        LOGDEB1("SynGroups::setfile: group: [" <<
                 stringsToString(m->groups.back()) << "]\n");
     }
-    LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
-           " distinct terms." << endl);
+
+    for (const auto& group : m->groups) {
+        for (const auto& term : group) {
+            std::vector<std::string> words;
+            stringToTokens(term, words);
+            if (words.size() > 1) {
+                std::string multiword;
+                for (const auto& word : words) {
+                    if (!multiword.empty()) {
+                        multiword += " ";
+                    }
+                    multiword += word;
+                }
+                m->multiwords.insert(multiword);
+                if (m->multiwords_maxlen < words.size()) {
+                    m->multiwords_maxlen = words.size();
+                }
+            }
+        }
+    }
+    LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
+           "Multiwords: " << stringsToString(m->multiwords) <<"\n");
     m->ok = true;
     m->setpath(fn);
     return true;
 }
 
-vector<string> SynGroups::getgroup(const string& term)
+vector<string> SynGroups::getgroup(const string& term) const
 {
     vector<string> ret;
     if (!ok())
-    return ret;
+        return ret;
 
     const auto it1 = m->terms.find(term);
     if (it1 == m->terms.end()) {
-    LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
-    return ret;
+        LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
+        return ret;
     }
 
     unsigned int idx = it1->second;
@@ -215,3 +243,25 @@ vector<string> SynGroups::getgroup(const string& term)
             << endl);
     return m->groups[idx];
 }
+
+// Set of multiword synonyms (words joined by single spaces). Empty when
+// no synonyms file is set: setfile("") leaves m null.
+const std::set<std::string>& SynGroups::getmultiwords() const
+{
+    static const std::set<std::string> nomultiwords;
+    return m ? m->multiwords : nomultiwords;
+}
+
+// Word count of the longest multiword synonym, 0 if none (or m is null).
+size_t SynGroups::getmultiwordsmaxlength() const
+{
+    return m ? m->multiwords_maxlen : 0;
+}
+
+// Path recorded by setfile() for the loaded file, empty if none (or m
+// is null).
+const std::string& SynGroups::getpath() const
+{
+    static const std::string nopath;
+    return m ? m->path : nopath;
+}
diff --git a/src/common/syngroups.h b/src/common/syngroups.h
index b5631204..520929fc 100644
--- a/src/common/syngroups.h
+++ b/src/common/syngroups.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2015 J.F.Dockes
+/* Copyright (C) 2015-2021 J.F.Dockes
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation; either version 2 of the License, or
@@ -20,6 +20,7 @@
 
 #include <string>
 #include <vector>
+#include <set>
 
 // Manage synonym groups. This is very different from stemming and
 // case/diac expansion because there is no reference form: all terms
@@ -34,8 +35,11 @@ public:
     SynGroups& operator=(const SynGroups&&) = delete;
 
     bool setfile(const std::string& fname);
-    std::vector<std::string> getgroup(const std::string& term);
-    bool ok();
+    std::vector<std::string> getgroup(const std::string& term) const;
+    const std::set<std::string>& getmultiwords() const;
+    size_t getmultiwordsmaxlength() const;
+    const std::string& getpath() const;
+    bool ok() const;
 private:
     class Internal;
     Internal *m;
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 3b43ef7a..b242b801 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
 bool Db::o_inPlaceReset;
 
 Db::Db(const RclConfig *cfp)
-    : m_ndb(0),  m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
-      m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
-      m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4), 
-      m_flushMb(-1), m_maxFsOccupPc(0)
 {
     m_config = new RclConfig(*cfp);
+    m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
+    m_config->getConfParam("idxflushmb", &m_flushMb);
+    m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
+    m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
     if (start_of_field_term.empty()) {
         if (o_index_stripchars) {
             start_of_field_term = "XXST";
@@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
             end_of_field_term = "XXND/";
         }
     }
-
     m_ndb = new Native(this);
-    if (m_config) {
-        m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
-        m_config->getConfParam("idxflushmb", &m_flushMb);
-        m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
-        m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
-    }
 }
 
 Db::~Db()
 {
     LOGDEB2("Db::~Db\n");
-    if (m_ndb == 0)
+    if (nullptr == m_ndb)
         return;
     LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
            m_ndb->m_iswritable << "\n");
@@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
     return res;
 }
 
-
 bool Db::open(OpenMode mode, OpenError *error)
 {
     if (error)
@@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
     if (!m_config->getStopfile().empty())
         m_stops.setFile(m_config->getStopfile());
 
+    if (isWriteMode(mode)) {
+        // Check for an index-time synonyms file. We use this to
+        // generate multiword terms for multiword synonyms
+        string synfile = m_config->getIdxSynGroupsFile();
+        if (path_exists(synfile)) {
+            setSynGroupsFile(synfile);
+        }
+    }
+    
     string dir = m_config->getDbDir();
     string ermsg;
     try {
-        switch (mode) {
-        case DbUpd:
-        case DbTrunc: 
+        if (isWriteMode(mode)) {
             m_ndb->openWrite(dir, mode);
             updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
             // We used to open a readonly object in addition to the
@@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
             // so the query db is now a clone of the update one.
             m_ndb->xrdb = m_ndb->xwdb;
             LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
-            break;
-        case DbRO:
-        default:
+        } else {
             m_ndb->openRead(dir);
             for (auto& db : m_extraDbs) {
                 if (error)
@@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
                 // but I can't see why
                 m_ndb->xrdb.add_database(Xapian::Database(db));
             }
-            break;
         }
         if (error)
             *error = DbOpenMainDb;
@@ -1531,10 +1527,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
     TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
     //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
 
+    TermProcMulti tpmulti(nxt, m_syngroups);
+    if (m_syngroups.getmultiwordsmaxlength() > 1) {
+        nxt = &tpmulti;
+    }
+
     TermProcPrep tpprep(nxt);
     if (o_index_stripchars)
         nxt = &tpprep;
-
+    
     TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
     tpidx.setTSD(&splitter);
 
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index ba11d5fd..f695d6a1 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -114,14 +114,13 @@ public:
 
 class DbStats {
 public:
-    DbStats()
-        :dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
+    DbStats() {}
     // Index-wide stats
-    unsigned int dbdoccount;
-    double       dbavgdoclen;
-    size_t       mindoclen;
-    size_t       maxdoclen;
-    vector<string> failedurls; /* Only set if requested */
+    unsigned int dbdoccount{0};
+    double       dbavgdoclen{0};
+    size_t       mindoclen{0};
+    size_t       maxdoclen{0};
+    std::vector<std::string> failedurls; /* Only set if requested */
 };
 
 inline bool has_prefix(const string& trm)
@@ -175,6 +174,9 @@ public:
     ~Db();
 
     enum OpenMode {DbRO, DbUpd, DbTrunc};
+    static bool isWriteMode(OpenMode mode) { // True for the modes opened through openWrite()
+        return mode == DbUpd || mode == DbTrunc;
+    }
     enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
     bool open(OpenMode mode, OpenError *error = 0);
     bool close();
@@ -342,7 +344,7 @@ public:
     bool setExtraQueryDbs(const std::vector<std::string>& dbs);
 
     /** Check if document comes from the main index (this is used to
-       decide if we can update the index for it */
+        decide if we can update the index for it */
     bool fromMainIndex(const Doc& doc);
 
     /** Retrieve the stored doc text. This returns false if the index does not
@@ -499,7 +501,8 @@ public:
 
     // Use empty fn for no synonyms
     bool setSynGroupsFile(const std::string& fn);
-
+    const SynGroups& getSynGroups() {return m_syngroups;}
+    
     // Mark all documents with an UDI having input as prefix as
     // existing.  Only works if the UDIs for the store are
     // hierarchical of course.  Used by FsIndexer to avoid purging
@@ -508,25 +511,26 @@ public:
     bool udiTreeMarkExisting(const string& udi);
 
     /* This has to be public for access by embedded Query::Native */
-    Native *m_ndb; 
+    Native *m_ndb{nullptr};
+    
 private:
     const RclConfig *m_config;
     string     m_reason; // Error explanation
 
     // Xapian directories for additional databases to query
     vector<string> m_extraDbs;
-    OpenMode m_mode;
+    OpenMode m_mode{Db::DbRO};
     // File existence vector: this is filled during the indexing pass. Any
     // document whose bit is not set at the end is purged
     vector<bool> updated;
     // Text bytes indexed since beginning
-    long long    m_curtxtsz;
+    long long    m_curtxtsz{0};
     // Text bytes at last flush
-    long long    m_flushtxtsz;
+    long long    m_flushtxtsz{0};
     // Text bytes at last fsoccup check
-    long long    m_occtxtsz;
+    long long    m_occtxtsz{0};
     // First fs occup check ?
-    int         m_occFirstCheck;
+    int         m_occFirstCheck{1};
 
     // Synonym groups. There is no strict reason that this has to be
     // an Rcl::Db member, as it is only used when building each It
@@ -538,32 +542,31 @@ private:
     SynGroups m_syngroups;
 
     // Aspell object if needed
-    Aspell *m_aspell = nullptr;
-    
+    Aspell *m_aspell{nullptr};
+
     /***************
      * Parameters cached out of the configuration files. Logically const 
      * after init */
     // Stop terms: those don't get indexed.
     StopList m_stops;
-
     // Truncation length for stored meta fields
-    int         m_idxMetaStoredLen;
+    int         m_idxMetaStoredLen{150};
     // This is how long an abstract we keep or build from beginning of
     // text when indexing. It only has an influence on the size of the
     // db as we are free to shorten it again when displaying
-    int          m_idxAbsTruncLen;
+    int          m_idxAbsTruncLen{250};
     // Document text truncation length
     int          m_idxTextTruncateLen{0};
     // This is the size of the abstract that we synthetize out of query
     // term contexts at *query time*
-    int          m_synthAbsLen;
+    int          m_synthAbsLen{250};
     // This is how many words (context size) we keep around query terms
     // when building the abstract
-    int          m_synthAbsWordCtxLen;
+    int          m_synthAbsWordCtxLen{4};
     // Flush threshold. Megabytes of text indexed before we flush.
-    int          m_flushMb;
+    int          m_flushMb{-1};
     // Maximum file system occupation percentage
-    int          m_maxFsOccupPc;
+    int          m_maxFsOccupPc{0};
     // Database directory
     string       m_basedir;
     // When this is set, all documents are considered as needing a reindex.
diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp
index 5faa919e..35dacba6 100644
--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
                         LOGDEB("Db::TermMatch: syngroups out: " <<
                                term << " -> " << stringsToString(sg) << "\n");
                         for (const auto& synonym : sg) {
-                            if (synonym.find_first_of(" ") != string::npos) {
+                            if (synonym.find(' ') != string::npos) {
                                 if (multiwords) {
                                     multiwords->push_back(synonym);
                                 }
diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp
index be281cbf..f61957e9 100644
--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
     }
 
     // Push phrases for the multi-word expansions
-    for (vector<string>::const_iterator mwp = multiwords.begin();
-         mwp != multiwords.end(); mwp++) {
+    for (const auto& mw : multiwords) {
         vector<string> phr;
         // We just do a basic split to keep things a bit simpler here
         // (no textsplit). This means though that no punctuation is
         // allowed in multi-word synonyms.
-        stringToTokens(*mwp, phr);
+        stringToTokens(mw, phr);
         if (!prefix.empty())
             prefix_vector(phr, prefix);
         xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
@@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
 // NEAR xapian query, the elements of which can themselves be OR
 // queries if the terms get expanded by stemming or wildcards (we
 // don't do stemming for PHRASE though)
-void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
-                                                 TermProcQ *splitData, 
-                                                 int mods, void *pq,
-                                                 bool useNear, int slack)
+void SearchDataClauseSimple::processPhraseOrNear(
+    Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
+    bool useNear, int slack)
 {
     vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
     Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
         Xapian::Query::OP_PHRASE;
     vector<Xapian::Query> orqueries;
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-    bool hadmultiple = false;
-#endif
     vector<vector<string> >groups;
 
+    bool useidxsynonyms =
+        db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
+    
     string prefix;
     const FieldTraits *ftp;
     if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
@@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
     }
 
     // Go through the list and perform stem/wildcard expansion for each element
-    vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
-    for (vector<string>::const_iterator it = splitData->terms().begin();
+    auto nxit = splitData->nostemexps().begin();
+    for (auto it = splitData->terms().begin();
          it != splitData->terms().end(); it++, nxit++) {
         LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
         // Adjust when we do stem expansion. Not if disabled by
         // caller, not inside phrases, and some versions of xapian
         // will accept only one OR clause inside NEAR.
-        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-            || hadmultiple
-#endif // single OR inside NEAR
-            ;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
         int lmods = mods;
         if (nostemexp)
             lmods |= SearchDataClause::SDCM_NOSTEMMING;
         string sterm;
         vector<string> exp;
-        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
+        vector<string> multiwords;
+        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
             return;
+
+        // Note: because of how expandTerm works, the multiwords can
+        // only come from the synonyms expansion, which means that, if
+        // idxsynonyms is set, they have each been indexed as a single
+        // term. So, if idxsynonyms is set, and is the current active
+        // synonyms file, we just add them to the expansion.
+        if (!multiwords.empty() && useidxsynonyms) {
+            exp.insert(exp.end(), multiwords.begin(), multiwords.end());
+        }
+
         LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
                 stringsToString(exp) << "\n");
         // groups is used for highlighting, we don't want prefixes in there.
         vector<string> noprefs;
-        for (vector<string>::const_iterator it = exp.begin(); 
-             it != exp.end(); it++) {
-            noprefs.push_back(it->substr(prefix.size()));
+        for (const auto& prefterm : exp) {
+            noprefs.push_back(prefterm.substr(prefix.size()));
         }
         groups.push_back(noprefs);
         orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
@@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
         m_curcl += exp.size();
         if (m_curcl >= getMaxCl())
             return;
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-        if (exp.size() > 1) 
-            hadmultiple = true;
-#endif
     }
 
     if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
diff --git a/src/rcldb/termproc.h b/src/rcldb/termproc.h
index e0f5950d..c276ff1b 100644
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@@ -19,12 +19,15 @@
 
 #include <vector>
 #include <string>
+#include <set>
+#include <list>
 
 #include "textsplit.h"
 #include "stoplist.h"
 #include "smallut.h"
 #include "utf8iter.h"
 #include "unacpp.h"
+#include "syngroups.h"
 
 namespace Rcl {
 
@@ -52,11 +55,13 @@ class TermProc {
 public:
     TermProc(TermProc* next) : m_next(next) {}
     virtual ~TermProc() {}
+    /* Copy construction and copy assignment are forbidden */
+    TermProc(const TermProc &) = delete;
+    TermProc& operator=(const TermProc &) = delete;
     virtual bool takeword(const string &term, int pos, int bs, int be) {
         if (m_next)
             return m_next->takeword(term, pos, bs, be);
-        else
-            return true;
+        return true;
     }
     // newpage() is like takeword(), but for page breaks.
     virtual void newpage(int pos) {
@@ -66,16 +71,10 @@ public:
     virtual bool flush() {
         if (m_next)
             return m_next->flush();
-        else
-            return true;
+        return true;
     }
 private:
     TermProc *m_next;
-    /* Copyconst and assignment private and forbidden */
-    TermProc(const TermProc &) {}
-    TermProc& operator=(const TermProc &) {
-        return *this;
-    };
 };
 
 /**
@@ -100,8 +99,7 @@ public:
     virtual bool takeword(const string& term, int pos, int bs, int be) {
         if (m_prc)
             return m_prc->takeword(term, pos, bs, be);
-        else
-            return true;
+        return true;
     }
 
     virtual void newpage(int pos) {
@@ -119,12 +117,9 @@ private:
 class TermProcPrep : public TermProc {
 public:
     TermProcPrep(TermProc *nxt)
-        : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
-    {
-    }
+        : TermProc(nxt) {}
 
-    virtual bool takeword(const string& itrm, int pos, int bs, int be)
-    {
+    virtual bool takeword(const string& itrm, int pos, int bs, int be) {
         m_totalterms++;
         string otrm;
 
@@ -179,49 +174,37 @@ public:
         // change in here. This means that phrase searches and
         // snippets will be wrong, but at least searching for the
         // terms will work.
-        bool hasspace = false;
-        for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
-            if (*it == ' ') {
-                hasspace=true;
-                break;
-            }
-        }
+        bool hasspace = otrm.find(' ') != std::string::npos;
         if (hasspace) {
             std::vector<std::string> terms;
             stringToTokens(otrm, terms, " ", true);
-            for (std::vector<std::string>::const_iterator it = terms.begin(); 
-                 it < terms.end(); it++) {
-                if (!TermProc::takeword(*it, pos, bs, be)) {
+            for (const auto& term : terms) {
+                if (!TermProc::takeword(term, pos, bs, be)) {
                     return false;
                 }
             }
             return true;
-        } else {
-            return TermProc::takeword(otrm, pos, bs, be);
         }
+        return TermProc::takeword(otrm, pos, bs, be);
     }
 
-    virtual bool flush()
-    {
+    virtual bool flush() {
         m_totalterms = m_unacerrors = 0;
         return TermProc::flush();
     }
 
 private:
-    int m_totalterms;
-    int m_unacerrors;
+    int m_totalterms{0};
+    int m_unacerrors{0};
 };
 
 /** Compare to stop words list and discard if match found */
 class TermProcStop : public TermProc {
 public:
     TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
-        : TermProc(nxt), m_stops(stops)
-    {
-    }
+        : TermProc(nxt), m_stops(stops) {}
 
-    virtual bool takeword(const string& term, int pos, int bs, int be)
-    {
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
         if (m_stops.isStop(term)) {
             return true;
         }
@@ -232,6 +215,53 @@ private:
     const Rcl::StopList& m_stops;
 };
 
+/** Generate multiword terms for multiword synonyms. This allows
+ * NEAR/PHRASE searches for multiword synonyms.
+ * Keeps the last m_maxl words; for each new word, every sequence
+ * ending at it is looked up in the multiword set and, if found, sent
+ * on at its first word position (positions assumed consecutive). */
+class TermProcMulti : public TermProc {
+public:
+    // The set returned by sg.getmultiwords() is held by reference and
+    // must outlive this object.
+    TermProcMulti(TermProc *nxt, const SynGroups& sg)
+        : TermProc(nxt), m_groups(sg.getmultiwords()),
+          m_maxl(sg.getmultiwordsmaxlength()) {}
+
+    // Returns false as soon as a downstream takeword() fails.
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
+        if (m_maxl < 2) {
+            // Should not have been pushed??
+            return TermProc::takeword(term, pos, bs, be);
+        }
+        m_terms.push_back(term);
+        if (m_terms.size() > m_maxl) {
+            m_terms.pop_front();
+        }
+        // Grow the candidate backwards from the current word, so that
+        // each multiword is sent once, when its last word arrives.
+        string comp;
+        int gsz{0};
+        for (auto it = m_terms.rbegin(); it != m_terms.rend(); ++it) {
+            comp = (gsz == 0) ? *it : *it + " " + comp;
+            if (++gsz < 2 || m_groups.find(comp) == m_groups.end())
+                continue;
+            LOGDEB1("Found multiword synonym: [" << comp << "]\n");
+            // TBD exact bs (needs a parallel list); estimated from be for now.
+            int mbs = be - static_cast<int>(comp.size());
+            if (mbs < 0) mbs = 0;
+            if (!TermProc::takeword(comp, pos - gsz + 1, mbs, be))
+                return false;
+        }
+        return TermProc::takeword(term, pos, bs, be);
+    }
+
+private:
+    const std::set<std::string>& m_groups;
+    size_t m_maxl{0};
+    std::list<std::string> m_terms;
+};
+
 /** Handle common-gram generation: combine frequent terms with neighbours to
  *  shorten the positions lists for phrase searches.
  *  NOTE: This does not currently work because of bad interaction with the
@@ -241,13 +271,11 @@ private:
 class TermProcCommongrams : public TermProc {
 public:
     TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
-        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
-    {
-    }
+        : TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
 
-    virtual bool takeword(const string& term, int pos, int bs, int be)
-    {
-        LOGDEB1("TermProcCom::takeword: pos "  << (pos) << " "  << (bs) << " "  << (be) << " ["  << (term) << "]\n" );
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
+        LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
+                be << " [" << term << "]\n");
         bool isstop = m_stops.isStop(term);
         bool twogramemit = false;
 
@@ -287,8 +315,7 @@ public:
         return true;
     }
 
-    virtual bool flush()
-    {
+    virtual bool flush() {
         if (!m_prevsent && !m_prevterm.empty())
             if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
                 return false;
@@ -297,8 +324,7 @@ public:
         m_prevsent = true;
         return TermProc::flush();
     }
-    void onlygrams(bool on)
-    {
+    void onlygrams(bool on) {
         m_onlygrams = on;
     }
 private:
diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf
index 8d161247..99f1335b 100644
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
 # space issues.</descr></var> 
 #idxtexttruncatelen = 0
 
+# <var name="idxsynonyms" type="fn">
+#
+# <brief>Name of the index-time synonyms file.</brief>
+# <descr>This is used for indexing multiword synonyms as single terms,
+# which in turn is only useful if you want to perform proximity searches
+# with such terms.</descr></var>
+#idxsynonyms = thereisnodefaultidxsynonyms
+
 # <var name="aspellLanguage" type="string">
 #
 # <brief>Language definitions to use when creating the aspell