implemented multi-word terms indexing for phrase/prox search on multiword synonyms

2021-01-15 12:04:06 +01:00 · 2021-01-15 12:04:06 +01:00 · aa2f0bfd73
commit aa2f0bfd73
parent cb13b8b6df
10 changed files with 296 additions and 217 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
    return getConfdirPath("stoplistfile", "stoplist.txt");
 }

-string RclConfig::getSynGroupsFile() const
+string RclConfig::getIdxSynGroupsFile() const
 {
-    return getConfdirPath("syngroupsfile", "syngroups.txt");
+    return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
 }

 // The index status file is fast changing, so it's possible to put it outside
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -72,8 +72,8 @@ private:

 // Hold the description for an external metadata-gathering command
 struct MDReaper {
-  string fieldname;
-  vector<string> cmdv;
+    string fieldname;
+    vector<string> cmdv;
 };

 // Data associated to a indexed field name: 
@ -90,7 +90,7 @@ struct FieldTraits {
 };

 class RclConfig {
- public:
+public:

    // Constructor: we normally look for a configuration file, except
    // if this was specified on the command line and passed through
@ -100,7 +100,7 @@ class RclConfig {
    RclConfig(const RclConfig &r);

    ~RclConfig() {
-    freeAll();
+        freeAll();
    }

    // Return a writable clone of the main config. This belongs to the
@ -133,18 +133,16 @@ class RclConfig {
    string getKeyDir() const {return m_keydir;}

    /** Get generic configuration parameter according to current keydir */
-    bool getConfParam(const string &name, string &value, 
-                      bool shallow=false) const
-    {
-    if (m_conf == 0)
-        return false;
-    return m_conf->get(name, value, m_keydir, shallow);
+    bool getConfParam(const string& name, string& value, 
+                      bool shallow=false) const {
+            if (m_conf == 0)
+                return false;
+            return m_conf->get(name, value, m_keydir, shallow);
    }
    /** Variant with autoconversion to int */
    bool getConfParam(const string &name, int *value, bool shallow=false) const;
    /** Variant with autoconversion to bool */
-    bool getConfParam(const string &name, bool *value, 
-                      bool shallow=false) const;
+    bool getConfParam(const string &name, bool *value, bool shallow=false) const;
    /** Variant with conversion to vector<string>
     *  (stringToStrings). Can fail if the string is malformed. */
    bool getConfParam(const string &name, vector<string> *value, 
@ -164,18 +162,15 @@ class RclConfig {
     * Get list of config names under current sk, with possible 
     * wildcard filtering 
     */
-    vector<string> getConfNames(const char *pattern = 0) const
-    {
-    return m_conf->getNames(m_keydir, pattern);
+    vector<string> getConfNames(const char *pattern = 0) const {
+        return m_conf->getNames(m_keydir, pattern);
    }

    /** Check if name exists anywhere in config */
-    bool hasNameAnywhere(const string& nm) const
-    {
+    bool hasNameAnywhere(const string& nm) const {
        return m_conf? m_conf->hasNameAnywhere(nm) : false;
    }

-
    /** Get default charset for current keydir (was set during setKeydir) 
     * filenames are handled differently */
    const string &getDefCharset(bool filename = false) const;
@ -198,7 +193,7 @@ class RclConfig {
    /** Get stoplist file name */
    string getStopfile() const;
    /** Get synonym groups file name */
-    string getSynGroupsFile() const;
+    string getIdxSynGroupsFile() const;
    /** Get indexing pid file name */
    string getPidfile() const;
    /** Get indexing status file name */
@ -207,7 +202,7 @@ class RclConfig {
    /** Do path translation according to the ptrans table */
    void urlrewrite(const string& dbdir, string& url) const;
    ConfSimple *getPTrans() {
-    return m_ptrans;
+        return m_ptrans;
    }
    /** Get Web Queue directory name */
    string getWebQueueDir() const;
@ -215,13 +210,13 @@ class RclConfig {
    /** Get list of skipped file names for current keydir */
    vector<string>& getSkippedNames();
    /** Get list of file name filters for current keydir (only those
-       names indexed) */
+        names indexed) */
    vector<string>& getOnlyNames();

    /** Get list of skipped paths patterns. Doesn't depend on the keydir */
    vector<string> getSkippedPaths() const;
    /** Get list of skipped paths patterns, daemon version (may add some)
-    Doesn't depend on the keydir */
+        Doesn't depend on the keydir */
    vector<string> getDaemSkippedPaths() const;

    /** Return list of no content suffixes. Used by confgui, indexing uses
@ -260,7 +255,7 @@ class RclConfig {
     * @param whole the raw value. No way to escape a semi-colon in there.
     */
    static bool valueSplitAttributes(const string& whole, string& value, 
-                     ConfSimple& attrs) ;
+                                     ConfSimple& attrs) ;

    /** Compute difference between 'base' and 'changed', as elements to be
     * added and substracted from base. Input and output strings are in
@ -288,9 +283,9 @@ class RclConfig {
    bool getGuiFilter(const string& filtername, string& frag) const;

    /** fields: get field prefix from field name. Use additional query
-       aliases if isquery is set */
+        aliases if isquery is set */
    bool getFieldTraits(const string& fldname, const FieldTraits **,
-        bool isquery = false) const;
+                        bool isquery = false) const;

    const set<string>& getStoredFields() const {return m_storedFields;}

@ -311,11 +306,11 @@ class RclConfig {
     */
    vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
    bool getFieldConfParam(const string &name, const string &sk, string &value)
-    const;
+        const;

    /** mimeview: get/set external viewer exec string(s) for mimetype(s) */
    string getMimeViewerDef(const string &mimetype, const string& apptag, 
-                bool useall) const;
+                            bool useall) const;
    set<string> getMimeViewerAllEx() const;
    bool setMimeViewerAllEx(const set<string>& allex);
    bool getMimeViewerDefs(vector<pair<string, string> >&) const;
@ -358,26 +353,25 @@ class RclConfig {
    string findFilter(const string& cmd) const;

    /** Thread config init is not done automatically because not all
-    programs need it and it uses the debug log so that it's better to
-    call it after primary init */
+        programs need it and it uses the debug log so that it's better to
+        call it after primary init */
    void initThrConf();

-    const string& getOrigCwd() 
-    {
-    return o_origcwd;
+    const string& getOrigCwd() {
+        return o_origcwd;
    }

    RclConfig& operator=(const RclConfig &r) {
-    if (this != &r) {
-        freeAll();
-        initFrom(r);
-    }
-    return *this;
+        if (this != &r) {
+            freeAll();
+            initFrom(r);
+        }
+        return *this;
    }

    friend class ParamStale;

- private:
+private:
    int m_ok;
    string m_reason;    // Explanation for bad state
    string m_confdir;   // User directory where the customized files are stored
--- a/src/common/syngroups.cpp
+++ b/src/common/syngroups.cpp
@ -44,8 +44,7 @@ using namespace std;
 // groups anyway
 class SynGroups::Internal {
 public:
-    Internal() : ok(false) {
-    }
+    Internal() {}
    void setpath(const string& fn) {
        path = path_canon(fn);
        stat(path.c_str(), &st);
@ -61,16 +60,22 @@ public:
        }
        return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
    }
-    bool ok;
+    bool ok{false};
    // Term to group num 
    std::unordered_map<string, unsigned int> terms;
    // Group num to group
    vector<vector<string> > groups;
+
+    // Aux: set of multiword synonyms used for generating multiword
+    // terms while indexing
+    std::set<std::string> multiwords;
+    size_t multiwords_maxlen{0};
+    
    std::string path;
    struct stat st;
 };

-bool SynGroups::ok() 
+bool SynGroups::ok() const
 {
    return m && m->ok;
 }
@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn)
    if (fn.empty()) {
        delete m;
        m = 0;
-    return true;
+        return true;
    }

    if (m->samefile(fn)) {
@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn)
    ifstream input;
    input.open(fn.c_str(), ios::in);
    if (!input.is_open()) {
-    LOGSYSERR("SynGroups:setfile", "open", fn);
-    return false;
+        LOGSYSERR("SynGroups:setfile", "open", fn);
+        return false;
    }        

    string cline;
@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn)
    string line;
    bool eof = false;
    int lnum = 0;
-
+    m->groups.clear();
+    m->terms.clear();
+    m->multiwords.clear();
+    m->multiwords_maxlen = 0;
    for (;;) {
        cline.clear();
-    getline(input, cline);
-    if (!input.good()) {
-        if (input.bad()) {
+        getline(input, cline);
+        if (!input.good()) {
+            if (input.bad()) {
                LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
-        return false;
-        }
-        // Must be eof ? But maybe we have a partial line which
-        // must be processed. This happens if the last line before
-        // eof ends with a backslash, or there is no final \n
+                return false;
+            }
+            // Must be eof ? But maybe we have a partial line which
+            // must be processed. This happens if the last line before
+            // eof ends with a backslash, or there is no final \n
            eof = true;
-    }
-    lnum++;
+        }
+        lnum++;

        {
            string::size_type pos = cline.find_last_not_of("\n\r");
@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn)
            }
        }

-    if (appending)
-        line += cline;
-    else
-        line = cline;
+        if (appending)
+            line += cline;
+        else
+            line = cline;

-    // Note that we trim whitespace before checking for backslash-eol
-    // This avoids invisible whitespace problems.
-    trimstring(line);
-    if (line.empty() || line.at(0) == '#') {
+        // Note that we trim whitespace before checking for backslash-eol
+        // This avoids invisible whitespace problems.
+        trimstring(line);
+        if (line.empty() || line.at(0) == '#') {
            if (eof)
                break;
-        continue;
-    }
-    if (line[line.length() - 1] == '\\') {
-        line.erase(line.length() - 1);
-        appending = true;
-        continue;
-    }
-    appending = false;
+            continue;
+        }
+        if (line[line.length() - 1] == '\\') {
+            line.erase(line.length() - 1);
+            appending = true;
+            continue;
+        }
+        appending = false;

-    vector<string> words;
-    if (!stringToStrings(line, words)) {
-        LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
+        vector<string> words;
+        if (!stringToStrings(line, words)) {
+            LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
                   ": " << line << "\n");
-        continue;
-    }
+            continue;
+        }

-    if (words.empty())
-        continue;
-    if (words.size() == 1) {
-        LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
+        if (words.empty())
+            continue;
+        if (words.size() == 1) {
+            LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
                   << lnum << " ??\n");
-        continue;
-    }
+            continue;
+        }

-    m->groups.push_back(words);
-    for (const auto& word : words) {
-        m->terms[word] = m->groups.size()-1;
-    }
-    LOGDEB1("SynGroups::setfile: group: [" <<
+        m->groups.push_back(words);
+        for (const auto& word : words) {
+            m->terms[word] = m->groups.size()-1;
+        }
+        LOGDEB1("SynGroups::setfile: group: [" <<
                stringsToString(m->groups.back()) << "]\n");
    }
-    LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
-           " distinct terms." << endl);
+
+    for (const auto& group : m->groups) {
+        for (const auto& term : group) {
+            std::vector<std::string> words;
+            stringToTokens(term, words);
+            if (words.size() > 1) {
+                std::string multiword;
+                for (const auto& word : words) {
+                    if (!multiword.empty()) {
+                        multiword += " ";
+                    }
+                    multiword += word;
+                }
+                m->multiwords.insert(multiword);
+                if (m->multiwords_maxlen < words.size()) {
+                    m->multiwords_maxlen = words.size();
+                }
+            }
+        }
+    }
+    LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
+           "Multiwords: " << stringsToString(m->multiwords) <<"\n");
    m->ok = true;
    m->setpath(fn);
    return true;
 }

-vector<string> SynGroups::getgroup(const string& term)
+vector<string> SynGroups::getgroup(const string& term) const
 {
    vector<string> ret;
    if (!ok())
-    return ret;
+        return ret;

    const auto it1 = m->terms.find(term);
    if (it1 == m->terms.end()) {
-    LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
-    return ret;
+        LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
+        return ret;
    }

    unsigned int idx = it1->second;
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
            << endl);
    return m->groups[idx];
 }
+
+const std::set<std::string>& SynGroups::getmultiwords() const
+{
+    return m->multiwords;
+}
+
+size_t SynGroups::getmultiwordsmaxlength() const
+{
+    return m->multiwords_maxlen;
+}
+
+const std::string& SynGroups::getpath() const
+{
+    return m->path;
+}
--- a/src/common/syngroups.h
+++ b/src/common/syngroups.h
@ -1,4 +1,4 @@
-/* Copyright (C) 2015 J.F.Dockes
+/* Copyright (C) 2015-2021 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -20,6 +20,7 @@

 #include <string>
 #include <vector>
+#include <set>

 // Manage synonym groups. This is very different from stemming and
 // case/diac expansion because there is no reference form: all terms
@ -34,8 +35,11 @@ public:
    SynGroups& operator=(const SynGroups&&) = delete;

    bool setfile(const std::string& fname);
-    std::vector<std::string> getgroup(const std::string& term);
-    bool ok();
+    std::vector<std::string> getgroup(const std::string& term) const;
+    const std::set<std::string>& getmultiwords() const;
+    size_t getmultiwordsmaxlength() const;
+    const std::string& getpath() const;
+    bool ok() const;
 private:
    class Internal;
    Internal *m;
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
 bool Db::o_inPlaceReset;

 Db::Db(const RclConfig *cfp)
-    : m_ndb(0),  m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
-      m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
-      m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4), 
-      m_flushMb(-1), m_maxFsOccupPc(0)
 {
    m_config = new RclConfig(*cfp);
+    m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
+    m_config->getConfParam("idxflushmb", &m_flushMb);
+    m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
+    m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
    if (start_of_field_term.empty()) {
        if (o_index_stripchars) {
            start_of_field_term = "XXST";
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
            end_of_field_term = "XXND/";
        }
    }
-
    m_ndb = new Native(this);
-    if (m_config) {
-        m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
-        m_config->getConfParam("idxflushmb", &m_flushMb);
-        m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
-        m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
-    }
 }

 Db::~Db()
 {
    LOGDEB2("Db::~Db\n");
-    if (m_ndb == 0)
+    if (nullptr == m_ndb)
        return;
    LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
           m_ndb->m_iswritable << "\n");
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
    return res;
 }

-
 bool Db::open(OpenMode mode, OpenError *error)
 {
    if (error)
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
    if (!m_config->getStopfile().empty())
        m_stops.setFile(m_config->getStopfile());

+    if (isWriteMode(mode)) {
+        // Check for an index-time synonyms file. We use this to
+        // generate multiword terms for multiword synonyms
+        string synfile = m_config->getIdxSynGroupsFile();
+        if (path_exists(synfile)) {
+            setSynGroupsFile(synfile);
+        }
+    }
+    
    string dir = m_config->getDbDir();
    string ermsg;
    try {
-        switch (mode) {
-        case DbUpd:
-        case DbTrunc: 
+        if (isWriteMode(mode)) {
            m_ndb->openWrite(dir, mode);
            updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
            // We used to open a readonly object in addition to the
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
            // so the query db is now a clone of the update one.
            m_ndb->xrdb = m_ndb->xwdb;
            LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
-            break;
-        case DbRO:
-        default:
+        } else {
            m_ndb->openRead(dir);
            for (auto& db : m_extraDbs) {
                if (error)
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
                // but I can't see why
                m_ndb->xrdb.add_database(Xapian::Database(db));
            }
-            break;
        }
        if (error)
            *error = DbOpenMainDb;
@ -1531,10 +1527,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
    TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
    //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;

+    TermProcMulti tpmulti(nxt, m_syngroups);
+    if (m_syngroups.getmultiwordsmaxlength() > 1) {
+        nxt = &tpmulti;
+    }
+
    TermProcPrep tpprep(nxt);
    if (o_index_stripchars)
        nxt = &tpprep;
-
+    
    TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
    tpidx.setTSD(&splitter);

--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -114,14 +114,13 @@ public:

 class DbStats {
 public:
-    DbStats()
-        :dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
+    DbStats() {}
    // Index-wide stats
-    unsigned int dbdoccount;
-    double       dbavgdoclen;
-    size_t       mindoclen;
-    size_t       maxdoclen;
-    vector<string> failedurls; /* Only set if requested */
+    unsigned int dbdoccount{0};
+    double       dbavgdoclen{0};
+    size_t       mindoclen{0};
+    size_t       maxdoclen{0};
+    std::vector<std::string> failedurls; /* Only set if requested */
 };

 inline bool has_prefix(const string& trm)
@ -175,6 +174,9 @@ public:
    ~Db();

    enum OpenMode {DbRO, DbUpd, DbTrunc};
+    bool isWriteMode(OpenMode mode) {
+        return mode == DbUpd || mode == DbTrunc;
+    }
    enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
    bool open(OpenMode mode, OpenError *error = 0);
    bool close();
@ -342,7 +344,7 @@ public:
    bool setExtraQueryDbs(const std::vector<std::string>& dbs);

    /** Check if document comes from the main index (this is used to
-       decide if we can update the index for it */
+        decide if we can update the index for it */
    bool fromMainIndex(const Doc& doc);

    /** Retrieve the stored doc text. This returns false if the index does not
@ -499,7 +501,8 @@ public:

    // Use empty fn for no synonyms
    bool setSynGroupsFile(const std::string& fn);
-
+    const SynGroups& getSynGroups() {return m_syngroups;}
+    
    // Mark all documents with an UDI having input as prefix as
    // existing.  Only works if the UDIs for the store are
    // hierarchical of course.  Used by FsIndexer to avoid purging
@ -508,25 +511,26 @@ public:
    bool udiTreeMarkExisting(const string& udi);

    /* This has to be public for access by embedded Query::Native */
-    Native *m_ndb; 
+    Native *m_ndb{nullptr};
+    
 private:
    const RclConfig *m_config;
    string     m_reason; // Error explanation

    // Xapian directories for additional databases to query
    vector<string> m_extraDbs;
-    OpenMode m_mode;
+    OpenMode m_mode{Db::DbRO};
    // File existence vector: this is filled during the indexing pass. Any
    // document whose bit is not set at the end is purged
    vector<bool> updated;
    // Text bytes indexed since beginning
-    long long    m_curtxtsz;
+    long long    m_curtxtsz{0};
    // Text bytes at last flush
-    long long    m_flushtxtsz;
+    long long    m_flushtxtsz{0};
    // Text bytes at last fsoccup check
-    long long    m_occtxtsz;
+    long long    m_occtxtsz{0};
    // First fs occup check ?
-    int         m_occFirstCheck;
+    int         m_occFirstCheck{1};

    // Synonym groups. There is no strict reason that this has to be
    // an Rcl::Db member, as it is only used when building each It
@ -538,32 +542,31 @@ private:
    SynGroups m_syngroups;

    // Aspell object if needed
-    Aspell *m_aspell = nullptr;
-    
+    Aspell *m_aspell{nullptr};
+
    /***************
     * Parameters cached out of the configuration files. Logically const 
     * after init */
    // Stop terms: those don't get indexed.
    StopList m_stops;
-
    // Truncation length for stored meta fields
-    int         m_idxMetaStoredLen;
+    int         m_idxMetaStoredLen{150};
    // This is how long an abstract we keep or build from beginning of
    // text when indexing. It only has an influence on the size of the
    // db as we are free to shorten it again when displaying
-    int          m_idxAbsTruncLen;
+    int          m_idxAbsTruncLen{250};
    // Document text truncation length
    int          m_idxTextTruncateLen{0};
    // This is the size of the abstract that we synthetize out of query
    // term contexts at *query time*
-    int          m_synthAbsLen;
+    int          m_synthAbsLen{250};
    // This is how many words (context size) we keep around query terms
    // when building the abstract
-    int          m_synthAbsWordCtxLen;
+    int          m_synthAbsWordCtxLen{4};
    // Flush threshold. Megabytes of text indexed before we flush.
-    int          m_flushMb;
+    int          m_flushMb{-1};
    // Maximum file system occupation percentage
-    int          m_maxFsOccupPc;
+    int          m_maxFsOccupPc{0};
    // Database directory
    string       m_basedir;
    // When this is set, all documents are considered as needing a reindex.
--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
                        LOGDEB("Db::TermMatch: syngroups out: " <<
                               term << " -> " << stringsToString(sg) << "\n");
                        for (const auto& synonym : sg) {
-                            if (synonym.find_first_of(" ") != string::npos) {
+                            if (synonym.find(' ') != string::npos) {
                                if (multiwords) {
                                    multiwords->push_back(synonym);
                                }
--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
    }

    // Push phrases for the multi-word expansions
-    for (vector<string>::const_iterator mwp = multiwords.begin();
-         mwp != multiwords.end(); mwp++) {
+    for (const auto& mw : multiwords) {
        vector<string> phr;
        // We just do a basic split to keep things a bit simpler here
        // (no textsplit). This means though that no punctuation is
        // allowed in multi-word synonyms.
-        stringToTokens(*mwp, phr);
+        stringToTokens(mw, phr);
        if (!prefix.empty())
            prefix_vector(phr, prefix);
        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
 // NEAR xapian query, the elements of which can themselves be OR
 // queries if the terms get expanded by stemming or wildcards (we
 // don't do stemming for PHRASE though)
-void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
-                                                 TermProcQ *splitData, 
-                                                 int mods, void *pq,
-                                                 bool useNear, int slack)
+void SearchDataClauseSimple::processPhraseOrNear(
+    Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
+    bool useNear, int slack)
 {
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
        Xapian::Query::OP_PHRASE;
    vector<Xapian::Query> orqueries;
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-    bool hadmultiple = false;
-#endif
    vector<vector<string> >groups;

+    bool useidxsynonyms =
+        db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
+    
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
    }

    // Go through the list and perform stem/wildcard expansion for each element
-    vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
-    for (vector<string>::const_iterator it = splitData->terms().begin();
+    auto nxit = splitData->nostemexps().begin();
+    for (auto it = splitData->terms().begin();
         it != splitData->terms().end(); it++, nxit++) {
        LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
        // Adjust when we do stem expansion. Not if disabled by
        // caller, not inside phrases, and some versions of xapian
        // will accept only one OR clause inside NEAR.
-        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-            || hadmultiple
-#endif // single OR inside NEAR
-            ;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
        int lmods = mods;
        if (nostemexp)
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        string sterm;
        vector<string> exp;
-        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
+        vector<string> multiwords;
+        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
            return;
+
+        // Note: because of how expandTerm works, the multiwords can
+        // only come from the synonyms expansion, which means that, if
+        // idxsynonyms is set, they have each been indexed as a single
+        // term. So, if idxsynonyms is set, and is the current active
+        // synonyms file, we just add them to the expansion.
+        if (!multiwords.empty() && useidxsynonyms) {
+            exp.insert(exp.end(), multiwords.begin(), multiwords.end());
+        }
+
        LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
                stringsToString(exp) << "\n");
        // groups is used for highlighting, we don't want prefixes in there.
        vector<string> noprefs;
-        for (vector<string>::const_iterator it = exp.begin(); 
-             it != exp.end(); it++) {
-            noprefs.push_back(it->substr(prefix.size()));
+        for (const auto& prefterm : exp) {
+            noprefs.push_back(prefterm.substr(prefix.size()));
        }
        groups.push_back(noprefs);
        orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
        m_curcl += exp.size();
        if (m_curcl >= getMaxCl())
            return;
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-        if (exp.size() > 1) 
-            hadmultiple = true;
-#endif
    }

    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@ -19,12 +19,15 @@

 #include <vector>
 #include <string>
+#include <set>
+#include <list>

 #include "textsplit.h"
 #include "stoplist.h"
 #include "smallut.h"
 #include "utf8iter.h"
 #include "unacpp.h"
+#include "syngroups.h"

 namespace Rcl {

@ -52,11 +55,13 @@ class TermProc {
 public:
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}
+    /* Copy construction and assignment are forbidden */
+    TermProc(const TermProc &) = delete;
+    TermProc& operator=(const TermProc &) = delete;
    virtual bool takeword(const string &term, int pos, int bs, int be) {
        if (m_next)
            return m_next->takeword(term, pos, bs, be);
-        else
-            return true;
+        return true;
    }
    // newpage() is like takeword(), but for page breaks.
    virtual void newpage(int pos) {
@ -66,16 +71,10 @@ public:
    virtual bool flush() {
        if (m_next)
            return m_next->flush();
-        else
-            return true;
+        return true;
    }
 private:
    TermProc *m_next;
-    /* Copyconst and assignment private and forbidden */
-    TermProc(const TermProc &) {}
-    TermProc& operator=(const TermProc &) {
-        return *this;
-    };
 };

 /**
@ -100,8 +99,7 @@ public:
    virtual bool takeword(const string& term, int pos, int bs, int be) {
        if (m_prc)
            return m_prc->takeword(term, pos, bs, be);
-        else
-            return true;
+        return true;
    }

    virtual void newpage(int pos) {
@ -119,12 +117,9 @@ private:
 class TermProcPrep : public TermProc {
 public:
    TermProcPrep(TermProc *nxt)
-        : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
-    {
-    }
+        : TermProc(nxt) {}

-    virtual bool takeword(const string& itrm, int pos, int bs, int be)
-    {
+    virtual bool takeword(const string& itrm, int pos, int bs, int be) {
        m_totalterms++;
        string otrm;

@ -179,49 +174,37 @@ public:
        // change in here. This means that phrase searches and
        // snippets will be wrong, but at least searching for the
        // terms will work.
-        bool hasspace = false;
-        for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
-            if (*it == ' ') {
-                hasspace=true;
-                break;
-            }
-        }
+        bool hasspace = otrm.find(' ') != std::string::npos;
        if (hasspace) {
            std::vector<std::string> terms;
            stringToTokens(otrm, terms, " ", true);
-            for (std::vector<std::string>::const_iterator it = terms.begin(); 
-                 it < terms.end(); it++) {
-                if (!TermProc::takeword(*it, pos, bs, be)) {
+            for (const auto& term : terms) {
+                if (!TermProc::takeword(term, pos, bs, be)) {
                    return false;
                }
            }
            return true;
-        } else {
-            return TermProc::takeword(otrm, pos, bs, be);
        }
+        return TermProc::takeword(otrm, pos, bs, be);
    }

-    virtual bool flush()
-    {
+    virtual bool flush() {
        m_totalterms = m_unacerrors = 0;
        return TermProc::flush();
    }

 private:
-    int m_totalterms;
-    int m_unacerrors;
+    int m_totalterms{0};
+    int m_unacerrors{0};
 };

 /** Compare to stop words list and discard if match found */
 class TermProcStop : public TermProc {
 public:
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
-        : TermProc(nxt), m_stops(stops)
-    {
-    }
+        : TermProc(nxt), m_stops(stops) {}

-    virtual bool takeword(const string& term, int pos, int bs, int be)
-    {
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
        if (m_stops.isStop(term)) {
            return true;
        }
@ -232,6 +215,53 @@ private:
    const Rcl::StopList& m_stops;
 };

+/** Generate multiword terms for multiword synonyms. This allows
+ * NEAR/PHRASE searches for multiword synonyms.
+ * Each multiword found in the group set is passed on once, when its last
+ * word comes in, with the position and start offset of its first word. */
+class TermProcMulti : public TermProc {
+public:
+    TermProcMulti(TermProc *nxt, const SynGroups& sg)
+        : TermProc(nxt), m_groups(sg.getmultiwords()),
+          m_maxl(sg.getmultiwordsmaxlength()) {}
+
+    /** Pass on the multiwords ending with this word, then the word itself.
+     * @return false as soon as the next processor rejects a term. */
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
+        if (m_maxl < 2) {
+            // Maximum multiword length below 2: nothing to combine.
+            return TermProc::takeword(term, pos, bs, be);
+        }
+        // Sliding window of the last m_maxl words and where they were seen.
+        m_terms.push_back({term, pos, bs});
+        if (m_terms.size() > m_maxl) {
+            m_terms.pop_front();
+        }
+        // Grow the candidate backwards from the current word: testing only
+        // the sequences which end here finds each occurrence exactly once.
+        // (Could be optimized by skipping the sizes absent from m_groups.)
+        string comp(term);
+        auto it = m_terms.rbegin();
+        for (++it; it != m_terms.rend(); ++it) {
+            comp.insert(0, it->term + " ");
+            if (m_groups.find(comp) != m_groups.end()) {
+                LOGDEB1("Found multiword synonym: [" << comp << "]\n");
+                if (!TermProc::takeword(comp, it->pos, it->bs, be)) {
+                    return false;
+                }
+            }
+        }
+        return TermProc::takeword(term, pos, bs, be);
+    }
+
+private:
+    // A word of the window with the position and start offset it came with
+    struct Seen {std::string term; int pos; int bs;};
+    const std::set<std::string>& m_groups;
+    size_t m_maxl{0};
+    std::list<Seen> m_terms;
+};
+
 /** Handle common-gram generation: combine frequent terms with neighbours to
 *  shorten the positions lists for phrase searches.
 *  NOTE: This does not currently work because of bad interaction with the
@ -241,13 +271,11 @@ private:
 class TermProcCommongrams : public TermProc {
 public:
    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
-        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
-    {
-    }
+        : TermProc(nxt), m_stops(stops), m_onlygrams(false) {}

-    virtual bool takeword(const string& term, int pos, int bs, int be)
-    {
-        LOGDEB1("TermProcCom::takeword: pos "  << (pos) << " "  << (bs) << " "  << (be) << " ["  << (term) << "]\n" );
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
+        LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
+                be << " [" << term << "]\n");
        bool isstop = m_stops.isStop(term);
        bool twogramemit = false;

@ -287,8 +315,7 @@ public:
        return true;
    }

-    virtual bool flush()
-    {
+    virtual bool flush() {
        if (!m_prevsent && !m_prevterm.empty())
            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
                return false;
@ -297,8 +324,7 @@ public:
        m_prevsent = true;
        return TermProc::flush();
    }
-    void onlygrams(bool on)
-    {
+    void onlygrams(bool on) {
        m_onlygrams = on;
    }
 private:
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
 # space issues.</descr></var> 
 #idxtexttruncatelen = 0

+# <var name="idxsynonyms" type="fn">
+#
+# <brief>Name of the index-time synonyms file.</brief>
+# <descr>This is used for indexing multiword synonyms as single terms,
+# which in turn is only useful if you want to perform proximity searches
+# with such terms.</descr></var>
+#idxsynonyms = thereisnodefaultidxsynonyms
+
 # <var name="aspellLanguage" type="string">
 #
 # <brief>Language definitions to use when creating the aspell