Implemented multi-word term indexing for phrase/proximity search on multiword synonyms

2021-01-15 12:04:06 +01:00 · 2021-01-15 12:04:06 +01:00 · aa2f0bfd73
commit aa2f0bfd73
parent cb13b8b6df
10 changed files with 296 additions and 217 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
    return getConfdirPath("stoplistfile", "stoplist.txt");
 }
-string RclConfig::getSynGroupsFile() const
+string RclConfig::getIdxSynGroupsFile() const
 {
-    return getConfdirPath("syngroupsfile", "syngroups.txt");
+    return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
 }
 // The index status file is fast changing, so it's possible to put it outside
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -90,7 +90,7 @@ struct FieldTraits {
 };
 class RclConfig {
- public:
+public:
    // Constructor: we normally look for a configuration file, except
    // if this was specified on the command line and passed through
@ -133,9 +133,8 @@ class RclConfig {
    string getKeyDir() const {return m_keydir;}
    /** Get generic configuration parameter according to current keydir */
-    bool getConfParam(const string &name, string &value, 
+    bool getConfParam(const string& name, string& value, 
-                      bool shallow=false) const
+                      bool shallow=false) const {
    {
            if (m_conf == 0)
                return false;
            return m_conf->get(name, value, m_keydir, shallow);
@ -143,8 +142,7 @@ class RclConfig {
    /** Variant with autoconversion to int */
    bool getConfParam(const string &name, int *value, bool shallow=false) const;
    /** Variant with autoconversion to bool */
-    bool getConfParam(const string &name, bool *value, 
+    bool getConfParam(const string &name, bool *value, bool shallow=false) const;
                      bool shallow=false) const;
    /** Variant with conversion to vector<string>
     *  (stringToStrings). Can fail if the string is malformed. */
    bool getConfParam(const string &name, vector<string> *value, 
@ -164,18 +162,15 @@ class RclConfig {
     * Get list of config names under current sk, with possible 
     * wildcard filtering 
     */
-    vector<string> getConfNames(const char *pattern = 0) const
+    vector<string> getConfNames(const char *pattern = 0) const {
    {
        return m_conf->getNames(m_keydir, pattern);
    }
    /** Check if name exists anywhere in config */
-    bool hasNameAnywhere(const string& nm) const
+    bool hasNameAnywhere(const string& nm) const {
    {
        return m_conf? m_conf->hasNameAnywhere(nm) : false;
    }
    /** Get default charset for current keydir (was set during setKeydir) 
     * filenames are handled differently */
    const string &getDefCharset(bool filename = false) const;
@ -198,7 +193,7 @@ class RclConfig {
    /** Get stoplist file name */
    string getStopfile() const;
    /** Get index-time synonyms file name ("idxsynonyms" configuration parameter) */
-    string getSynGroupsFile() const;
+    string getIdxSynGroupsFile() const;
    /** Get indexing pid file name */
    string getPidfile() const;
    /** Get indexing status file name */
@ -362,8 +357,7 @@ class RclConfig {
        call it after primary init */
    void initThrConf();
-    const string& getOrigCwd() 
+    const string& getOrigCwd() {
    {
        return o_origcwd;
    }
@ -377,7 +371,7 @@ class RclConfig {
    friend class ParamStale;
- private:
+private:
    int m_ok;
    string m_reason;    // Explanation for bad state
    string m_confdir;   // User directory where the customized files are stored
--- a/src/common/syngroups.cpp
+++ b/src/common/syngroups.cpp
@ -44,8 +44,7 @@ using namespace std;
 // groups anyway
 class SynGroups::Internal {
 public:
-    Internal() : ok(false) {
+    Internal() {}
    }
    void setpath(const string& fn) {
        path = path_canon(fn);
        stat(path.c_str(), &st);
@ -61,16 +60,22 @@ public:
        }
        return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
    }
-    bool ok;
+    bool ok{false};
    // Term to group num 
    std::unordered_map<string, unsigned int> terms;
    // Group num to group
    vector<vector<string> > groups;
    // Aux: set of multiword synonyms used for generating multiword
    // terms while indexing
    std::set<std::string> multiwords;
    size_t multiwords_maxlen{0};
    std::string path;
    struct stat st;
 };
-bool SynGroups::ok() 
+bool SynGroups::ok() const
 {
    return m && m->ok;
 }
@ -120,7 +125,10 @@ bool SynGroups::setfile(const string& fn)
    string line;
    bool eof = false;
    int lnum = 0;
-
+    m->groups.clear();
    m->terms.clear();
    m->multiwords.clear();
    m->multiwords_maxlen = 0;
    for (;;) {
        cline.clear();
        getline(input, cline);
@ -187,14 +195,34 @@ bool SynGroups::setfile(const string& fn)
        LOGDEB1("SynGroups::setfile: group: [" <<
                stringsToString(m->groups.back()) << "]\n");
    }
-    LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
+
-           " distinct terms." << endl);
+    for (const auto& group : m->groups) {
        for (const auto& term : group) {
            std::vector<std::string> words;
            stringToTokens(term, words);
            if (words.size() > 1) {
                std::string multiword;
                for (const auto& word : words) {
                    if (!multiword.empty()) {
                        multiword += " ";
                    }
                    multiword += word;
                }
                m->multiwords.insert(multiword);
                if (m->multiwords_maxlen < words.size()) {
                    m->multiwords_maxlen = words.size();
                }
            }
        }
    }
    LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
           "Multiwords: " << stringsToString(m->multiwords) <<"\n");
    m->ok = true;
    m->setpath(fn);
    return true;
 }
-vector<string> SynGroups::getgroup(const string& term)
+vector<string> SynGroups::getgroup(const string& term) const
 {
    vector<string> ret;
    if (!ok())
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
            << endl);
    return m->groups[idx];
 }
 // Return the set of multiword synonyms computed by setfile(): the
 // group members which are made of more than one word, each stored
 // with its words joined by single spaces.
 // NOTE(review): unlike ok(), this dereferences m without testing it
 // for null - confirm that m is always allocated.
 const std::set<std::string>& SynGroups::getmultiwords() const
 {
    return m->multiwords;
 }
 // Return the word count of the longest multiword synonym found by
 // setfile(). The value is 0 when no multiword synonym was recorded.
 // NOTE(review): unlike ok(), this dereferences m without testing it
 // for null - confirm that m is always allocated.
 size_t SynGroups::getmultiwordsmaxlength() const
 {
    return m->multiwords_maxlen;
 }
 // Return the file path recorded by Internal::setpath(), which stores
 // the result of path_canon() on the name given to setfile().
 // NOTE(review): unlike ok(), this dereferences m without testing it
 // for null - confirm that m is always allocated.
 const std::string& SynGroups::getpath() const
 {
    return m->path;
 }
--- a/src/common/syngroups.h
+++ b/src/common/syngroups.h
@ -1,4 +1,4 @@
-/* Copyright (C) 2015 J.F.Dockes
+/* Copyright (C) 2015-2021 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -20,6 +20,7 @@
 #include <string>
 #include <vector>
 #include <set>
 // Manage synonym groups. This is very different from stemming and
 // case/diac expansion because there is no reference form: all terms
@ -34,8 +35,11 @@ public:
    SynGroups& operator=(const SynGroups&&) = delete;
    bool setfile(const std::string& fname);
-    std::vector<std::string> getgroup(const std::string& term);
+    std::vector<std::string> getgroup(const std::string& term) const;
-    bool ok();
+    const std::set<std::string>& getmultiwords() const;
    size_t getmultiwordsmaxlength() const;
    const std::string& getpath() const;
    bool ok() const;
 private:
    class Internal;
    Internal *m;
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
 bool Db::o_inPlaceReset;
 Db::Db(const RclConfig *cfp)
    : m_ndb(0),  m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
      m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
      m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4), 
      m_flushMb(-1), m_maxFsOccupPc(0)
 {
    m_config = new RclConfig(*cfp);
    m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
    m_config->getConfParam("idxflushmb", &m_flushMb);
    m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
    m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
    if (start_of_field_term.empty()) {
        if (o_index_stripchars) {
            start_of_field_term = "XXST";
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
            end_of_field_term = "XXND/";
        }
    }
    m_ndb = new Native(this);
    if (m_config) {
        m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
        m_config->getConfParam("idxflushmb", &m_flushMb);
        m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
        m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
    }
 }
 Db::~Db()
 {
    LOGDEB2("Db::~Db\n");
-    if (m_ndb == 0)
+    if (nullptr == m_ndb)
        return;
    LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
           m_ndb->m_iswritable << "\n");
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
    return res;
 }
 bool Db::open(OpenMode mode, OpenError *error)
 {
    if (error)
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
    if (!m_config->getStopfile().empty())
        m_stops.setFile(m_config->getStopfile());
    if (isWriteMode(mode)) {
        // Check for an index-time synonyms file. We use this to
        // generate multiword terms for multiword synonyms
        string synfile = m_config->getIdxSynGroupsFile();
        if (path_exists(synfile)) {
            setSynGroupsFile(synfile);
        }
    }
    string dir = m_config->getDbDir();
    string ermsg;
    try {
-        switch (mode) {
+        if (isWriteMode(mode)) {
        case DbUpd:
        case DbTrunc: 
            m_ndb->openWrite(dir, mode);
            updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
            // We used to open a readonly object in addition to the
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
            // so the query db is now a clone of the update one.
            m_ndb->xrdb = m_ndb->xwdb;
            LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
-            break;
+        } else {
        case DbRO:
        default:
            m_ndb->openRead(dir);
            for (auto& db : m_extraDbs) {
                if (error)
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
                // but I can't see why
                m_ndb->xrdb.add_database(Xapian::Database(db));
            }
            break;
        }
        if (error)
            *error = DbOpenMainDb;
@ -1531,6 +1527,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
    TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
    //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
    TermProcMulti tpmulti(nxt, m_syngroups);
    if (m_syngroups.getmultiwordsmaxlength() > 1) {
        nxt = &tpmulti;
    }
    TermProcPrep tpprep(nxt);
    if (o_index_stripchars)
        nxt = &tpprep;
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -114,14 +114,13 @@ public:
 class DbStats {
 public:
-    DbStats()
+    DbStats() {}
        :dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
    // Index-wide stats
-    unsigned int dbdoccount;
+    unsigned int dbdoccount{0};
-    double       dbavgdoclen;
+    double       dbavgdoclen{0};
-    size_t       mindoclen;
+    size_t       mindoclen{0};
-    size_t       maxdoclen;
+    size_t       maxdoclen{0};
-    vector<string> failedurls; /* Only set if requested */
+    std::vector<std::string> failedurls; /* Only set if requested */
 };
 inline bool has_prefix(const string& trm)
@ -175,6 +174,9 @@ public:
    ~Db();
    enum OpenMode {DbRO, DbUpd, DbTrunc};
    // Tell if the given open mode is one of the writing ones (DbUpd
    // or DbTrunc). Any other value, e.g. DbRO, yields false.
    bool isWriteMode(OpenMode mode) {
        return mode == DbUpd || mode == DbTrunc;
    }
    enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
    bool open(OpenMode mode, OpenError *error = 0);
    bool close();
@ -499,6 +501,7 @@ public:
    // Use empty fn for no synonyms
    bool setSynGroupsFile(const std::string& fn);
    const SynGroups& getSynGroups() {return m_syngroups;}
    // Mark all documents with an UDI having input as prefix as
    // existing.  Only works if the UDIs for the store are
@ -508,25 +511,26 @@ public:
    bool udiTreeMarkExisting(const string& udi);
    /* This has to be public for access by embedded Query::Native */
-    Native *m_ndb; 
+    Native *m_ndb{nullptr};
 private:
    const RclConfig *m_config;
    string     m_reason; // Error explanation
    // Xapian directories for additional databases to query
    vector<string> m_extraDbs;
-    OpenMode m_mode;
+    OpenMode m_mode{Db::DbRO};
    // File existence vector: this is filled during the indexing pass. Any
    // document whose bit is not set at the end is purged
    vector<bool> updated;
    // Text bytes indexed since beginning
-    long long    m_curtxtsz;
+    long long    m_curtxtsz{0};
    // Text bytes at last flush
-    long long    m_flushtxtsz;
+    long long    m_flushtxtsz{0};
    // Text bytes at last fsoccup check
-    long long    m_occtxtsz;
+    long long    m_occtxtsz{0};
    // First fs occup check ?
-    int         m_occFirstCheck;
+    int         m_occFirstCheck{1};
    // Synonym groups. There is no strict reason that this has to be
    // an Rcl::Db member, as it is only used when building each It
@ -538,32 +542,31 @@ private:
    SynGroups m_syngroups;
    // Aspell object if needed
-    Aspell *m_aspell = nullptr;
+    Aspell *m_aspell{nullptr};
    /***************
     * Parameters cached out of the configuration files. Logically const 
     * after init */
    // Stop terms: those don't get indexed.
    StopList m_stops;
    // Truncation length for stored meta fields
-    int         m_idxMetaStoredLen;
+    int         m_idxMetaStoredLen{150};
    // This is how long an abstract we keep or build from beginning of
    // text when indexing. It only has an influence on the size of the
    // db as we are free to shorten it again when displaying
-    int          m_idxAbsTruncLen;
+    int          m_idxAbsTruncLen{250};
    // Document text truncation length
    int          m_idxTextTruncateLen{0};
    // This is the size of the abstract that we synthesize out of query
    // term contexts at *query time*
-    int          m_synthAbsLen;
+    int          m_synthAbsLen{250};
    // This is how many words (context size) we keep around query terms
    // when building the abstract
-    int          m_synthAbsWordCtxLen;
+    int          m_synthAbsWordCtxLen{4};
    // Flush threshold. Megabytes of text indexed before we flush.
-    int          m_flushMb;
+    int          m_flushMb{-1};
    // Maximum file system occupation percentage
-    int          m_maxFsOccupPc;
+    int          m_maxFsOccupPc{0};
    // Database directory
    string       m_basedir;
    // When this is set, all documents are considered as needing a reindex.
--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
                        LOGDEB("Db::TermMatch: syngroups out: " <<
                               term << " -> " << stringsToString(sg) << "\n");
                        for (const auto& synonym : sg) {
-                            if (synonym.find_first_of(" ") != string::npos) {
+                            if (synonym.find(' ') != string::npos) {
                                if (multiwords) {
                                    multiwords->push_back(synonym);
                                }
--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
    }
    // Push phrases for the multi-word expansions
-    for (vector<string>::const_iterator mwp = multiwords.begin();
+    for (const auto& mw : multiwords) {
         mwp != multiwords.end(); mwp++) {
        vector<string> phr;
        // We just do a basic split to keep things a bit simpler here
        // (no textsplit). This means though that no punctuation is
        // allowed in multi-word synonyms.
-        stringToTokens(*mwp, phr);
+        stringToTokens(mw, phr);
        if (!prefix.empty())
            prefix_vector(phr, prefix);
        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
 // NEAR xapian query, the elements of which can themselves be OR
 // queries if the terms get expanded by stemming or wildcards (we
 // don't do stemming for PHRASE though)
-void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
+void SearchDataClauseSimple::processPhraseOrNear(
-                                                 TermProcQ *splitData, 
+    Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
                                                 int mods, void *pq,
    bool useNear, int slack)
 {
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
        Xapian::Query::OP_PHRASE;
    vector<Xapian::Query> orqueries;
 #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    bool hadmultiple = false;
 #endif
    vector<vector<string> >groups;
    bool useidxsynonyms =
        db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
    }
    // Go through the list and perform stem/wildcard expansion for each element
-    vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
+    auto nxit = splitData->nostemexps().begin();
-    for (vector<string>::const_iterator it = splitData->terms().begin();
+    for (auto it = splitData->terms().begin();
         it != splitData->terms().end(); it++, nxit++) {
        LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
        // Adjust when we do stem expansion. Not if disabled by
        // caller, not inside phrases, and some versions of xapian
        // will accept only one OR clause inside NEAR.
-        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
 #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
            || hadmultiple
 #endif // single OR inside NEAR
            ;
        int lmods = mods;
        if (nostemexp)
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        string sterm;
        vector<string> exp;
-        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
+        vector<string> multiwords;
        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
            return;
        // Note: because of how expandTerm works, the multiwords can
        // only come from the synonyms expansion, which means that, if
        // idxsynonyms is set, they have each been indexed as a single
        // term. So, if idxsynonyms is set, and is the current active
        // synonyms file, we just add them to the expansion.
        if (!multiwords.empty() && useidxsynonyms) {
            exp.insert(exp.end(), multiwords.begin(), multiwords.end());
        }
        LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
                stringsToString(exp) << "\n");
        // groups is used for highlighting, we don't want prefixes in there.
        vector<string> noprefs;
-        for (vector<string>::const_iterator it = exp.begin(); 
+        for (const auto& prefterm : exp) {
-             it != exp.end(); it++) {
+            noprefs.push_back(prefterm.substr(prefix.size()));
            noprefs.push_back(it->substr(prefix.size()));
        }
        groups.push_back(noprefs);
        orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
        m_curcl += exp.size();
        if (m_curcl >= getMaxCl())
            return;
 #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
        if (exp.size() > 1) 
            hadmultiple = true;
 #endif
    }
    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@ -19,12 +19,15 @@
 #include <vector>
 #include <string>
 #include <set>
 #include <list>
 #include "textsplit.h"
 #include "stoplist.h"
 #include "smallut.h"
 #include "utf8iter.h"
 #include "unacpp.h"
 #include "syngroups.h"
 namespace Rcl {
@ -52,10 +55,12 @@ class TermProc {
 public:
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}
    /* Copyconst and assignment forbidden */
    TermProc(const TermProc &) = delete;
    TermProc& operator=(const TermProc &) = delete;
    virtual bool takeword(const string &term, int pos, int bs, int be) {
        if (m_next)
            return m_next->takeword(term, pos, bs, be);
        else
        return true;
    }
    // newpage() is like takeword(), but for page breaks.
@ -66,16 +71,10 @@ public:
    virtual bool flush() {
        if (m_next)
            return m_next->flush();
        else
        return true;
    }
 private:
    TermProc *m_next;
    /* Copyconst and assignment private and forbidden */
    TermProc(const TermProc &) {}
    TermProc& operator=(const TermProc &) {
        return *this;
    };
 };
 /**
@ -100,7 +99,6 @@ public:
    virtual bool takeword(const string& term, int pos, int bs, int be) {
        if (m_prc)
            return m_prc->takeword(term, pos, bs, be);
        else
        return true;
    }
@ -119,12 +117,9 @@ private:
 class TermProcPrep : public TermProc {
 public:
    TermProcPrep(TermProc *nxt)
-        : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
+        : TermProc(nxt) {}
    {
    }
-    virtual bool takeword(const string& itrm, int pos, int bs, int be)
+    virtual bool takeword(const string& itrm, int pos, int bs, int be) {
    {
        m_totalterms++;
        string otrm;
@ -179,49 +174,37 @@ public:
        // change in here. This means that phrase searches and
        // snippets will be wrong, but at least searching for the
        // terms will work.
-        bool hasspace = false;
+        bool hasspace = otrm.find(' ') != std::string::npos;
        for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
            if (*it == ' ') {
                hasspace=true;
                break;
            }
        }
        if (hasspace) {
            std::vector<std::string> terms;
            stringToTokens(otrm, terms, " ", true);
-            for (std::vector<std::string>::const_iterator it = terms.begin(); 
+            for (const auto& term : terms) {
-                 it < terms.end(); it++) {
+                if (!TermProc::takeword(term, pos, bs, be)) {
                if (!TermProc::takeword(*it, pos, bs, be)) {
                    return false;
                }
            }
            return true;
-        } else {
+        }
        return TermProc::takeword(otrm, pos, bs, be);
    }
    }
-    virtual bool flush()
+    virtual bool flush() {
    {
        m_totalterms = m_unacerrors = 0;
        return TermProc::flush();
    }
 private:
-    int m_totalterms;
+    int m_totalterms{0};
-    int m_unacerrors;
+    int m_unacerrors{0};
 };
 /** Compare to stop words list and discard if match found */
 class TermProcStop : public TermProc {
 public:
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
-        : TermProc(nxt), m_stops(stops)
+        : TermProc(nxt), m_stops(stops) {}
    {
    }
-    virtual bool takeword(const string& term, int pos, int bs, int be)
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
    {
        if (m_stops.isStop(term)) {
            return true;
        }
@ -232,6 +215,53 @@ private:
    const Rcl::StopList& m_stops;
 };
 /** Generate multiword terms for multiword synonyms. This allows
 * NEAR/PHRASE searches for multiword synonyms.
 *
 * The processor keeps the last m_maxl terms it was given. On each new
 * term, it joins the kept terms with single spaces, starting from the
 * oldest one, and every resulting string of two words or more which
 * is found in the multiword set is sent down the chain as an
 * additional term. The input term itself is always forwarded
 * unchanged afterwards.
 *
 * NOTE(review): candidates always begin with the oldest kept term and
 * flush() is not overridden here, so, once the window is full, a
 * multiword shorter than the window and located at the very end of
 * the input may never be tested - to be confirmed. */
 class TermProcMulti : public TermProc {
 public:
    /** @param nxt next processor in the chain.
     *  @param sg synonym groups object supplying the multiword set and
     *     the maximum multiword length. The set is kept by reference,
     *     so sg needs to remain valid while this object is in use. */
    TermProcMulti(TermProc *nxt, const SynGroups& sg)
        : TermProc(nxt), m_groups(sg.getmultiwords()), 
          m_maxl(sg.getmultiwordsmaxlength()) {}
    /** Forward the term, preceded by any multiword synonym found in
     *  the window of recent terms.
     *  @return the status of forwarding the input term. */
    virtual bool takeword(const string& term, int pos, int bs, int be) {
        if (m_maxl < 2) {
            // Should not have been pushed??
            return TermProc::takeword(term, pos, bs, be);
        }
        // Update the window: append the new term and drop the oldest
        // one if we now hold more than m_maxl terms.
        m_terms.push_back(term);
        if (m_terms.size() > m_maxl) {
            m_terms.pop_front();
        }
        // comp accumulates the kept terms, oldest first, separated by
        // single spaces. gsz is the number of words currently in comp.
        string comp;
        int gsz{1};
        for (const auto& gterm : m_terms) {
            if (comp.empty()) {
                // Single word: nothing to look up yet.
                comp = gterm;
                continue;
            } else {
                comp += " ";
                comp += gterm;
                gsz++;
                // We could optimize by not testing m_groups for sizes
                // which do not exist.
                // if not gsz in sizes continue;
            }
            if (m_groups.find(comp) != m_groups.end()) {
                LOGDEB1("Found multiword synonym: [" << comp << "]\n");
                // TBD bs-be correct computation. Need to store the
                // values in a parallel list
                // NOTE(review): the status returned by this call is
                // ignored, and pos-gsz only designates the first word
                // of comp for some window/candidate sizes - confirm
                // that both are intended.
                TermProc::takeword(comp, pos-gsz, bs-comp.size(), be);
            }
        }
        return TermProc::takeword(term, pos, bs, be);
    }
 private:
    // Multiword synonyms, each stored as space-separated words. This
    // is a reference to the set returned by SynGroups::getmultiwords().
    const std::set<std::string>& m_groups;
    // Value of SynGroups::getmultiwordsmaxlength() at construction
    // time, used as the window size. takeword() is a pure passthrough
    // when this is less than 2.
    size_t m_maxl{0};
    // The most recent terms, oldest first, at most m_maxl of them.
    std::list<std::string> m_terms;
 };
 /** Handle common-gram generation: combine frequent terms with neighbours to
 *  shorten the positions lists for phrase searches.
 *  NOTE: This does not currently work because of bad interaction with the
@ -241,13 +271,11 @@ private:
 class TermProcCommongrams : public TermProc {
 public:
    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
-        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
+        : TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
    {
    }
-    virtual bool takeword(const string& term, int pos, int bs, int be)
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
-    {
+        LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
-        LOGDEB1("TermProcCom::takeword: pos "  << (pos) << " "  << (bs) << " "  << (be) << " ["  << (term) << "]\n" );
+                be << " [" << term << "]\n");
        bool isstop = m_stops.isStop(term);
        bool twogramemit = false;
@ -287,8 +315,7 @@ public:
        return true;
    }
-    virtual bool flush()
+    virtual bool flush() {
    {
        if (!m_prevsent && !m_prevterm.empty())
            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
                return false;
@ -297,8 +324,7 @@ public:
        m_prevsent = true;
        return TermProc::flush();
    }
-    void onlygrams(bool on)
+    void onlygrams(bool on) {
    {
        m_onlygrams = on;
    }
 private:
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
 # space issues.</descr></var> 
 #idxtexttruncatelen = 0
 # <var name="idxsynonyms" type="fn">
 #
 # <brief>Name of the index-time synonyms file.</brief>
 # <descr>This is used for indexing multiword synonyms as single terms,
 # which in turn is only useful if you want to perform proximity searches
 # with such terms.</descr></var>
 #idxsynonyms = thereisnodefaultidxsynonyms
 # <var name="aspellLanguage" type="string">
 #
 # <brief>Language definitions to use when creating the aspell