From aa2f0bfd73b71fbdc32e2383033ee62f01033f0a Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 15 Jan 2021 12:04:06 +0100 Subject: [PATCH] implemented multi-word terms indexing for phrase/prox search on multiword synonyms --- src/common/rclconfig.cpp | 4 +- src/common/rclconfig.h | 70 ++++++++-------- src/common/syngroups.cpp | 155 +++++++++++++++++++++++------------- src/common/syngroups.h | 10 ++- src/rcldb/rcldb.cpp | 43 +++++----- src/rcldb/rcldb.h | 51 ++++++------ src/rcldb/rclterms.cpp | 2 +- src/rcldb/searchdatatox.cpp | 50 ++++++------ src/rcldb/termproc.h | 120 +++++++++++++++++----------- src/sampleconf/recoll.conf | 8 ++ 10 files changed, 296 insertions(+), 217 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 30f88255..f2f3f282 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const return getConfdirPath("stoplistfile", "stoplist.txt"); } -string RclConfig::getSynGroupsFile() const +string RclConfig::getIdxSynGroupsFile() const { - return getConfdirPath("syngroupsfile", "syngroups.txt"); + return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms"); } // The index status file is fast changing, so it's possible to put it outside diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 7e7bb22b..cf00d094 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -72,8 +72,8 @@ private: // Hold the description for an external metadata-gathering command struct MDReaper { - string fieldname; - vector cmdv; + string fieldname; + vector cmdv; }; // Data associated to a indexed field name: @@ -90,7 +90,7 @@ struct FieldTraits { }; class RclConfig { - public: +public: // Constructor: we normally look for a configuration file, except // if this was specified on the command line and passed through @@ -100,7 +100,7 @@ class RclConfig { RclConfig(const RclConfig &r); ~RclConfig() { - freeAll(); + freeAll(); } // Return a writable clone of the main config. This belongs to the @@ -133,18 +133,16 @@ class RclConfig { string getKeyDir() const {return m_keydir;} /** Get generic configuration parameter according to current keydir */ - bool getConfParam(const string &name, string &value, - bool shallow=false) const - { - if (m_conf == 0) - return false; - return m_conf->get(name, value, m_keydir, shallow); + bool getConfParam(const string& name, string& value, + bool shallow=false) const { + if (m_conf == 0) + return false; + return m_conf->get(name, value, m_keydir, shallow); } /** Variant with autoconversion to int */ bool getConfParam(const string &name, int *value, bool shallow=false) const; /** Variant with autoconversion to bool */ - bool getConfParam(const string &name, bool *value, - bool shallow=false) const; + bool getConfParam(const string &name, bool *value, bool shallow=false) const; /** Variant with conversion to vector * (stringToStrings). Can fail if the string is malformed. */ bool getConfParam(const string &name, vector *value, @@ -164,18 +162,15 @@ class RclConfig { * Get list of config names under current sk, with possible * wildcard filtering */ - vector getConfNames(const char *pattern = 0) const - { - return m_conf->getNames(m_keydir, pattern); + vector getConfNames(const char *pattern = 0) const { + return m_conf->getNames(m_keydir, pattern); } /** Check if name exists anywhere in config */ - bool hasNameAnywhere(const string& nm) const - { + bool hasNameAnywhere(const string& nm) const { return m_conf? m_conf->hasNameAnywhere(nm) : false; } - /** Get default charset for current keydir (was set during setKeydir) * filenames are handled differently */ const string &getDefCharset(bool filename = false) const; @@ -198,7 +193,7 @@ class RclConfig { /** Get stoplist file name */ string getStopfile() const; /** Get synonym groups file name */ - string getSynGroupsFile() const; + string getIdxSynGroupsFile() const; /** Get indexing pid file name */ string getPidfile() const; /** Get indexing status file name */ @@ -207,7 +202,7 @@ class RclConfig { /** Do path translation according to the ptrans table */ void urlrewrite(const string& dbdir, string& url) const; ConfSimple *getPTrans() { - return m_ptrans; + return m_ptrans; } /** Get Web Queue directory name */ string getWebQueueDir() const; @@ -215,13 +210,13 @@ class RclConfig { /** Get list of skipped file names for current keydir */ vector& getSkippedNames(); /** Get list of file name filters for current keydir (only those - names indexed) */ + names indexed) */ vector& getOnlyNames(); /** Get list of skipped paths patterns. Doesn't depend on the keydir */ vector getSkippedPaths() const; /** Get list of skipped paths patterns, daemon version (may add some) - Doesn't depend on the keydir */ + Doesn't depend on the keydir */ vector getDaemSkippedPaths() const; /** Return list of no content suffixes. Used by confgui, indexing uses @@ -260,7 +255,7 @@ class RclConfig { * @param whole the raw value. No way to escape a semi-colon in there. */ static bool valueSplitAttributes(const string& whole, string& value, - ConfSimple& attrs) ; + ConfSimple& attrs) ; /** Compute difference between 'base' and 'changed', as elements to be * added and substracted from base. Input and output strings are in @@ -288,9 +283,9 @@ class RclConfig { bool getGuiFilter(const string& filtername, string& frag) const; /** fields: get field prefix from field name. Use additional query - aliases if isquery is set */ + aliases if isquery is set */ bool getFieldTraits(const string& fldname, const FieldTraits **, - bool isquery = false) const; + bool isquery = false) const; const set& getStoredFields() const {return m_storedFields;} @@ -311,11 +306,11 @@ class RclConfig { */ vector getFieldSectNames(const string &sk, const char* = 0) const; bool getFieldConfParam(const string &name, const string &sk, string &value) - const; + const; /** mimeview: get/set external viewer exec string(s) for mimetype(s) */ string getMimeViewerDef(const string &mimetype, const string& apptag, - bool useall) const; + bool useall) const; set getMimeViewerAllEx() const; bool setMimeViewerAllEx(const set& allex); bool getMimeViewerDefs(vector >&) const; @@ -358,26 +353,25 @@ class RclConfig { string findFilter(const string& cmd) const; /** Thread config init is not done automatically because not all - programs need it and it uses the debug log so that it's better to - call it after primary init */ + programs need it and it uses the debug log so that it's better to + call it after primary init */ void initThrConf(); - const string& getOrigCwd() - { - return o_origcwd; + const string& getOrigCwd() { + return o_origcwd; } RclConfig& operator=(const RclConfig &r) { - if (this != &r) { - freeAll(); - initFrom(r); - } - return *this; + if (this != &r) { + freeAll(); + initFrom(r); + } + return *this; } friend class ParamStale; - private: +private: int m_ok; string m_reason; // Explanation for bad state string m_confdir; // User directory where the customized files are stored diff --git a/src/common/syngroups.cpp b/src/common/syngroups.cpp index 5bd33f2b..3b95022a 100644 --- a/src/common/syngroups.cpp +++ b/src/common/syngroups.cpp @@ -44,8 +44,7 @@ using namespace std; // groups anyway class SynGroups::Internal { public: - Internal() : ok(false) { - } + Internal() {} void setpath(const string& fn) { path = path_canon(fn); stat(path.c_str(), &st); @@ -61,16 +60,22 @@ public: } return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size; } - bool ok; + bool ok{false}; // Term to group num std::unordered_map terms; // Group num to group vector > groups; + + // Aux: set of multiword synonyms used for generating multiword + // terms while indexing + std::set multiwords; + size_t multiwords_maxlen{0}; + std::string path; struct stat st; }; -bool SynGroups::ok() +bool SynGroups::ok() const { return m && m->ok; } @@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn) if (fn.empty()) { delete m; m = 0; - return true; + return true; } if (m->samefile(fn)) { @@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn) ifstream input; input.open(fn.c_str(), ios::in); if (!input.is_open()) { - LOGSYSERR("SynGroups:setfile", "open", fn); - return false; + LOGSYSERR("SynGroups:setfile", "open", fn); + return false; } string cline; @@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn) string line; bool eof = false; int lnum = 0; - + m->groups.clear(); + m->terms.clear(); + m->multiwords.clear(); + m->multiwords_maxlen = 0; for (;;) { cline.clear(); - getline(input, cline); - if (!input.good()) { - if (input.bad()) { + getline(input, cline); + if (!input.good()) { + if (input.bad()) { LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n"); - return false; - } - // Must be eof ? But maybe we have a partial line which - // must be processed. This happens if the last line before - // eof ends with a backslash, or there is no final \n + return false; + } + // Must be eof ? But maybe we have a partial line which + // must be processed. This happens if the last line before + // eof ends with a backslash, or there is no final \n eof = true; - } - lnum++; + } + lnum++; { string::size_type pos = cline.find_last_not_of("\n\r"); @@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn) } } - if (appending) - line += cline; - else - line = cline; + if (appending) + line += cline; + else + line = cline; - // Note that we trim whitespace before checking for backslash-eol - // This avoids invisible whitespace problems. - trimstring(line); - if (line.empty() || line.at(0) == '#') { + // Note that we trim whitespace before checking for backslash-eol + // This avoids invisible whitespace problems. + trimstring(line); + if (line.empty() || line.at(0) == '#') { if (eof) break; - continue; - } - if (line[line.length() - 1] == '\\') { - line.erase(line.length() - 1); - appending = true; - continue; - } - appending = false; + continue; + } + if (line[line.length() - 1] == '\\') { + line.erase(line.length() - 1); + appending = true; + continue; + } + appending = false; - vector words; - if (!stringToStrings(line, words)) { - LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum << + vector words; + if (!stringToStrings(line, words)) { + LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum << ": " << line << "\n"); - continue; - } + continue; + } - if (words.empty()) - continue; - if (words.size() == 1) { - LOGERR("Syngroup::setfile(" << fn << "):single term group at line " + if (words.empty()) + continue; + if (words.size() == 1) { + LOGERR("Syngroup::setfile(" << fn << "):single term group at line " << lnum << " ??\n"); - continue; - } + continue; + } - m->groups.push_back(words); - for (const auto& word : words) { - m->terms[word] = m->groups.size()-1; - } - LOGDEB1("SynGroups::setfile: group: [" << + m->groups.push_back(words); + for (const auto& word : words) { + m->terms[word] = m->groups.size()-1; + } + LOGDEB1("SynGroups::setfile: group: [" << stringsToString(m->groups.back()) << "]\n"); } - LOGDEB("SynGroups::setfile: got " << m->groups.size() << - " distinct terms." << endl); + + for (const auto& group : m->groups) { + for (const auto& term : group) { + std::vector words; + stringToTokens(term, words); + if (words.size() > 1) { + std::string multiword; + for (const auto& word : words) { + if (!multiword.empty()) { + multiword += " "; + } + multiword += word; + } + m->multiwords.insert(multiword); + if (m->multiwords_maxlen < words.size()) { + m->multiwords_maxlen = words.size(); + } + } + } + } + LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. " + "Multiwords: " << stringsToString(m->multiwords) <<"\n"); m->ok = true; m->setpath(fn); return true; } -vector SynGroups::getgroup(const string& term) +vector SynGroups::getgroup(const string& term) const { vector ret; if (!ok()) - return ret; + return ret; const auto it1 = m->terms.find(term); if (it1 == m->terms.end()) { - LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n"); - return ret; + LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n"); + return ret; } unsigned int idx = it1->second; @@ -215,3 +243,18 @@ vector SynGroups::getgroup(const string& term) << endl); return m->groups[idx]; } + +const std::set& SynGroups::getmultiwords() const +{ + return m->multiwords; +} + +size_t SynGroups::getmultiwordsmaxlength() const +{ + return m->multiwords_maxlen; +} + +const std::string& SynGroups::getpath() const +{ + return m->path; +} diff --git a/src/common/syngroups.h b/src/common/syngroups.h index b5631204..520929fc 100644 --- a/src/common/syngroups.h +++ b/src/common/syngroups.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2015 J.F.Dockes +/* Copyright (C) 2015-2021 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -20,6 +20,7 @@ #include #include +#include // Manage synonym groups. This is very different from stemming and // case/diac expansion because there is no reference form: all terms @@ -34,8 +35,11 @@ public: SynGroups& operator=(const SynGroups&&) = delete; bool setfile(const std::string& fname); - std::vector getgroup(const std::string& term); - bool ok(); + std::vector getgroup(const std::string& term) const; + const std::set& getmultiwords() const; + size_t getmultiwordsmaxlength() const; + const std::string& getpath() const; + bool ok() const; private: class Internal; Internal *m; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 3b43ef7a..b242b801 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, bool Db::o_inPlaceReset; Db::Db(const RclConfig *cfp) - : m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0), - m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150), - m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4), - m_flushMb(-1), m_maxFsOccupPc(0) { m_config = new RclConfig(*cfp); + m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); + m_config->getConfParam("idxflushmb", &m_flushMb); + m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen); + m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen); if (start_of_field_term.empty()) { if (o_index_stripchars) { start_of_field_term = "XXST"; @@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp) end_of_field_term = "XXND/"; } } - m_ndb = new Native(this); - if (m_config) { - m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); - m_config->getConfParam("idxflushmb", &m_flushMb); - m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen); - m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen); - } } Db::~Db() { LOGDEB2("Db::~Db\n"); - if (m_ndb == 0) + if (nullptr == m_ndb) return; LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " << m_ndb->m_iswritable << "\n"); @@ -913,7 +906,6 @@ vector Db::getStemmerNames() return res; } - bool Db::open(OpenMode mode, OpenError *error) { if (error) @@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error) if (!m_config->getStopfile().empty()) m_stops.setFile(m_config->getStopfile()); + if (isWriteMode(mode)) { + // Check for an index-time synonyms file. We use this to + // generate multiword terms for multiword synonyms + string synfile = m_config->getIdxSynGroupsFile(); + if (path_exists(synfile)) { + setSynGroupsFile(synfile); + } + } + string dir = m_config->getDbDir(); string ermsg; try { - switch (mode) { - case DbUpd: - case DbTrunc: + if (isWriteMode(mode)) { m_ndb->openWrite(dir, mode); updated = vector(m_ndb->xwdb.get_lastdocid() + 1, false); // We used to open a readonly object in addition to the @@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error) // so the query db is now a clone of the update one. m_ndb->xrdb = m_ndb->xwdb; LOGDEB("Db::open: lastdocid: " <xwdb.get_lastdocid()<<"\n"); - break; - case DbRO: - default: + } else { m_ndb->openRead(dir); for (auto& db : m_extraDbs) { if (error) @@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error) // but I can't see why m_ndb->xrdb.add_database(Xapian::Database(db)); } - break; } if (error) *error = DbOpenMainDb; @@ -1531,10 +1527,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) TermProcStop tpstop(nxt, m_stops);nxt = &tpstop; //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon; + TermProcMulti tpmulti(nxt, m_syngroups); + if (m_syngroups.getmultiwordsmaxlength() > 1) { + nxt = &tpmulti; + } + TermProcPrep tpprep(nxt); if (o_index_stripchars) nxt = &tpprep; - + TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt); tpidx.setTSD(&splitter); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index ba11d5fd..f695d6a1 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -114,14 +114,13 @@ public: class DbStats { public: - DbStats() - :dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {} + DbStats() {} // Index-wide stats - unsigned int dbdoccount; - double dbavgdoclen; - size_t mindoclen; - size_t maxdoclen; - vector failedurls; /* Only set if requested */ + unsigned int dbdoccount{0}; + double dbavgdoclen{0}; + size_t mindoclen{0}; + size_t maxdoclen{0}; + std::vector failedurls; /* Only set if requested */ }; inline bool has_prefix(const string& trm) @@ -175,6 +174,9 @@ public: ~Db(); enum OpenMode {DbRO, DbUpd, DbTrunc}; + bool isWriteMode(OpenMode mode) { + return mode == DbUpd || mode == DbTrunc; + } enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb}; bool open(OpenMode mode, OpenError *error = 0); bool close(); @@ -342,7 +344,7 @@ public: bool setExtraQueryDbs(const std::vector& dbs); /** Check if document comes from the main index (this is used to - decide if we can update the index for it */ + decide if we can update the index for it */ bool fromMainIndex(const Doc& doc); /** Retrieve the stored doc text. This returns false if the index does not @@ -499,7 +501,8 @@ public: // Use empty fn for no synonyms bool setSynGroupsFile(const std::string& fn); - + const SynGroups& getSynGroups() {return m_syngroups;} + // Mark all documents with an UDI having input as prefix as // existing. Only works if the UDIs for the store are // hierarchical of course. Used by FsIndexer to avoid purging @@ -508,25 +511,26 @@ public: bool udiTreeMarkExisting(const string& udi); /* This has to be public for access by embedded Query::Native */ - Native *m_ndb; + Native *m_ndb{nullptr}; + private: const RclConfig *m_config; string m_reason; // Error explanation // Xapian directories for additional databases to query vector m_extraDbs; - OpenMode m_mode; + OpenMode m_mode{Db::DbRO}; // File existence vector: this is filled during the indexing pass. Any // document whose bit is not set at the end is purged vector updated; // Text bytes indexed since beginning - long long m_curtxtsz; + long long m_curtxtsz{0}; // Text bytes at last flush - long long m_flushtxtsz; + long long m_flushtxtsz{0}; // Text bytes at last fsoccup check - long long m_occtxtsz; + long long m_occtxtsz{0}; // First fs occup check ? - int m_occFirstCheck; + int m_occFirstCheck{1}; // Synonym groups. There is no strict reason that this has to be // an Rcl::Db member, as it is only used when building each It @@ -538,32 +542,31 @@ private: SynGroups m_syngroups; // Aspell object if needed - Aspell *m_aspell = nullptr; - + Aspell *m_aspell{nullptr}; + /*************** * Parameters cached out of the configuration files. Logically const * after init */ // Stop terms: those don't get indexed. StopList m_stops; - // Truncation length for stored meta fields - int m_idxMetaStoredLen; + int m_idxMetaStoredLen{150}; // This is how long an abstract we keep or build from beginning of // text when indexing. It only has an influence on the size of the // db as we are free to shorten it again when displaying - int m_idxAbsTruncLen; + int m_idxAbsTruncLen{250}; // Document text truncation length int m_idxTextTruncateLen{0}; // This is the size of the abstract that we synthetize out of query // term contexts at *query time* - int m_synthAbsLen; + int m_synthAbsLen{250}; // This is how many words (context size) we keep around query terms // when building the abstract - int m_synthAbsWordCtxLen; + int m_synthAbsWordCtxLen{4}; // Flush threshold. Megabytes of text indexed before we flush. - int m_flushMb; + int m_flushMb{-1}; // Maximum file system occupation percentage - int m_maxFsOccupPc; + int m_maxFsOccupPc{0}; // Database directory string m_basedir; // When this is set, all documents are considered as needing a reindex. diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp index 5faa919e..35dacba6 100644 --- a/src/rcldb/rclterms.cpp +++ b/src/rcldb/rclterms.cpp @@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term, LOGDEB("Db::TermMatch: syngroups out: " << term << " -> " << stringsToString(sg) << "\n"); for (const auto& synonym : sg) { - if (synonym.find_first_of(" ") != string::npos) { + if (synonym.find(' ') != string::npos) { if (multiwords) { multiwords->push_back(synonym); } diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index be281cbf..f61957e9 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan( } // Push phrases for the multi-word expansions - for (vector::const_iterator mwp = multiwords.begin(); - mwp != multiwords.end(); mwp++) { + for (const auto& mw : multiwords) { vector phr; // We just do a basic split to keep things a bit simpler here // (no textsplit). This means though that no punctuation is // allowed in multi-word synonyms. - stringToTokens(*mwp, phr); + stringToTokens(mw, phr); if (!prefix.empty()) prefix_vector(phr, prefix); xq = Xapian::Query(Xapian::Query::OP_OR, xq, @@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan( // NEAR xapian query, the elements of which can themselves be OR // queries if the terms get expanded by stemming or wildcards (we // don't do stemming for PHRASE though) -void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, - TermProcQ *splitData, - int mods, void *pq, - bool useNear, int slack) +void SearchDataClauseSimple::processPhraseOrNear( + Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq, + bool useNear, int slack) { vector &pqueries(*(vector*)pq); Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : Xapian::Query::OP_PHRASE; vector orqueries; -#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF - bool hadmultiple = false; -#endif vector >groups; + bool useidxsynonyms = + db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile(); + string prefix; const FieldTraits *ftp; if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { @@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, } // Go through the list and perform stem/wildcard expansion for each element - vector::const_iterator nxit = splitData->nostemexps().begin(); - for (vector::const_iterator it = splitData->terms().begin(); + auto nxit = splitData->nostemexps().begin(); + for (auto it = splitData->terms().begin(); it != splitData->terms().end(); it++, nxit++) { LOGDEB0("ProcessPhrase: processing [" << *it << "]\n"); // Adjust when we do stem expansion. Not if disabled by // caller, not inside phrases, and some versions of xapian // will accept only one OR clause inside NEAR. - bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) -#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF - || hadmultiple -#endif // single OR inside NEAR - ; + bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE); int lmods = mods; if (nostemexp) lmods |= SearchDataClause::SDCM_NOSTEMMING; string sterm; vector exp; - if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) + vector multiwords; + if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords)) return; + + // Note: because of how expandTerm works, the multiwords can + // only come from the synonyms expansion, which means that, if + // idxsynonyms is set, they have each been indexed as a single + // term. So, if idxsynonyms is set, and is the current active + // synonyms file, we just add them to the expansion. + if (!multiwords.empty() && useidxsynonyms) { + exp.insert(exp.end(), multiwords.begin(), multiwords.end()); + } + LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " << stringsToString(exp) << "\n"); // groups is used for highlighting, we don't want prefixes in there. vector noprefs; - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - noprefs.push_back(it->substr(prefix.size())); + for (const auto& prefterm : exp) { + noprefs.push_back(prefterm.substr(prefix.size())); } groups.push_back(noprefs); orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, @@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, m_curcl += exp.size(); if (m_curcl >= getMaxCl()) return; -#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF - if (exp.size() > 1) - hadmultiple = true; -#endif } if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) { diff --git a/src/rcldb/termproc.h b/src/rcldb/termproc.h index e0f5950d..c276ff1b 100644 --- a/src/rcldb/termproc.h +++ b/src/rcldb/termproc.h @@ -19,12 +19,15 @@ #include #include +#include +#include #include "textsplit.h" #include "stoplist.h" #include "smallut.h" #include "utf8iter.h" #include "unacpp.h" +#include "syngroups.h" namespace Rcl { @@ -52,11 +55,13 @@ class TermProc { public: TermProc(TermProc* next) : m_next(next) {} virtual ~TermProc() {} + /* Copyconst and assignment forbidden */ + TermProc(const TermProc &) = delete; + TermProc& operator=(const TermProc &) = delete; virtual bool takeword(const string &term, int pos, int bs, int be) { if (m_next) return m_next->takeword(term, pos, bs, be); - else - return true; + return true; } // newpage() is like takeword(), but for page breaks. virtual void newpage(int pos) { @@ -66,16 +71,10 @@ public: virtual bool flush() { if (m_next) return m_next->flush(); - else - return true; + return true; } private: TermProc *m_next; - /* Copyconst and assignment private and forbidden */ - TermProc(const TermProc &) {} - TermProc& operator=(const TermProc &) { - return *this; - }; }; /** @@ -100,8 +99,7 @@ public: virtual bool takeword(const string& term, int pos, int bs, int be) { if (m_prc) return m_prc->takeword(term, pos, bs, be); - else - return true; + return true; } virtual void newpage(int pos) { @@ -119,12 +117,9 @@ private: class TermProcPrep : public TermProc { public: TermProcPrep(TermProc *nxt) - : TermProc(nxt), m_totalterms(0), m_unacerrors(0) - { - } + : TermProc(nxt) {} - virtual bool takeword(const string& itrm, int pos, int bs, int be) - { + virtual bool takeword(const string& itrm, int pos, int bs, int be) { m_totalterms++; string otrm; @@ -179,49 +174,37 @@ public: // change in here. This means that phrase searches and // snippets will be wrong, but at least searching for the // terms will work. - bool hasspace = false; - for (string::const_iterator it = otrm.begin();it < otrm.end();it++) { - if (*it == ' ') { - hasspace=true; - break; - } - } + bool hasspace = otrm.find(' ') != std::string::npos; if (hasspace) { std::vector terms; stringToTokens(otrm, terms, " ", true); - for (std::vector::const_iterator it = terms.begin(); - it < terms.end(); it++) { - if (!TermProc::takeword(*it, pos, bs, be)) { + for (const auto& term : terms) { + if (!TermProc::takeword(term, pos, bs, be)) { return false; } } return true; - } else { - return TermProc::takeword(otrm, pos, bs, be); } + return TermProc::takeword(otrm, pos, bs, be); } - virtual bool flush() - { + virtual bool flush() { m_totalterms = m_unacerrors = 0; return TermProc::flush(); } private: - int m_totalterms; - int m_unacerrors; + int m_totalterms{0}; + int m_unacerrors{0}; }; /** Compare to stop words list and discard if match found */ class TermProcStop : public TermProc { public: TermProcStop(TermProc *nxt, const Rcl::StopList& stops) - : TermProc(nxt), m_stops(stops) - { - } + : TermProc(nxt), m_stops(stops) {} - virtual bool takeword(const string& term, int pos, int bs, int be) - { + virtual bool takeword(const string& term, int pos, int bs, int be) { if (m_stops.isStop(term)) { return true; } @@ -232,6 +215,53 @@ private: const Rcl::StopList& m_stops; }; +/** Generate multiword terms for multiword synonyms. This allows + * NEAR/PHRASE searches for multiword synonyms. */ +class TermProcMulti : public TermProc { +public: + TermProcMulti(TermProc *nxt, const SynGroups& sg) + : TermProc(nxt), m_groups(sg.getmultiwords()), + m_maxl(sg.getmultiwordsmaxlength()) {} + + virtual bool takeword(const string& term, int pos, int bs, int be) { + if (m_maxl < 2) { + // Should not have been pushed?? + return TermProc::takeword(term, pos, bs, be); + } + m_terms.push_back(term); + if (m_terms.size() > m_maxl) { + m_terms.pop_front(); + } + string comp; + int gsz{1}; + for (const auto& gterm : m_terms) { + if (comp.empty()) { + comp = gterm; + continue; + } else { + comp += " "; + comp += gterm; + gsz++; + // We could optimize by not testing m_groups for sizes + // which do not exist. + // if not gsz in sizes continue; + } + if (m_groups.find(comp) != m_groups.end()) { + LOGDEB1("Found multiword synonym: [" << comp << "]\n"); + // TBD bs-be correct computation. Need to store the + // values in a parallel list + TermProc::takeword(comp, pos-gsz, bs-comp.size(), be); + } + } + return TermProc::takeword(term, pos, bs, be); + } + +private: + const std::set& m_groups; + size_t m_maxl{0}; + std::list m_terms; +}; + /** Handle common-gram generation: combine frequent terms with neighbours to * shorten the positions lists for phrase searches. * NOTE: This does not currently work because of bad interaction with the @@ -241,13 +271,11 @@ private: class TermProcCommongrams : public TermProc { public: TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) - : TermProc(nxt), m_stops(stops), m_onlygrams(false) - { - } + : TermProc(nxt), m_stops(stops), m_onlygrams(false) {} - virtual bool takeword(const string& term, int pos, int bs, int be) - { - LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" ); + virtual bool takeword(const string& term, int pos, int bs, int be) { + LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " << + be << " [" << term << "]\n"); bool isstop = m_stops.isStop(term); bool twogramemit = false; @@ -287,8 +315,7 @@ public: return true; } - virtual bool flush() - { + virtual bool flush() { if (!m_prevsent && !m_prevterm.empty()) if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) return false; @@ -297,8 +324,7 @@ public: m_prevsent = true; return TermProc::flush(); } - void onlygrams(bool on) - { + void onlygrams(bool on) { m_onlygrams = on; } private: diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 8d161247..99f1335b 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh # space issues. #idxtexttruncatelen = 0 +# +# +# Name of the index-time synonyms file. +# This is used for indexing multiword synonyms as single terms, +# which in turn is only useful if you want to perform proximity searches +# with such terms. +#idxsynonyms = thereisnodefaultidxsynonyms + # # # Language definitions to use when creating the aspell