Implemented multi-word term indexing for phrase/proximity search on multiword synonyms

This commit is contained in:
Jean-Francois Dockes 2021-01-15 12:04:06 +01:00
parent cb13b8b6df
commit aa2f0bfd73
10 changed files with 296 additions and 217 deletions

View File

@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
return getConfdirPath("stoplistfile", "stoplist.txt"); return getConfdirPath("stoplistfile", "stoplist.txt");
} }
string RclConfig::getSynGroupsFile() const string RclConfig::getIdxSynGroupsFile() const
{ {
return getConfdirPath("syngroupsfile", "syngroups.txt"); return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
} }
// The index status file is fast changing, so it's possible to put it outside // The index status file is fast changing, so it's possible to put it outside

View File

@ -90,7 +90,7 @@ struct FieldTraits {
}; };
class RclConfig { class RclConfig {
public: public:
// Constructor: we normally look for a configuration file, except // Constructor: we normally look for a configuration file, except
// if this was specified on the command line and passed through // if this was specified on the command line and passed through
@ -133,9 +133,8 @@ class RclConfig {
string getKeyDir() const {return m_keydir;} string getKeyDir() const {return m_keydir;}
/** Get generic configuration parameter according to current keydir */ /** Get generic configuration parameter according to current keydir */
bool getConfParam(const string &name, string &value, bool getConfParam(const string& name, string& value,
bool shallow=false) const bool shallow=false) const {
{
if (m_conf == 0) if (m_conf == 0)
return false; return false;
return m_conf->get(name, value, m_keydir, shallow); return m_conf->get(name, value, m_keydir, shallow);
@ -143,8 +142,7 @@ class RclConfig {
/** Variant with autoconversion to int */ /** Variant with autoconversion to int */
bool getConfParam(const string &name, int *value, bool shallow=false) const; bool getConfParam(const string &name, int *value, bool shallow=false) const;
/** Variant with autoconversion to bool */ /** Variant with autoconversion to bool */
bool getConfParam(const string &name, bool *value, bool getConfParam(const string &name, bool *value, bool shallow=false) const;
bool shallow=false) const;
/** Variant with conversion to vector<string> /** Variant with conversion to vector<string>
* (stringToStrings). Can fail if the string is malformed. */ * (stringToStrings). Can fail if the string is malformed. */
bool getConfParam(const string &name, vector<string> *value, bool getConfParam(const string &name, vector<string> *value,
@ -164,18 +162,15 @@ class RclConfig {
* Get list of config names under current sk, with possible * Get list of config names under current sk, with possible
* wildcard filtering * wildcard filtering
*/ */
vector<string> getConfNames(const char *pattern = 0) const vector<string> getConfNames(const char *pattern = 0) const {
{
return m_conf->getNames(m_keydir, pattern); return m_conf->getNames(m_keydir, pattern);
} }
/** Check if name exists anywhere in config */ /** Check if name exists anywhere in config */
bool hasNameAnywhere(const string& nm) const bool hasNameAnywhere(const string& nm) const {
{
return m_conf? m_conf->hasNameAnywhere(nm) : false; return m_conf? m_conf->hasNameAnywhere(nm) : false;
} }
/** Get default charset for current keydir (was set during setKeydir) /** Get default charset for current keydir (was set during setKeydir)
* filenames are handled differently */ * filenames are handled differently */
const string &getDefCharset(bool filename = false) const; const string &getDefCharset(bool filename = false) const;
@ -198,7 +193,7 @@ class RclConfig {
/** Get stoplist file name */ /** Get stoplist file name */
string getStopfile() const; string getStopfile() const;
/** Get synonym groups file name */ /** Get synonym groups file name */
string getSynGroupsFile() const; string getIdxSynGroupsFile() const;
/** Get indexing pid file name */ /** Get indexing pid file name */
string getPidfile() const; string getPidfile() const;
/** Get indexing status file name */ /** Get indexing status file name */
@ -362,8 +357,7 @@ class RclConfig {
call it after primary init */ call it after primary init */
void initThrConf(); void initThrConf();
const string& getOrigCwd() const string& getOrigCwd() {
{
return o_origcwd; return o_origcwd;
} }
@ -377,7 +371,7 @@ class RclConfig {
friend class ParamStale; friend class ParamStale;
private: private:
int m_ok; int m_ok;
string m_reason; // Explanation for bad state string m_reason; // Explanation for bad state
string m_confdir; // User directory where the customized files are stored string m_confdir; // User directory where the customized files are stored

View File

@ -44,8 +44,7 @@ using namespace std;
// groups anyway // groups anyway
class SynGroups::Internal { class SynGroups::Internal {
public: public:
Internal() : ok(false) { Internal() {}
}
void setpath(const string& fn) { void setpath(const string& fn) {
path = path_canon(fn); path = path_canon(fn);
stat(path.c_str(), &st); stat(path.c_str(), &st);
@ -61,16 +60,22 @@ public:
} }
return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size; return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
} }
bool ok; bool ok{false};
// Term to group num // Term to group num
std::unordered_map<string, unsigned int> terms; std::unordered_map<string, unsigned int> terms;
// Group num to group // Group num to group
vector<vector<string> > groups; vector<vector<string> > groups;
// Aux: set of multiword synonyms used for generating multiword
// terms while indexing
std::set<std::string> multiwords;
size_t multiwords_maxlen{0};
std::string path; std::string path;
struct stat st; struct stat st;
}; };
bool SynGroups::ok() bool SynGroups::ok() const
{ {
return m && m->ok; return m && m->ok;
} }
@ -120,7 +125,10 @@ bool SynGroups::setfile(const string& fn)
string line; string line;
bool eof = false; bool eof = false;
int lnum = 0; int lnum = 0;
m->groups.clear();
m->terms.clear();
m->multiwords.clear();
m->multiwords_maxlen = 0;
for (;;) { for (;;) {
cline.clear(); cline.clear();
getline(input, cline); getline(input, cline);
@ -187,14 +195,34 @@ bool SynGroups::setfile(const string& fn)
LOGDEB1("SynGroups::setfile: group: [" << LOGDEB1("SynGroups::setfile: group: [" <<
stringsToString(m->groups.back()) << "]\n"); stringsToString(m->groups.back()) << "]\n");
} }
LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
" distinct terms." << endl); for (const auto& group : m->groups) {
for (const auto& term : group) {
std::vector<std::string> words;
stringToTokens(term, words);
if (words.size() > 1) {
std::string multiword;
for (const auto& word : words) {
if (!multiword.empty()) {
multiword += " ";
}
multiword += word;
}
m->multiwords.insert(multiword);
if (m->multiwords_maxlen < words.size()) {
m->multiwords_maxlen = words.size();
}
}
}
}
LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
"Multiwords: " << stringsToString(m->multiwords) <<"\n");
m->ok = true; m->ok = true;
m->setpath(fn); m->setpath(fn);
return true; return true;
} }
vector<string> SynGroups::getgroup(const string& term) vector<string> SynGroups::getgroup(const string& term) const
{ {
vector<string> ret; vector<string> ret;
if (!ok()) if (!ok())
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
<< endl); << endl);
return m->groups[idx]; return m->groups[idx];
} }
// Return the multiword synonyms collected by setfile(): every group member
// made of more than one word, with its words joined by single spaces.
// The returned reference is empty if no internal state was allocated.
const std::set<std::string>& SynGroups::getmultiwords() const
{
    // ok() treats a null m as a valid (not ok) state, so do the same here
    // instead of dereferencing it blindly.
    static const std::set<std::string> nomultiwords;
    if (nullptr == m) {
        return nomultiwords;
    }
    return m->multiwords;
}
// Return the word count of the longest multiword synonym found by
// setfile(), or 0 if there are none or no internal state was allocated.
size_t SynGroups::getmultiwordsmaxlength() const
{
    // Same null tolerance as ok(): a missing Internal means "no data".
    if (nullptr == m) {
        return 0;
    }
    return m->multiwords_maxlen;
}
// Return the canonical path recorded by the last successful setfile() call.
// The returned reference is empty if no internal state was allocated.
const std::string& SynGroups::getpath() const
{
    // Same null tolerance as ok(): a missing Internal means "no file".
    static const std::string nopath;
    if (nullptr == m) {
        return nopath;
    }
    return m->path;
}

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2015 J.F.Dockes /* Copyright (C) 2015-2021 J.F.Dockes
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
@ -20,6 +20,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <set>
// Manage synonym groups. This is very different from stemming and // Manage synonym groups. This is very different from stemming and
// case/diac expansion because there is no reference form: all terms // case/diac expansion because there is no reference form: all terms
@ -34,8 +35,11 @@ public:
SynGroups& operator=(const SynGroups&&) = delete; SynGroups& operator=(const SynGroups&&) = delete;
bool setfile(const std::string& fname); bool setfile(const std::string& fname);
std::vector<std::string> getgroup(const std::string& term); std::vector<std::string> getgroup(const std::string& term) const;
bool ok(); const std::set<std::string>& getmultiwords() const;
size_t getmultiwordsmaxlength() const;
const std::string& getpath() const;
bool ok() const;
private: private:
class Internal; class Internal;
Internal *m; Internal *m;

View File

@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
bool Db::o_inPlaceReset; bool Db::o_inPlaceReset;
Db::Db(const RclConfig *cfp) Db::Db(const RclConfig *cfp)
: m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4),
m_flushMb(-1), m_maxFsOccupPc(0)
{ {
m_config = new RclConfig(*cfp); m_config = new RclConfig(*cfp);
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
m_config->getConfParam("idxflushmb", &m_flushMb);
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
if (start_of_field_term.empty()) { if (start_of_field_term.empty()) {
if (o_index_stripchars) { if (o_index_stripchars) {
start_of_field_term = "XXST"; start_of_field_term = "XXST";
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
end_of_field_term = "XXND/"; end_of_field_term = "XXND/";
} }
} }
m_ndb = new Native(this); m_ndb = new Native(this);
if (m_config) {
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
m_config->getConfParam("idxflushmb", &m_flushMb);
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
}
} }
Db::~Db() Db::~Db()
{ {
LOGDEB2("Db::~Db\n"); LOGDEB2("Db::~Db\n");
if (m_ndb == 0) if (nullptr == m_ndb)
return; return;
LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " << LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
m_ndb->m_iswritable << "\n"); m_ndb->m_iswritable << "\n");
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
return res; return res;
} }
bool Db::open(OpenMode mode, OpenError *error) bool Db::open(OpenMode mode, OpenError *error)
{ {
if (error) if (error)
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
if (!m_config->getStopfile().empty()) if (!m_config->getStopfile().empty())
m_stops.setFile(m_config->getStopfile()); m_stops.setFile(m_config->getStopfile());
if (isWriteMode(mode)) {
// Check for an index-time synonyms file. We use this to
// generate multiword terms for multiword synonyms
string synfile = m_config->getIdxSynGroupsFile();
if (path_exists(synfile)) {
setSynGroupsFile(synfile);
}
}
string dir = m_config->getDbDir(); string dir = m_config->getDbDir();
string ermsg; string ermsg;
try { try {
switch (mode) { if (isWriteMode(mode)) {
case DbUpd:
case DbTrunc:
m_ndb->openWrite(dir, mode); m_ndb->openWrite(dir, mode);
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false); updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
// We used to open a readonly object in addition to the // We used to open a readonly object in addition to the
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
// so the query db is now a clone of the update one. // so the query db is now a clone of the update one.
m_ndb->xrdb = m_ndb->xwdb; m_ndb->xrdb = m_ndb->xwdb;
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n"); LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
break; } else {
case DbRO:
default:
m_ndb->openRead(dir); m_ndb->openRead(dir);
for (auto& db : m_extraDbs) { for (auto& db : m_extraDbs) {
if (error) if (error)
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
// but I can't see why // but I can't see why
m_ndb->xrdb.add_database(Xapian::Database(db)); m_ndb->xrdb.add_database(Xapian::Database(db));
} }
break;
} }
if (error) if (error)
*error = DbOpenMainDb; *error = DbOpenMainDb;
@ -1531,6 +1527,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop; TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon; //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TermProcMulti tpmulti(nxt, m_syngroups);
if (m_syngroups.getmultiwordsmaxlength() > 1) {
nxt = &tpmulti;
}
TermProcPrep tpprep(nxt); TermProcPrep tpprep(nxt);
if (o_index_stripchars) if (o_index_stripchars)
nxt = &tpprep; nxt = &tpprep;

View File

@ -114,14 +114,13 @@ public:
class DbStats { class DbStats {
public: public:
DbStats() DbStats() {}
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
// Index-wide stats // Index-wide stats
unsigned int dbdoccount; unsigned int dbdoccount{0};
double dbavgdoclen; double dbavgdoclen{0};
size_t mindoclen; size_t mindoclen{0};
size_t maxdoclen; size_t maxdoclen{0};
vector<string> failedurls; /* Only set if requested */ std::vector<std::string> failedurls; /* Only set if requested */
}; };
inline bool has_prefix(const string& trm) inline bool has_prefix(const string& trm)
@ -175,6 +174,9 @@ public:
~Db(); ~Db();
enum OpenMode {DbRO, DbUpd, DbTrunc}; enum OpenMode {DbRO, DbUpd, DbTrunc};
/** Tell whether an open mode allows writing to the index.
 * @param mode the open mode to test.
 * @return true for DbUpd and DbTrunc, false otherwise.
 * Reads no object state, hence const, so it is usable on a const Db. */
bool isWriteMode(OpenMode mode) const {
    return mode == DbUpd || mode == DbTrunc;
}
enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb}; enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
bool open(OpenMode mode, OpenError *error = 0); bool open(OpenMode mode, OpenError *error = 0);
bool close(); bool close();
@ -499,6 +501,7 @@ public:
// Use empty fn for no synonyms // Use empty fn for no synonyms
bool setSynGroupsFile(const std::string& fn); bool setSynGroupsFile(const std::string& fn);
/** Read-only access to the synonym groups currently held by this Db.
 * Declared const as it only hands out a const reference. */
const SynGroups& getSynGroups() const {return m_syngroups;}
// Mark all documents with an UDI having input as prefix as // Mark all documents with an UDI having input as prefix as
// existing. Only works if the UDIs for the store are // existing. Only works if the UDIs for the store are
@ -508,25 +511,26 @@ public:
bool udiTreeMarkExisting(const string& udi); bool udiTreeMarkExisting(const string& udi);
/* This has to be public for access by embedded Query::Native */ /* This has to be public for access by embedded Query::Native */
Native *m_ndb; Native *m_ndb{nullptr};
private: private:
const RclConfig *m_config; const RclConfig *m_config;
string m_reason; // Error explanation string m_reason; // Error explanation
// Xapian directories for additional databases to query // Xapian directories for additional databases to query
vector<string> m_extraDbs; vector<string> m_extraDbs;
OpenMode m_mode; OpenMode m_mode{Db::DbRO};
// File existence vector: this is filled during the indexing pass. Any // File existence vector: this is filled during the indexing pass. Any
// document whose bit is not set at the end is purged // document whose bit is not set at the end is purged
vector<bool> updated; vector<bool> updated;
// Text bytes indexed since beginning // Text bytes indexed since beginning
long long m_curtxtsz; long long m_curtxtsz{0};
// Text bytes at last flush // Text bytes at last flush
long long m_flushtxtsz; long long m_flushtxtsz{0};
// Text bytes at last fsoccup check // Text bytes at last fsoccup check
long long m_occtxtsz; long long m_occtxtsz{0};
// First fs occup check ? // First fs occup check ?
int m_occFirstCheck; int m_occFirstCheck{1};
// Synonym groups. There is no strict reason that this has to be // Synonym groups. There is no strict reason that this has to be
// an Rcl::Db member, as it is only used when building each It // an Rcl::Db member, as it is only used when building each It
@ -538,32 +542,31 @@ private:
SynGroups m_syngroups; SynGroups m_syngroups;
// Aspell object if needed // Aspell object if needed
Aspell *m_aspell = nullptr; Aspell *m_aspell{nullptr};
/*************** /***************
* Parameters cached out of the configuration files. Logically const * Parameters cached out of the configuration files. Logically const
* after init */ * after init */
// Stop terms: those don't get indexed. // Stop terms: those don't get indexed.
StopList m_stops; StopList m_stops;
// Truncation length for stored meta fields // Truncation length for stored meta fields
int m_idxMetaStoredLen; int m_idxMetaStoredLen{150};
// This is how long an abstract we keep or build from beginning of // This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the // text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying // db as we are free to shorten it again when displaying
int m_idxAbsTruncLen; int m_idxAbsTruncLen{250};
// Document text truncation length // Document text truncation length
int m_idxTextTruncateLen{0}; int m_idxTextTruncateLen{0};
// This is the size of the abstract that we synthetize out of query // This is the size of the abstract that we synthetize out of query
// term contexts at *query time* // term contexts at *query time*
int m_synthAbsLen; int m_synthAbsLen{250};
// This is how many words (context size) we keep around query terms // This is how many words (context size) we keep around query terms
// when building the abstract // when building the abstract
int m_synthAbsWordCtxLen; int m_synthAbsWordCtxLen{4};
// Flush threshold. Megabytes of text indexed before we flush. // Flush threshold. Megabytes of text indexed before we flush.
int m_flushMb; int m_flushMb{-1};
// Maximum file system occupation percentage // Maximum file system occupation percentage
int m_maxFsOccupPc; int m_maxFsOccupPc{0};
// Database directory // Database directory
string m_basedir; string m_basedir;
// When this is set, all documents are considered as needing a reindex. // When this is set, all documents are considered as needing a reindex.

View File

@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
LOGDEB("Db::TermMatch: syngroups out: " << LOGDEB("Db::TermMatch: syngroups out: " <<
term << " -> " << stringsToString(sg) << "\n"); term << " -> " << stringsToString(sg) << "\n");
for (const auto& synonym : sg) { for (const auto& synonym : sg) {
if (synonym.find_first_of(" ") != string::npos) { if (synonym.find(' ') != string::npos) {
if (multiwords) { if (multiwords) {
multiwords->push_back(synonym); multiwords->push_back(synonym);
} }

View File

@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
} }
// Push phrases for the multi-word expansions // Push phrases for the multi-word expansions
for (vector<string>::const_iterator mwp = multiwords.begin(); for (const auto& mw : multiwords) {
mwp != multiwords.end(); mwp++) {
vector<string> phr; vector<string> phr;
// We just do a basic split to keep things a bit simpler here // We just do a basic split to keep things a bit simpler here
// (no textsplit). This means though that no punctuation is // (no textsplit). This means though that no punctuation is
// allowed in multi-word synonyms. // allowed in multi-word synonyms.
stringToTokens(*mwp, phr); stringToTokens(mw, phr);
if (!prefix.empty()) if (!prefix.empty())
prefix_vector(phr, prefix); prefix_vector(phr, prefix);
xq = Xapian::Query(Xapian::Query::OP_OR, xq, xq = Xapian::Query(Xapian::Query::OP_OR, xq,
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
// NEAR xapian query, the elements of which can themselves be OR // NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we // queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though) // don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, void SearchDataClauseSimple::processPhraseOrNear(
TermProcQ *splitData, Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
int mods, void *pq,
bool useNear, int slack) bool useNear, int slack)
{ {
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq); vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE; Xapian::Query::OP_PHRASE;
vector<Xapian::Query> orqueries; vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
bool hadmultiple = false;
#endif
vector<vector<string> >groups; vector<vector<string> >groups;
bool useidxsynonyms =
db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
string prefix; string prefix;
const FieldTraits *ftp; const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
} }
// Go through the list and perform stem/wildcard expansion for each element // Go through the list and perform stem/wildcard expansion for each element
vector<bool>::const_iterator nxit = splitData->nostemexps().begin(); auto nxit = splitData->nostemexps().begin();
for (vector<string>::const_iterator it = splitData->terms().begin(); for (auto it = splitData->terms().begin();
it != splitData->terms().end(); it++, nxit++) { it != splitData->terms().end(); it++, nxit++) {
LOGDEB0("ProcessPhrase: processing [" << *it << "]\n"); LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
// Adjust when we do stem expansion. Not if disabled by // Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian // caller, not inside phrases, and some versions of xapian
// will accept only one OR clause inside NEAR. // will accept only one OR clause inside NEAR.
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|| hadmultiple
#endif // single OR inside NEAR
;
int lmods = mods; int lmods = mods;
if (nostemexp) if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING; lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm; string sterm;
vector<string> exp; vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) vector<string> multiwords;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
return; return;
// Note: because of how expandTerm works, the multiwords can
// only come from the synonyms expansion, which means that, if
// idxsynonyms is set, they have each been indexed as a single
// term. So, if idxsynonyms is set, and is the current active
// synonyms file, we just add them to the expansion.
if (!multiwords.empty() && useidxsynonyms) {
exp.insert(exp.end(), multiwords.begin(), multiwords.end());
}
LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " << LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
stringsToString(exp) << "\n"); stringsToString(exp) << "\n");
// groups is used for highlighting, we don't want prefixes in there. // groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs; vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin(); for (const auto& prefterm : exp) {
it != exp.end(); it++) { noprefs.push_back(prefterm.substr(prefix.size()));
noprefs.push_back(it->substr(prefix.size()));
} }
groups.push_back(noprefs); groups.push_back(noprefs);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
m_curcl += exp.size(); m_curcl += exp.size();
if (m_curcl >= getMaxCl()) if (m_curcl >= getMaxCl())
return; return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
} }
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) { if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {

View File

@ -19,12 +19,15 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <set>
#include <list>
#include "textsplit.h" #include "textsplit.h"
#include "stoplist.h" #include "stoplist.h"
#include "smallut.h" #include "smallut.h"
#include "utf8iter.h" #include "utf8iter.h"
#include "unacpp.h" #include "unacpp.h"
#include "syngroups.h"
namespace Rcl { namespace Rcl {
@ -52,10 +55,12 @@ class TermProc {
public: public:
TermProc(TermProc* next) : m_next(next) {} TermProc(TermProc* next) : m_next(next) {}
virtual ~TermProc() {} virtual ~TermProc() {}
/* Copyconst and assignment forbidden */
TermProc(const TermProc &) = delete;
TermProc& operator=(const TermProc &) = delete;
virtual bool takeword(const string &term, int pos, int bs, int be) { virtual bool takeword(const string &term, int pos, int bs, int be) {
if (m_next) if (m_next)
return m_next->takeword(term, pos, bs, be); return m_next->takeword(term, pos, bs, be);
else
return true; return true;
} }
// newpage() is like takeword(), but for page breaks. // newpage() is like takeword(), but for page breaks.
@ -66,16 +71,10 @@ public:
virtual bool flush() { virtual bool flush() {
if (m_next) if (m_next)
return m_next->flush(); return m_next->flush();
else
return true; return true;
} }
private: private:
TermProc *m_next; TermProc *m_next;
/* Copyconst and assignment private and forbidden */
TermProc(const TermProc &) {}
TermProc& operator=(const TermProc &) {
return *this;
};
}; };
/** /**
@ -100,7 +99,6 @@ public:
virtual bool takeword(const string& term, int pos, int bs, int be) { virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_prc) if (m_prc)
return m_prc->takeword(term, pos, bs, be); return m_prc->takeword(term, pos, bs, be);
else
return true; return true;
} }
@ -119,12 +117,9 @@ private:
class TermProcPrep : public TermProc { class TermProcPrep : public TermProc {
public: public:
TermProcPrep(TermProc *nxt) TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) : TermProc(nxt) {}
{
}
virtual bool takeword(const string& itrm, int pos, int bs, int be) virtual bool takeword(const string& itrm, int pos, int bs, int be) {
{
m_totalterms++; m_totalterms++;
string otrm; string otrm;
@ -179,49 +174,37 @@ public:
// change in here. This means that phrase searches and // change in here. This means that phrase searches and
// snippets will be wrong, but at least searching for the // snippets will be wrong, but at least searching for the
// terms will work. // terms will work.
bool hasspace = false; bool hasspace = otrm.find(' ') != std::string::npos;
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
if (*it == ' ') {
hasspace=true;
break;
}
}
if (hasspace) { if (hasspace) {
std::vector<std::string> terms; std::vector<std::string> terms;
stringToTokens(otrm, terms, " ", true); stringToTokens(otrm, terms, " ", true);
for (std::vector<std::string>::const_iterator it = terms.begin(); for (const auto& term : terms) {
it < terms.end(); it++) { if (!TermProc::takeword(term, pos, bs, be)) {
if (!TermProc::takeword(*it, pos, bs, be)) {
return false; return false;
} }
} }
return true; return true;
} else { }
return TermProc::takeword(otrm, pos, bs, be); return TermProc::takeword(otrm, pos, bs, be);
} }
}
virtual bool flush() virtual bool flush() {
{
m_totalterms = m_unacerrors = 0; m_totalterms = m_unacerrors = 0;
return TermProc::flush(); return TermProc::flush();
} }
private: private:
int m_totalterms; int m_totalterms{0};
int m_unacerrors; int m_unacerrors{0};
}; };
/** Compare to stop words list and discard if match found */ /** Compare to stop words list and discard if match found */
class TermProcStop : public TermProc { class TermProcStop : public TermProc {
public: public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops) TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) : TermProc(nxt), m_stops(stops) {}
{
}
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be) {
{
if (m_stops.isStop(term)) { if (m_stops.isStop(term)) {
return true; return true;
} }
@ -232,6 +215,53 @@ private:
const Rcl::StopList& m_stops; const Rcl::StopList& m_stops;
}; };
/** Generate multiword terms for multiword synonyms. This allows
* NEAR/PHRASE searches for multiword synonyms. */
class TermProcMulti : public TermProc {
public:
TermProcMulti(TermProc *nxt, const SynGroups& sg)
: TermProc(nxt), m_groups(sg.getmultiwords()),
m_maxl(sg.getmultiwordsmaxlength()) {}
virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_maxl < 2) {
// Should not have been pushed??
return TermProc::takeword(term, pos, bs, be);
}
m_terms.push_back(term);
if (m_terms.size() > m_maxl) {
m_terms.pop_front();
}
string comp;
int gsz{1};
for (const auto& gterm : m_terms) {
if (comp.empty()) {
comp = gterm;
continue;
} else {
comp += " ";
comp += gterm;
gsz++;
// We could optimize by not testing m_groups for sizes
// which do not exist.
// if not gsz in sizes continue;
}
if (m_groups.find(comp) != m_groups.end()) {
LOGDEB1("Found multiword synonym: [" << comp << "]\n");
// TBD bs-be correct computation. Need to store the
// values in a parallel list
TermProc::takeword(comp, pos-gsz, bs-comp.size(), be);
}
}
return TermProc::takeword(term, pos, bs, be);
}
private:
const std::set<std::string>& m_groups;
size_t m_maxl{0};
std::list<std::string> m_terms;
};
/** Handle common-gram generation: combine frequent terms with neighbours to /** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches. * shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the * NOTE: This does not currently work because of bad interaction with the
@ -241,13 +271,11 @@ private:
class TermProcCommongrams : public TermProc { class TermProcCommongrams : public TermProc {
public: public:
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops), m_onlygrams(false) : TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
{
}
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be) {
{ LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" ); be << " [" << term << "]\n");
bool isstop = m_stops.isStop(term); bool isstop = m_stops.isStop(term);
bool twogramemit = false; bool twogramemit = false;
@ -287,8 +315,7 @@ public:
return true; return true;
} }
virtual bool flush() virtual bool flush() {
{
if (!m_prevsent && !m_prevterm.empty()) if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
return false; return false;
@ -297,8 +324,7 @@ public:
m_prevsent = true; m_prevsent = true;
return TermProc::flush(); return TermProc::flush();
} }
void onlygrams(bool on) void onlygrams(bool on) {
{
m_onlygrams = on; m_onlygrams = on;
} }
private: private:

View File

@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
# space issues.</descr></var> # space issues.</descr></var>
#idxtexttruncatelen = 0 #idxtexttruncatelen = 0
# <var name="idxsynonyms" type="fn">
#
# <brief>Name of the index-time synonyms file.</brief>
# <descr>This is used for indexing multiword synonyms as single terms,
# which in turn is only useful if you want to perform proximity searches
# with such terms.</descr></var>
#idxsynonyms = thereisnodefaultidxsynonyms
# <var name="aspellLanguage" type="string"> # <var name="aspellLanguage" type="string">
# #
# <brief>Language definitions to use when creating the aspell # <brief>Language definitions to use when creating the aspell