Implemented multi-word term indexing for phrase/proximity search on multi-word synonyms

This commit is contained in:
Jean-Francois Dockes 2021-01-15 12:04:06 +01:00
parent cb13b8b6df
commit aa2f0bfd73
10 changed files with 296 additions and 217 deletions

View File

@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
return getConfdirPath("stoplistfile", "stoplist.txt"); return getConfdirPath("stoplistfile", "stoplist.txt");
} }
string RclConfig::getSynGroupsFile() const string RclConfig::getIdxSynGroupsFile() const
{ {
return getConfdirPath("syngroupsfile", "syngroups.txt"); return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
} }
// The index status file is fast changing, so it's possible to put it outside // The index status file is fast changing, so it's possible to put it outside

View File

@ -72,8 +72,8 @@ private:
// Hold the description for an external metadata-gathering command // Hold the description for an external metadata-gathering command
struct MDReaper { struct MDReaper {
string fieldname; string fieldname;
vector<string> cmdv; vector<string> cmdv;
}; };
// Data associated to an indexed field name: // Data associated to an indexed field name:
@ -90,7 +90,7 @@ struct FieldTraits {
}; };
class RclConfig { class RclConfig {
public: public:
// Constructor: we normally look for a configuration file, except // Constructor: we normally look for a configuration file, except
// if this was specified on the command line and passed through // if this was specified on the command line and passed through
@ -100,7 +100,7 @@ class RclConfig {
RclConfig(const RclConfig &r); RclConfig(const RclConfig &r);
~RclConfig() { ~RclConfig() {
freeAll(); freeAll();
} }
// Return a writable clone of the main config. This belongs to the // Return a writable clone of the main config. This belongs to the
@ -133,18 +133,16 @@ class RclConfig {
string getKeyDir() const {return m_keydir;} string getKeyDir() const {return m_keydir;}
/** Get generic configuration parameter according to current keydir */ /** Get generic configuration parameter according to current keydir */
bool getConfParam(const string &name, string &value, bool getConfParam(const string& name, string& value,
bool shallow=false) const bool shallow=false) const {
{ if (m_conf == 0)
if (m_conf == 0) return false;
return false; return m_conf->get(name, value, m_keydir, shallow);
return m_conf->get(name, value, m_keydir, shallow);
} }
/** Variant with autoconversion to int */ /** Variant with autoconversion to int */
bool getConfParam(const string &name, int *value, bool shallow=false) const; bool getConfParam(const string &name, int *value, bool shallow=false) const;
/** Variant with autoconversion to bool */ /** Variant with autoconversion to bool */
bool getConfParam(const string &name, bool *value, bool getConfParam(const string &name, bool *value, bool shallow=false) const;
bool shallow=false) const;
/** Variant with conversion to vector<string> /** Variant with conversion to vector<string>
* (stringToStrings). Can fail if the string is malformed. */ * (stringToStrings). Can fail if the string is malformed. */
bool getConfParam(const string &name, vector<string> *value, bool getConfParam(const string &name, vector<string> *value,
@ -164,18 +162,15 @@ class RclConfig {
* Get list of config names under current sk, with possible * Get list of config names under current sk, with possible
* wildcard filtering * wildcard filtering
*/ */
vector<string> getConfNames(const char *pattern = 0) const vector<string> getConfNames(const char *pattern = 0) const {
{ return m_conf->getNames(m_keydir, pattern);
return m_conf->getNames(m_keydir, pattern);
} }
/** Check if name exists anywhere in config */ /** Check if name exists anywhere in config */
bool hasNameAnywhere(const string& nm) const bool hasNameAnywhere(const string& nm) const {
{
return m_conf? m_conf->hasNameAnywhere(nm) : false; return m_conf? m_conf->hasNameAnywhere(nm) : false;
} }
/** Get default charset for current keydir (was set during setKeydir) /** Get default charset for current keydir (was set during setKeydir)
* filenames are handled differently */ * filenames are handled differently */
const string &getDefCharset(bool filename = false) const; const string &getDefCharset(bool filename = false) const;
@ -198,7 +193,7 @@ class RclConfig {
/** Get stoplist file name */ /** Get stoplist file name */
string getStopfile() const; string getStopfile() const;
/** Get synonym groups file name */ /** Get synonym groups file name */
string getSynGroupsFile() const; string getIdxSynGroupsFile() const;
/** Get indexing pid file name */ /** Get indexing pid file name */
string getPidfile() const; string getPidfile() const;
/** Get indexing status file name */ /** Get indexing status file name */
@ -207,7 +202,7 @@ class RclConfig {
/** Do path translation according to the ptrans table */ /** Do path translation according to the ptrans table */
void urlrewrite(const string& dbdir, string& url) const; void urlrewrite(const string& dbdir, string& url) const;
ConfSimple *getPTrans() { ConfSimple *getPTrans() {
return m_ptrans; return m_ptrans;
} }
/** Get Web Queue directory name */ /** Get Web Queue directory name */
string getWebQueueDir() const; string getWebQueueDir() const;
@ -215,13 +210,13 @@ class RclConfig {
/** Get list of skipped file names for current keydir */ /** Get list of skipped file names for current keydir */
vector<string>& getSkippedNames(); vector<string>& getSkippedNames();
/** Get list of file name filters for current keydir (only those /** Get list of file name filters for current keydir (only those
names indexed) */ names indexed) */
vector<string>& getOnlyNames(); vector<string>& getOnlyNames();
/** Get list of skipped paths patterns. Doesn't depend on the keydir */ /** Get list of skipped paths patterns. Doesn't depend on the keydir */
vector<string> getSkippedPaths() const; vector<string> getSkippedPaths() const;
/** Get list of skipped paths patterns, daemon version (may add some) /** Get list of skipped paths patterns, daemon version (may add some)
Doesn't depend on the keydir */ Doesn't depend on the keydir */
vector<string> getDaemSkippedPaths() const; vector<string> getDaemSkippedPaths() const;
/** Return list of no content suffixes. Used by confgui, indexing uses /** Return list of no content suffixes. Used by confgui, indexing uses
@ -260,7 +255,7 @@ class RclConfig {
* @param whole the raw value. No way to escape a semi-colon in there. * @param whole the raw value. No way to escape a semi-colon in there.
*/ */
static bool valueSplitAttributes(const string& whole, string& value, static bool valueSplitAttributes(const string& whole, string& value,
ConfSimple& attrs) ; ConfSimple& attrs) ;
/** Compute difference between 'base' and 'changed', as elements to be /** Compute difference between 'base' and 'changed', as elements to be
* added and subtracted from base. Input and output strings are in * added and subtracted from base. Input and output strings are in
@ -288,9 +283,9 @@ class RclConfig {
bool getGuiFilter(const string& filtername, string& frag) const; bool getGuiFilter(const string& filtername, string& frag) const;
/** fields: get field prefix from field name. Use additional query /** fields: get field prefix from field name. Use additional query
aliases if isquery is set */ aliases if isquery is set */
bool getFieldTraits(const string& fldname, const FieldTraits **, bool getFieldTraits(const string& fldname, const FieldTraits **,
bool isquery = false) const; bool isquery = false) const;
const set<string>& getStoredFields() const {return m_storedFields;} const set<string>& getStoredFields() const {return m_storedFields;}
@ -311,11 +306,11 @@ class RclConfig {
*/ */
vector<string> getFieldSectNames(const string &sk, const char* = 0) const; vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
bool getFieldConfParam(const string &name, const string &sk, string &value) bool getFieldConfParam(const string &name, const string &sk, string &value)
const; const;
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */ /** mimeview: get/set external viewer exec string(s) for mimetype(s) */
string getMimeViewerDef(const string &mimetype, const string& apptag, string getMimeViewerDef(const string &mimetype, const string& apptag,
bool useall) const; bool useall) const;
set<string> getMimeViewerAllEx() const; set<string> getMimeViewerAllEx() const;
bool setMimeViewerAllEx(const set<string>& allex); bool setMimeViewerAllEx(const set<string>& allex);
bool getMimeViewerDefs(vector<pair<string, string> >&) const; bool getMimeViewerDefs(vector<pair<string, string> >&) const;
@ -358,26 +353,25 @@ class RclConfig {
string findFilter(const string& cmd) const; string findFilter(const string& cmd) const;
/** Thread config init is not done automatically because not all /** Thread config init is not done automatically because not all
programs need it and it uses the debug log so that it's better to programs need it and it uses the debug log so that it's better to
call it after primary init */ call it after primary init */
void initThrConf(); void initThrConf();
const string& getOrigCwd() const string& getOrigCwd() {
{ return o_origcwd;
return o_origcwd;
} }
RclConfig& operator=(const RclConfig &r) { RclConfig& operator=(const RclConfig &r) {
if (this != &r) { if (this != &r) {
freeAll(); freeAll();
initFrom(r); initFrom(r);
} }
return *this; return *this;
} }
friend class ParamStale; friend class ParamStale;
private: private:
int m_ok; int m_ok;
string m_reason; // Explanation for bad state string m_reason; // Explanation for bad state
string m_confdir; // User directory where the customized files are stored string m_confdir; // User directory where the customized files are stored

View File

@ -44,8 +44,7 @@ using namespace std;
// groups anyway // groups anyway
class SynGroups::Internal { class SynGroups::Internal {
public: public:
Internal() : ok(false) { Internal() {}
}
void setpath(const string& fn) { void setpath(const string& fn) {
path = path_canon(fn); path = path_canon(fn);
stat(path.c_str(), &st); stat(path.c_str(), &st);
@ -61,16 +60,22 @@ public:
} }
return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size; return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
} }
bool ok; bool ok{false};
// Term to group num // Term to group num
std::unordered_map<string, unsigned int> terms; std::unordered_map<string, unsigned int> terms;
// Group num to group // Group num to group
vector<vector<string> > groups; vector<vector<string> > groups;
// Aux: set of multiword synonyms used for generating multiword
// terms while indexing
std::set<std::string> multiwords;
size_t multiwords_maxlen{0};
std::string path; std::string path;
struct stat st; struct stat st;
}; };
bool SynGroups::ok() bool SynGroups::ok() const
{ {
return m && m->ok; return m && m->ok;
} }
@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn)
if (fn.empty()) { if (fn.empty()) {
delete m; delete m;
m = 0; m = 0;
return true; return true;
} }
if (m->samefile(fn)) { if (m->samefile(fn)) {
@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn)
ifstream input; ifstream input;
input.open(fn.c_str(), ios::in); input.open(fn.c_str(), ios::in);
if (!input.is_open()) { if (!input.is_open()) {
LOGSYSERR("SynGroups:setfile", "open", fn); LOGSYSERR("SynGroups:setfile", "open", fn);
return false; return false;
} }
string cline; string cline;
@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn)
string line; string line;
bool eof = false; bool eof = false;
int lnum = 0; int lnum = 0;
m->groups.clear();
m->terms.clear();
m->multiwords.clear();
m->multiwords_maxlen = 0;
for (;;) { for (;;) {
cline.clear(); cline.clear();
getline(input, cline); getline(input, cline);
if (!input.good()) { if (!input.good()) {
if (input.bad()) { if (input.bad()) {
LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n"); LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
return false; return false;
} }
// Must be eof ? But maybe we have a partial line which // Must be eof ? But maybe we have a partial line which
// must be processed. This happens if the last line before // must be processed. This happens if the last line before
// eof ends with a backslash, or there is no final \n // eof ends with a backslash, or there is no final \n
eof = true; eof = true;
} }
lnum++; lnum++;
{ {
string::size_type pos = cline.find_last_not_of("\n\r"); string::size_type pos = cline.find_last_not_of("\n\r");
@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn)
} }
} }
if (appending) if (appending)
line += cline; line += cline;
else else
line = cline; line = cline;
// Note that we trim whitespace before checking for backslash-eol // Note that we trim whitespace before checking for backslash-eol
// This avoids invisible whitespace problems. // This avoids invisible whitespace problems.
trimstring(line); trimstring(line);
if (line.empty() || line.at(0) == '#') { if (line.empty() || line.at(0) == '#') {
if (eof) if (eof)
break; break;
continue; continue;
} }
if (line[line.length() - 1] == '\\') { if (line[line.length() - 1] == '\\') {
line.erase(line.length() - 1); line.erase(line.length() - 1);
appending = true; appending = true;
continue; continue;
} }
appending = false; appending = false;
vector<string> words; vector<string> words;
if (!stringToStrings(line, words)) { if (!stringToStrings(line, words)) {
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum << LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
": " << line << "\n"); ": " << line << "\n");
continue; continue;
} }
if (words.empty()) if (words.empty())
continue; continue;
if (words.size() == 1) { if (words.size() == 1) {
LOGERR("Syngroup::setfile(" << fn << "):single term group at line " LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
<< lnum << " ??\n"); << lnum << " ??\n");
continue; continue;
} }
m->groups.push_back(words); m->groups.push_back(words);
for (const auto& word : words) { for (const auto& word : words) {
m->terms[word] = m->groups.size()-1; m->terms[word] = m->groups.size()-1;
} }
LOGDEB1("SynGroups::setfile: group: [" << LOGDEB1("SynGroups::setfile: group: [" <<
stringsToString(m->groups.back()) << "]\n"); stringsToString(m->groups.back()) << "]\n");
} }
LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
" distinct terms." << endl); for (const auto& group : m->groups) {
for (const auto& term : group) {
std::vector<std::string> words;
stringToTokens(term, words);
if (words.size() > 1) {
std::string multiword;
for (const auto& word : words) {
if (!multiword.empty()) {
multiword += " ";
}
multiword += word;
}
m->multiwords.insert(multiword);
if (m->multiwords_maxlen < words.size()) {
m->multiwords_maxlen = words.size();
}
}
}
}
LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
"Multiwords: " << stringsToString(m->multiwords) <<"\n");
m->ok = true; m->ok = true;
m->setpath(fn); m->setpath(fn);
return true; return true;
} }
vector<string> SynGroups::getgroup(const string& term) vector<string> SynGroups::getgroup(const string& term) const
{ {
vector<string> ret; vector<string> ret;
if (!ok()) if (!ok())
return ret; return ret;
const auto it1 = m->terms.find(term); const auto it1 = m->terms.find(term);
if (it1 == m->terms.end()) { if (it1 == m->terms.end()) {
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n"); LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
return ret; return ret;
} }
unsigned int idx = it1->second; unsigned int idx = it1->second;
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
<< endl); << endl);
return m->groups[idx]; return m->groups[idx];
} }
// Return the set of multiword synonyms collected by setfile(): each entry
// is a multi-word term with its words joined by single spaces.
// setfile() with an empty file name deletes the internal state and resets
// m to null, so guard against that instead of dereferencing blindly: an
// empty set is returned when no data is loaded.
const std::set<std::string>& SynGroups::getmultiwords() const
{
    static const std::set<std::string> nomultiwords;
    if (nullptr == m) {
        return nomultiwords;
    }
    return m->multiwords;
}
// Return the word count of the longest multiword synonym recorded by
// setfile(), or 0 when there is none.
// setfile() with an empty file name resets m to null: report 0 in that
// case instead of dereferencing a null pointer (this is called
// unconditionally on the indexing path).
size_t SynGroups::getmultiwordsmaxlength() const
{
    if (nullptr == m) {
        return 0;
    }
    return m->multiwords_maxlen;
}
// Return the canonical path recorded by the last successful setfile().
// setfile() with an empty file name resets m to null: return an empty
// string in that case instead of dereferencing a null pointer, so that
// callers comparing the path to a configured file name simply get a
// mismatch.
const std::string& SynGroups::getpath() const
{
    static const std::string nopath;
    if (nullptr == m) {
        return nopath;
    }
    return m->path;
}

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2015 J.F.Dockes /* Copyright (C) 2015-2021 J.F.Dockes
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
@ -20,6 +20,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <set>
// Manage synonym groups. This is very different from stemming and // Manage synonym groups. This is very different from stemming and
// case/diac expansion because there is no reference form: all terms // case/diac expansion because there is no reference form: all terms
@ -34,8 +35,11 @@ public:
SynGroups& operator=(const SynGroups&&) = delete; SynGroups& operator=(const SynGroups&&) = delete;
bool setfile(const std::string& fname); bool setfile(const std::string& fname);
std::vector<std::string> getgroup(const std::string& term); std::vector<std::string> getgroup(const std::string& term) const;
bool ok(); const std::set<std::string>& getmultiwords() const;
size_t getmultiwordsmaxlength() const;
const std::string& getpath() const;
bool ok() const;
private: private:
class Internal; class Internal;
Internal *m; Internal *m;

View File

@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
bool Db::o_inPlaceReset; bool Db::o_inPlaceReset;
Db::Db(const RclConfig *cfp) Db::Db(const RclConfig *cfp)
: m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4),
m_flushMb(-1), m_maxFsOccupPc(0)
{ {
m_config = new RclConfig(*cfp); m_config = new RclConfig(*cfp);
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
m_config->getConfParam("idxflushmb", &m_flushMb);
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
if (start_of_field_term.empty()) { if (start_of_field_term.empty()) {
if (o_index_stripchars) { if (o_index_stripchars) {
start_of_field_term = "XXST"; start_of_field_term = "XXST";
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
end_of_field_term = "XXND/"; end_of_field_term = "XXND/";
} }
} }
m_ndb = new Native(this); m_ndb = new Native(this);
if (m_config) {
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
m_config->getConfParam("idxflushmb", &m_flushMb);
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
}
} }
Db::~Db() Db::~Db()
{ {
LOGDEB2("Db::~Db\n"); LOGDEB2("Db::~Db\n");
if (m_ndb == 0) if (nullptr == m_ndb)
return; return;
LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " << LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
m_ndb->m_iswritable << "\n"); m_ndb->m_iswritable << "\n");
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
return res; return res;
} }
bool Db::open(OpenMode mode, OpenError *error) bool Db::open(OpenMode mode, OpenError *error)
{ {
if (error) if (error)
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
if (!m_config->getStopfile().empty()) if (!m_config->getStopfile().empty())
m_stops.setFile(m_config->getStopfile()); m_stops.setFile(m_config->getStopfile());
if (isWriteMode(mode)) {
// Check for an index-time synonyms file. We use this to
// generate multiword terms for multiword synonyms
string synfile = m_config->getIdxSynGroupsFile();
if (path_exists(synfile)) {
setSynGroupsFile(synfile);
}
}
string dir = m_config->getDbDir(); string dir = m_config->getDbDir();
string ermsg; string ermsg;
try { try {
switch (mode) { if (isWriteMode(mode)) {
case DbUpd:
case DbTrunc:
m_ndb->openWrite(dir, mode); m_ndb->openWrite(dir, mode);
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false); updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
// We used to open a readonly object in addition to the // We used to open a readonly object in addition to the
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
// so the query db is now a clone of the update one. // so the query db is now a clone of the update one.
m_ndb->xrdb = m_ndb->xwdb; m_ndb->xrdb = m_ndb->xwdb;
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n"); LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
break; } else {
case DbRO:
default:
m_ndb->openRead(dir); m_ndb->openRead(dir);
for (auto& db : m_extraDbs) { for (auto& db : m_extraDbs) {
if (error) if (error)
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
// but I can't see why // but I can't see why
m_ndb->xrdb.add_database(Xapian::Database(db)); m_ndb->xrdb.add_database(Xapian::Database(db));
} }
break;
} }
if (error) if (error)
*error = DbOpenMainDb; *error = DbOpenMainDb;
@ -1531,6 +1527,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop; TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon; //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TermProcMulti tpmulti(nxt, m_syngroups);
if (m_syngroups.getmultiwordsmaxlength() > 1) {
nxt = &tpmulti;
}
TermProcPrep tpprep(nxt); TermProcPrep tpprep(nxt);
if (o_index_stripchars) if (o_index_stripchars)
nxt = &tpprep; nxt = &tpprep;

View File

@ -114,14 +114,13 @@ public:
class DbStats { class DbStats {
public: public:
DbStats() DbStats() {}
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
// Index-wide stats // Index-wide stats
unsigned int dbdoccount; unsigned int dbdoccount{0};
double dbavgdoclen; double dbavgdoclen{0};
size_t mindoclen; size_t mindoclen{0};
size_t maxdoclen; size_t maxdoclen{0};
vector<string> failedurls; /* Only set if requested */ std::vector<std::string> failedurls; /* Only set if requested */
}; };
inline bool has_prefix(const string& trm) inline bool has_prefix(const string& trm)
@ -175,6 +174,9 @@ public:
~Db(); ~Db();
enum OpenMode {DbRO, DbUpd, DbTrunc}; enum OpenMode {DbRO, DbUpd, DbTrunc};
/** Tell if the open mode allows modifying the index (DbUpd or DbTrunc).
 * This only looks at its argument, so it is static: it can be used
 * without an instance and from const contexts. Existing member-style
 * calls keep working. */
static bool isWriteMode(OpenMode mode) {
    return mode == DbUpd || mode == DbTrunc;
}
enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb}; enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
bool open(OpenMode mode, OpenError *error = 0); bool open(OpenMode mode, OpenError *error = 0);
bool close(); bool close();
@ -342,7 +344,7 @@ public:
bool setExtraQueryDbs(const std::vector<std::string>& dbs); bool setExtraQueryDbs(const std::vector<std::string>& dbs);
/** Check if document comes from the main index (this is used to /** Check if document comes from the main index (this is used to
decide if we can update the index for it */ decide if we can update the index for it */
bool fromMainIndex(const Doc& doc); bool fromMainIndex(const Doc& doc);
/** Retrieve the stored doc text. This returns false if the index does not /** Retrieve the stored doc text. This returns false if the index does not
@ -499,6 +501,7 @@ public:
// Use empty fn for no synonyms // Use empty fn for no synonyms
bool setSynGroupsFile(const std::string& fn); bool setSynGroupsFile(const std::string& fn);
/** Read-only access to the m_syngroups member. Const-qualified so that
 * it can also be called through a const Db reference. */
const SynGroups& getSynGroups() const {return m_syngroups;}
// Mark all documents with an UDI having input as prefix as // Mark all documents with an UDI having input as prefix as
// existing. Only works if the UDIs for the store are // existing. Only works if the UDIs for the store are
@ -508,25 +511,26 @@ public:
bool udiTreeMarkExisting(const string& udi); bool udiTreeMarkExisting(const string& udi);
/* This has to be public for access by embedded Query::Native */ /* This has to be public for access by embedded Query::Native */
Native *m_ndb; Native *m_ndb{nullptr};
private: private:
const RclConfig *m_config; const RclConfig *m_config;
string m_reason; // Error explanation string m_reason; // Error explanation
// Xapian directories for additional databases to query // Xapian directories for additional databases to query
vector<string> m_extraDbs; vector<string> m_extraDbs;
OpenMode m_mode; OpenMode m_mode{Db::DbRO};
// File existence vector: this is filled during the indexing pass. Any // File existence vector: this is filled during the indexing pass. Any
// document whose bit is not set at the end is purged // document whose bit is not set at the end is purged
vector<bool> updated; vector<bool> updated;
// Text bytes indexed since beginning // Text bytes indexed since beginning
long long m_curtxtsz; long long m_curtxtsz{0};
// Text bytes at last flush // Text bytes at last flush
long long m_flushtxtsz; long long m_flushtxtsz{0};
// Text bytes at last fsoccup check // Text bytes at last fsoccup check
long long m_occtxtsz; long long m_occtxtsz{0};
// First fs occup check ? // First fs occup check ?
int m_occFirstCheck; int m_occFirstCheck{1};
// Synonym groups. There is no strict reason that this has to be // Synonym groups. There is no strict reason that this has to be
// an Rcl::Db member, as it is only used when building each It // an Rcl::Db member, as it is only used when building each It
@ -538,32 +542,31 @@ private:
SynGroups m_syngroups; SynGroups m_syngroups;
// Aspell object if needed // Aspell object if needed
Aspell *m_aspell = nullptr; Aspell *m_aspell{nullptr};
/*************** /***************
* Parameters cached out of the configuration files. Logically const * Parameters cached out of the configuration files. Logically const
* after init */ * after init */
// Stop terms: those don't get indexed. // Stop terms: those don't get indexed.
StopList m_stops; StopList m_stops;
// Truncation length for stored meta fields // Truncation length for stored meta fields
int m_idxMetaStoredLen; int m_idxMetaStoredLen{150};
// This is how long an abstract we keep or build from beginning of // This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the // text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying // db as we are free to shorten it again when displaying
int m_idxAbsTruncLen; int m_idxAbsTruncLen{250};
// Document text truncation length // Document text truncation length
int m_idxTextTruncateLen{0}; int m_idxTextTruncateLen{0};
// This is the size of the abstract that we synthetize out of query // This is the size of the abstract that we synthetize out of query
// term contexts at *query time* // term contexts at *query time*
int m_synthAbsLen; int m_synthAbsLen{250};
// This is how many words (context size) we keep around query terms // This is how many words (context size) we keep around query terms
// when building the abstract // when building the abstract
int m_synthAbsWordCtxLen; int m_synthAbsWordCtxLen{4};
// Flush threshold. Megabytes of text indexed before we flush. // Flush threshold. Megabytes of text indexed before we flush.
int m_flushMb; int m_flushMb{-1};
// Maximum file system occupation percentage // Maximum file system occupation percentage
int m_maxFsOccupPc; int m_maxFsOccupPc{0};
// Database directory // Database directory
string m_basedir; string m_basedir;
// When this is set, all documents are considered as needing a reindex. // When this is set, all documents are considered as needing a reindex.

View File

@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
LOGDEB("Db::TermMatch: syngroups out: " << LOGDEB("Db::TermMatch: syngroups out: " <<
term << " -> " << stringsToString(sg) << "\n"); term << " -> " << stringsToString(sg) << "\n");
for (const auto& synonym : sg) { for (const auto& synonym : sg) {
if (synonym.find_first_of(" ") != string::npos) { if (synonym.find(' ') != string::npos) {
if (multiwords) { if (multiwords) {
multiwords->push_back(synonym); multiwords->push_back(synonym);
} }

View File

@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
} }
// Push phrases for the multi-word expansions // Push phrases for the multi-word expansions
for (vector<string>::const_iterator mwp = multiwords.begin(); for (const auto& mw : multiwords) {
mwp != multiwords.end(); mwp++) {
vector<string> phr; vector<string> phr;
// We just do a basic split to keep things a bit simpler here // We just do a basic split to keep things a bit simpler here
// (no textsplit). This means though that no punctuation is // (no textsplit). This means though that no punctuation is
// allowed in multi-word synonyms. // allowed in multi-word synonyms.
stringToTokens(*mwp, phr); stringToTokens(mw, phr);
if (!prefix.empty()) if (!prefix.empty())
prefix_vector(phr, prefix); prefix_vector(phr, prefix);
xq = Xapian::Query(Xapian::Query::OP_OR, xq, xq = Xapian::Query(Xapian::Query::OP_OR, xq,
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
// NEAR xapian query, the elements of which can themselves be OR // NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we // queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though) // don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, void SearchDataClauseSimple::processPhraseOrNear(
TermProcQ *splitData, Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
int mods, void *pq, bool useNear, int slack)
bool useNear, int slack)
{ {
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq); vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE; Xapian::Query::OP_PHRASE;
vector<Xapian::Query> orqueries; vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
bool hadmultiple = false;
#endif
vector<vector<string> >groups; vector<vector<string> >groups;
bool useidxsynonyms =
db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
string prefix; string prefix;
const FieldTraits *ftp; const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
} }
// Go through the list and perform stem/wildcard expansion for each element // Go through the list and perform stem/wildcard expansion for each element
vector<bool>::const_iterator nxit = splitData->nostemexps().begin(); auto nxit = splitData->nostemexps().begin();
for (vector<string>::const_iterator it = splitData->terms().begin(); for (auto it = splitData->terms().begin();
it != splitData->terms().end(); it++, nxit++) { it != splitData->terms().end(); it++, nxit++) {
LOGDEB0("ProcessPhrase: processing [" << *it << "]\n"); LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
// Adjust when we do stem expansion. Not if disabled by // Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian // caller, not inside phrases, and some versions of xapian
// will accept only one OR clause inside NEAR. // will accept only one OR clause inside NEAR.
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|| hadmultiple
#endif // single OR inside NEAR
;
int lmods = mods; int lmods = mods;
if (nostemexp) if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING; lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm; string sterm;
vector<string> exp; vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) vector<string> multiwords;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
return; return;
// Note: because of how expandTerm works, the multiwords can
// only come from the synonyms expansion, which means that, if
// idxsynonyms is set, they have each been indexed as a single
// term. So, if idxsynonyms is set, and is the current active
// synonyms file, we just add them to the expansion.
if (!multiwords.empty() && useidxsynonyms) {
exp.insert(exp.end(), multiwords.begin(), multiwords.end());
}
LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " << LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
stringsToString(exp) << "\n"); stringsToString(exp) << "\n");
// groups is used for highlighting, we don't want prefixes in there. // groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs; vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin(); for (const auto& prefterm : exp) {
it != exp.end(); it++) { noprefs.push_back(prefterm.substr(prefix.size()));
noprefs.push_back(it->substr(prefix.size()));
} }
groups.push_back(noprefs); groups.push_back(noprefs);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
m_curcl += exp.size(); m_curcl += exp.size();
if (m_curcl >= getMaxCl()) if (m_curcl >= getMaxCl())
return; return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
} }
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) { if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {

View File

@ -19,12 +19,15 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <set>
#include <list>
#include "textsplit.h" #include "textsplit.h"
#include "stoplist.h" #include "stoplist.h"
#include "smallut.h" #include "smallut.h"
#include "utf8iter.h" #include "utf8iter.h"
#include "unacpp.h" #include "unacpp.h"
#include "syngroups.h"
namespace Rcl { namespace Rcl {
@ -52,11 +55,13 @@ class TermProc {
public: public:
TermProc(TermProc* next) : m_next(next) {} TermProc(TermProc* next) : m_next(next) {}
virtual ~TermProc() {} virtual ~TermProc() {}
/* Copyconst and assignment forbidden */
TermProc(const TermProc &) = delete;
TermProc& operator=(const TermProc &) = delete;
virtual bool takeword(const string &term, int pos, int bs, int be) { virtual bool takeword(const string &term, int pos, int bs, int be) {
if (m_next) if (m_next)
return m_next->takeword(term, pos, bs, be); return m_next->takeword(term, pos, bs, be);
else return true;
return true;
} }
// newpage() is like takeword(), but for page breaks. // newpage() is like takeword(), but for page breaks.
virtual void newpage(int pos) { virtual void newpage(int pos) {
@ -66,16 +71,10 @@ public:
virtual bool flush() { virtual bool flush() {
if (m_next) if (m_next)
return m_next->flush(); return m_next->flush();
else return true;
return true;
} }
private: private:
TermProc *m_next; TermProc *m_next;
/* Copyconst and assignment private and forbidden */
TermProc(const TermProc &) {}
TermProc& operator=(const TermProc &) {
return *this;
};
}; };
/** /**
@ -100,8 +99,7 @@ public:
virtual bool takeword(const string& term, int pos, int bs, int be) { virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_prc) if (m_prc)
return m_prc->takeword(term, pos, bs, be); return m_prc->takeword(term, pos, bs, be);
else return true;
return true;
} }
virtual void newpage(int pos) { virtual void newpage(int pos) {
@ -119,12 +117,9 @@ private:
class TermProcPrep : public TermProc { class TermProcPrep : public TermProc {
public: public:
TermProcPrep(TermProc *nxt) TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) : TermProc(nxt) {}
{
}
virtual bool takeword(const string& itrm, int pos, int bs, int be) virtual bool takeword(const string& itrm, int pos, int bs, int be) {
{
m_totalterms++; m_totalterms++;
string otrm; string otrm;
@ -179,49 +174,37 @@ public:
// change in here. This means that phrase searches and // change in here. This means that phrase searches and
// snippets will be wrong, but at least searching for the // snippets will be wrong, but at least searching for the
// terms will work. // terms will work.
bool hasspace = false; bool hasspace = otrm.find(' ') != std::string::npos;
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
if (*it == ' ') {
hasspace=true;
break;
}
}
if (hasspace) { if (hasspace) {
std::vector<std::string> terms; std::vector<std::string> terms;
stringToTokens(otrm, terms, " ", true); stringToTokens(otrm, terms, " ", true);
for (std::vector<std::string>::const_iterator it = terms.begin(); for (const auto& term : terms) {
it < terms.end(); it++) { if (!TermProc::takeword(term, pos, bs, be)) {
if (!TermProc::takeword(*it, pos, bs, be)) {
return false; return false;
} }
} }
return true; return true;
} else {
return TermProc::takeword(otrm, pos, bs, be);
} }
return TermProc::takeword(otrm, pos, bs, be);
} }
virtual bool flush() virtual bool flush() {
{
m_totalterms = m_unacerrors = 0; m_totalterms = m_unacerrors = 0;
return TermProc::flush(); return TermProc::flush();
} }
private: private:
int m_totalterms; int m_totalterms{0};
int m_unacerrors; int m_unacerrors{0};
}; };
/** Compare to stop words list and discard if match found */ /** Compare to stop words list and discard if match found */
class TermProcStop : public TermProc { class TermProcStop : public TermProc {
public: public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops) TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) : TermProc(nxt), m_stops(stops) {}
{
}
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be) {
{
if (m_stops.isStop(term)) { if (m_stops.isStop(term)) {
return true; return true;
} }
@ -232,6 +215,53 @@ private:
const Rcl::StopList& m_stops; const Rcl::StopList& m_stops;
}; };
/** Generate multiword terms for multiword synonyms. This allows
* NEAR/PHRASE searches for multiword synonyms. */
class TermProcMulti : public TermProc {
public:
TermProcMulti(TermProc *nxt, const SynGroups& sg)
: TermProc(nxt), m_groups(sg.getmultiwords()),
m_maxl(sg.getmultiwordsmaxlength()) {}
virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_maxl < 2) {
// Should not have been pushed??
return TermProc::takeword(term, pos, bs, be);
}
m_terms.push_back(term);
if (m_terms.size() > m_maxl) {
m_terms.pop_front();
}
string comp;
int gsz{1};
for (const auto& gterm : m_terms) {
if (comp.empty()) {
comp = gterm;
continue;
} else {
comp += " ";
comp += gterm;
gsz++;
// We could optimize by not testing m_groups for sizes
// which do not exist.
// if not gsz in sizes continue;
}
if (m_groups.find(comp) != m_groups.end()) {
LOGDEB1("Found multiword synonym: [" << comp << "]\n");
// TBD bs-be correct computation. Need to store the
// values in a parallel list
TermProc::takeword(comp, pos-gsz, bs-comp.size(), be);
}
}
return TermProc::takeword(term, pos, bs, be);
}
private:
const std::set<std::string>& m_groups;
size_t m_maxl{0};
std::list<std::string> m_terms;
};
/** Handle common-gram generation: combine frequent terms with neighbours to /** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches. * shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the * NOTE: This does not currently work because of bad interaction with the
@ -241,13 +271,11 @@ private:
class TermProcCommongrams : public TermProc { class TermProcCommongrams : public TermProc {
public: public:
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops), m_onlygrams(false) : TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
{
}
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be) {
{ LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" ); be << " [" << term << "]\n");
bool isstop = m_stops.isStop(term); bool isstop = m_stops.isStop(term);
bool twogramemit = false; bool twogramemit = false;
@ -287,8 +315,7 @@ public:
return true; return true;
} }
virtual bool flush() virtual bool flush() {
{
if (!m_prevsent && !m_prevterm.empty()) if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
return false; return false;
@ -297,8 +324,7 @@ public:
m_prevsent = true; m_prevsent = true;
return TermProc::flush(); return TermProc::flush();
} }
void onlygrams(bool on) void onlygrams(bool on) {
{
m_onlygrams = on; m_onlygrams = on;
} }
private: private:

View File

@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
# space issues.</descr></var> # space issues.</descr></var>
#idxtexttruncatelen = 0 #idxtexttruncatelen = 0
# <var name="idxsynonyms" type="fn">
#
# <brief>Name of the index-time synonyms file.</brief>
# <descr>This is used for indexing multiword synonyms as single terms,
# which in turn is only useful if you want to perform proximity searches
# with such terms.</descr></var>
#idxsynonyms = thereisnodefaultidxsynonyms
# <var name="aspellLanguage" type="string"> # <var name="aspellLanguage" type="string">
# #
# <brief>Language definitions to use when creating the aspell # <brief>Language definitions to use when creating the aspell