implemented multi-word terms indexing for phrase/prox search on multiword synonyms

This commit is contained in:
Jean-Francois Dockes 2021-01-15 12:04:06 +01:00
parent cb13b8b6df
commit aa2f0bfd73
10 changed files with 296 additions and 217 deletions

View File

@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
return getConfdirPath("stoplistfile", "stoplist.txt");
}
string RclConfig::getSynGroupsFile() const
string RclConfig::getIdxSynGroupsFile() const
{
return getConfdirPath("syngroupsfile", "syngroups.txt");
return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
}
// The index status file is fast changing, so it's possible to put it outside

View File

@ -72,8 +72,8 @@ private:
// Hold the description for an external metadata-gathering command
struct MDReaper {
string fieldname;
vector<string> cmdv;
string fieldname;
vector<string> cmdv;
};
// Data associated with an indexed field name:
@ -90,7 +90,7 @@ struct FieldTraits {
};
class RclConfig {
public:
public:
// Constructor: we normally look for a configuration file, except
// if this was specified on the command line and passed through
@ -100,7 +100,7 @@ class RclConfig {
RclConfig(const RclConfig &r);
~RclConfig() {
freeAll();
freeAll();
}
// Return a writable clone of the main config. This belongs to the
@ -133,18 +133,16 @@ class RclConfig {
string getKeyDir() const {return m_keydir;}
/** Get generic configuration parameter according to current keydir */
bool getConfParam(const string &name, string &value,
bool shallow=false) const
{
if (m_conf == 0)
return false;
return m_conf->get(name, value, m_keydir, shallow);
bool getConfParam(const string& name, string& value,
bool shallow=false) const {
if (m_conf == 0)
return false;
return m_conf->get(name, value, m_keydir, shallow);
}
/** Variant with autoconversion to int */
bool getConfParam(const string &name, int *value, bool shallow=false) const;
/** Variant with autoconversion to bool */
bool getConfParam(const string &name, bool *value,
bool shallow=false) const;
bool getConfParam(const string &name, bool *value, bool shallow=false) const;
/** Variant with conversion to vector<string>
* (stringToStrings). Can fail if the string is malformed. */
bool getConfParam(const string &name, vector<string> *value,
@ -164,18 +162,15 @@ class RclConfig {
* Get list of config names under current sk, with possible
* wildcard filtering
*/
vector<string> getConfNames(const char *pattern = 0) const
{
return m_conf->getNames(m_keydir, pattern);
vector<string> getConfNames(const char *pattern = 0) const {
return m_conf->getNames(m_keydir, pattern);
}
/** Check if name exists anywhere in config */
bool hasNameAnywhere(const string& nm) const
{
bool hasNameAnywhere(const string& nm) const {
return m_conf? m_conf->hasNameAnywhere(nm) : false;
}
/** Get default charset for current keydir (was set during setKeydir)
* filenames are handled differently */
const string &getDefCharset(bool filename = false) const;
@ -198,7 +193,7 @@ class RclConfig {
/** Get stoplist file name */
string getStopfile() const;
/** Get index-time synonym groups file name ("idxsynonyms" configuration parameter) */
string getSynGroupsFile() const;
string getIdxSynGroupsFile() const;
/** Get indexing pid file name */
string getPidfile() const;
/** Get indexing status file name */
@ -207,7 +202,7 @@ class RclConfig {
/** Do path translation according to the ptrans table */
void urlrewrite(const string& dbdir, string& url) const;
ConfSimple *getPTrans() {
return m_ptrans;
return m_ptrans;
}
/** Get Web Queue directory name */
string getWebQueueDir() const;
@ -215,13 +210,13 @@ class RclConfig {
/** Get list of skipped file names for current keydir */
vector<string>& getSkippedNames();
/** Get list of file name filters for current keydir (only those
names indexed) */
names indexed) */
vector<string>& getOnlyNames();
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
vector<string> getSkippedPaths() const;
/** Get list of skipped paths patterns, daemon version (may add some)
Doesn't depend on the keydir */
Doesn't depend on the keydir */
vector<string> getDaemSkippedPaths() const;
/** Return list of no content suffixes. Used by confgui, indexing uses
@ -260,7 +255,7 @@ class RclConfig {
* @param whole the raw value. No way to escape a semi-colon in there.
*/
static bool valueSplitAttributes(const string& whole, string& value,
ConfSimple& attrs) ;
ConfSimple& attrs) ;
/** Compute difference between 'base' and 'changed', as elements to be
* added and subtracted from base. Input and output strings are in
@ -288,9 +283,9 @@ class RclConfig {
bool getGuiFilter(const string& filtername, string& frag) const;
/** fields: get field prefix from field name. Use additional query
aliases if isquery is set */
aliases if isquery is set */
bool getFieldTraits(const string& fldname, const FieldTraits **,
bool isquery = false) const;
bool isquery = false) const;
const set<string>& getStoredFields() const {return m_storedFields;}
@ -311,11 +306,11 @@ class RclConfig {
*/
vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
bool getFieldConfParam(const string &name, const string &sk, string &value)
const;
const;
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */
string getMimeViewerDef(const string &mimetype, const string& apptag,
bool useall) const;
bool useall) const;
set<string> getMimeViewerAllEx() const;
bool setMimeViewerAllEx(const set<string>& allex);
bool getMimeViewerDefs(vector<pair<string, string> >&) const;
@ -358,26 +353,25 @@ class RclConfig {
string findFilter(const string& cmd) const;
/** Thread config init is not done automatically because not all
programs need it and it uses the debug log so that it's better to
call it after primary init */
programs need it and it uses the debug log so that it's better to
call it after primary init */
void initThrConf();
const string& getOrigCwd()
{
return o_origcwd;
const string& getOrigCwd() {
return o_origcwd;
}
RclConfig& operator=(const RclConfig &r) {
if (this != &r) {
freeAll();
initFrom(r);
}
return *this;
if (this != &r) {
freeAll();
initFrom(r);
}
return *this;
}
friend class ParamStale;
private:
private:
int m_ok;
string m_reason; // Explanation for bad state
string m_confdir; // User directory where the customized files are stored

View File

@ -44,8 +44,7 @@ using namespace std;
// groups anyway
class SynGroups::Internal {
public:
Internal() : ok(false) {
}
Internal() {}
void setpath(const string& fn) {
path = path_canon(fn);
stat(path.c_str(), &st);
@ -61,16 +60,22 @@ public:
}
return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
}
bool ok;
bool ok{false};
// Term to group num
std::unordered_map<string, unsigned int> terms;
// Group num to group
vector<vector<string> > groups;
// Aux: set of multiword synonyms used for generating multiword
// terms while indexing
std::set<std::string> multiwords;
size_t multiwords_maxlen{0};
std::string path;
struct stat st;
};
bool SynGroups::ok()
bool SynGroups::ok() const
{
return m && m->ok;
}
@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn)
if (fn.empty()) {
delete m;
m = 0;
return true;
return true;
}
if (m->samefile(fn)) {
@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn)
ifstream input;
input.open(fn.c_str(), ios::in);
if (!input.is_open()) {
LOGSYSERR("SynGroups:setfile", "open", fn);
return false;
LOGSYSERR("SynGroups:setfile", "open", fn);
return false;
}
string cline;
@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn)
string line;
bool eof = false;
int lnum = 0;
m->groups.clear();
m->terms.clear();
m->multiwords.clear();
m->multiwords_maxlen = 0;
for (;;) {
cline.clear();
getline(input, cline);
if (!input.good()) {
if (input.bad()) {
getline(input, cline);
if (!input.good()) {
if (input.bad()) {
LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
return false;
}
// Must be eof ? But maybe we have a partial line which
// must be processed. This happens if the last line before
// eof ends with a backslash, or there is no final \n
return false;
}
// Must be eof ? But maybe we have a partial line which
// must be processed. This happens if the last line before
// eof ends with a backslash, or there is no final \n
eof = true;
}
lnum++;
}
lnum++;
{
string::size_type pos = cline.find_last_not_of("\n\r");
@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn)
}
}
if (appending)
line += cline;
else
line = cline;
if (appending)
line += cline;
else
line = cline;
// Note that we trim whitespace before checking for backslash-eol
// This avoids invisible whitespace problems.
trimstring(line);
if (line.empty() || line.at(0) == '#') {
// Note that we trim whitespace before checking for backslash-eol
// This avoids invisible whitespace problems.
trimstring(line);
if (line.empty() || line.at(0) == '#') {
if (eof)
break;
continue;
}
if (line[line.length() - 1] == '\\') {
line.erase(line.length() - 1);
appending = true;
continue;
}
appending = false;
continue;
}
if (line[line.length() - 1] == '\\') {
line.erase(line.length() - 1);
appending = true;
continue;
}
appending = false;
vector<string> words;
if (!stringToStrings(line, words)) {
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
vector<string> words;
if (!stringToStrings(line, words)) {
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
": " << line << "\n");
continue;
}
continue;
}
if (words.empty())
continue;
if (words.size() == 1) {
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
if (words.empty())
continue;
if (words.size() == 1) {
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
<< lnum << " ??\n");
continue;
}
continue;
}
m->groups.push_back(words);
for (const auto& word : words) {
m->terms[word] = m->groups.size()-1;
}
LOGDEB1("SynGroups::setfile: group: [" <<
m->groups.push_back(words);
for (const auto& word : words) {
m->terms[word] = m->groups.size()-1;
}
LOGDEB1("SynGroups::setfile: group: [" <<
stringsToString(m->groups.back()) << "]\n");
}
LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
" distinct terms." << endl);
for (const auto& group : m->groups) {
for (const auto& term : group) {
std::vector<std::string> words;
stringToTokens(term, words);
if (words.size() > 1) {
std::string multiword;
for (const auto& word : words) {
if (!multiword.empty()) {
multiword += " ";
}
multiword += word;
}
m->multiwords.insert(multiword);
if (m->multiwords_maxlen < words.size()) {
m->multiwords_maxlen = words.size();
}
}
}
}
LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
"Multiwords: " << stringsToString(m->multiwords) <<"\n");
m->ok = true;
m->setpath(fn);
return true;
}
vector<string> SynGroups::getgroup(const string& term)
vector<string> SynGroups::getgroup(const string& term) const
{
vector<string> ret;
if (!ok())
return ret;
return ret;
const auto it1 = m->terms.find(term);
if (it1 == m->terms.end()) {
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
return ret;
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
return ret;
}
unsigned int idx = it1->second;
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
<< endl);
return m->groups[idx];
}
// Return the set of multiword synonyms (words joined by single spaces)
// computed by the last successful setfile() call.
// setfile() with an empty file name deletes the internal object, so we
// must not dereference it blindly: an empty set is returned in this case.
const std::set<std::string>& SynGroups::getmultiwords() const
{
    static const std::set<std::string> nomultiwords;
    if (nullptr == m) {
        return nomultiwords;
    }
    return m->multiwords;
}
// Return the word count of the longest multiword synonym, or 0 if there
// are none. Also returns 0 when the internal object was deleted by a
// setfile() call with an empty file name (no synonyms in use).
size_t SynGroups::getmultiwordsmaxlength() const
{
    if (nullptr == m) {
        return 0;
    }
    return m->multiwords_maxlen;
}
// Return the canonical path recorded by the last successful setfile()
// call. Returns an empty string when the internal object was deleted by
// a setfile() call with an empty file name, instead of dereferencing a
// null pointer.
const std::string& SynGroups::getpath() const
{
    static const std::string nopath;
    if (nullptr == m) {
        return nopath;
    }
    return m->path;
}

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2015 J.F.Dockes
/* Copyright (C) 2015-2021 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -20,6 +20,7 @@
#include <string>
#include <vector>
#include <set>
// Manage synonym groups. This is very different from stemming and
// case/diac expansion because there is no reference form: all terms
@ -34,8 +35,11 @@ public:
SynGroups& operator=(const SynGroups&&) = delete;
bool setfile(const std::string& fname);
std::vector<std::string> getgroup(const std::string& term);
bool ok();
std::vector<std::string> getgroup(const std::string& term) const;
const std::set<std::string>& getmultiwords() const;
size_t getmultiwordsmaxlength() const;
const std::string& getpath() const;
bool ok() const;
private:
class Internal;
Internal *m;

View File

@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
bool Db::o_inPlaceReset;
Db::Db(const RclConfig *cfp)
: m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4),
m_flushMb(-1), m_maxFsOccupPc(0)
{
m_config = new RclConfig(*cfp);
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
m_config->getConfParam("idxflushmb", &m_flushMb);
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
if (start_of_field_term.empty()) {
if (o_index_stripchars) {
start_of_field_term = "XXST";
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
end_of_field_term = "XXND/";
}
}
m_ndb = new Native(this);
if (m_config) {
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
m_config->getConfParam("idxflushmb", &m_flushMb);
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
}
}
Db::~Db()
{
LOGDEB2("Db::~Db\n");
if (m_ndb == 0)
if (nullptr == m_ndb)
return;
LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
m_ndb->m_iswritable << "\n");
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
return res;
}
bool Db::open(OpenMode mode, OpenError *error)
{
if (error)
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
if (!m_config->getStopfile().empty())
m_stops.setFile(m_config->getStopfile());
if (isWriteMode(mode)) {
// Check for an index-time synonyms file. We use this to
// generate multiword terms for multiword synonyms
string synfile = m_config->getIdxSynGroupsFile();
if (path_exists(synfile)) {
setSynGroupsFile(synfile);
}
}
string dir = m_config->getDbDir();
string ermsg;
try {
switch (mode) {
case DbUpd:
case DbTrunc:
if (isWriteMode(mode)) {
m_ndb->openWrite(dir, mode);
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
// We used to open a readonly object in addition to the
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
// so the query db is now a clone of the update one.
m_ndb->xrdb = m_ndb->xwdb;
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
break;
case DbRO:
default:
} else {
m_ndb->openRead(dir);
for (auto& db : m_extraDbs) {
if (error)
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
// but I can't see why
m_ndb->xrdb.add_database(Xapian::Database(db));
}
break;
}
if (error)
*error = DbOpenMainDb;
@ -1531,10 +1527,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
TermProcMulti tpmulti(nxt, m_syngroups);
if (m_syngroups.getmultiwordsmaxlength() > 1) {
nxt = &tpmulti;
}
TermProcPrep tpprep(nxt);
if (o_index_stripchars)
nxt = &tpprep;
TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
tpidx.setTSD(&splitter);

View File

@ -114,14 +114,13 @@ public:
class DbStats {
public:
DbStats()
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
DbStats() {}
// Index-wide stats
unsigned int dbdoccount;
double dbavgdoclen;
size_t mindoclen;
size_t maxdoclen;
vector<string> failedurls; /* Only set if requested */
unsigned int dbdoccount{0};
double dbavgdoclen{0};
size_t mindoclen{0};
size_t maxdoclen{0};
std::vector<std::string> failedurls; /* Only set if requested */
};
inline bool has_prefix(const string& trm)
@ -175,6 +174,9 @@ public:
~Db();
enum OpenMode {DbRO, DbUpd, DbTrunc};
/** Tell if the mode is one of the writing ones (DbUpd or DbTrunc).
 *  Static because the answer only depends on the mode value, not on the
 *  object state: this makes it usable from const contexts too. */
static bool isWriteMode(OpenMode mode) {
    return mode == DbUpd || mode == DbTrunc;
}
enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
bool open(OpenMode mode, OpenError *error = 0);
bool close();
@ -342,7 +344,7 @@ public:
bool setExtraQueryDbs(const std::vector<std::string>& dbs);
/** Check if document comes from the main index (this is used to
decide if we can update the index for it */
decide if we can update the index for it */
bool fromMainIndex(const Doc& doc);
/** Retrieve the stored doc text. This returns false if the index does not
@ -499,7 +501,8 @@ public:
// Use empty fn for no synonyms
bool setSynGroupsFile(const std::string& fn);
const SynGroups& getSynGroups() {return m_syngroups;}
// Mark all documents with an UDI having input as prefix as
// existing. Only works if the UDIs for the store are
// hierarchical of course. Used by FsIndexer to avoid purging
@ -508,25 +511,26 @@ public:
bool udiTreeMarkExisting(const string& udi);
/* This has to be public for access by embedded Query::Native */
Native *m_ndb;
Native *m_ndb{nullptr};
private:
const RclConfig *m_config;
string m_reason; // Error explanation
// Xapian directories for additional databases to query
vector<string> m_extraDbs;
OpenMode m_mode;
OpenMode m_mode{Db::DbRO};
// File existence vector: this is filled during the indexing pass. Any
// document whose bit is not set at the end is purged
vector<bool> updated;
// Text bytes indexed since beginning
long long m_curtxtsz;
long long m_curtxtsz{0};
// Text bytes at last flush
long long m_flushtxtsz;
long long m_flushtxtsz{0};
// Text bytes at last fsoccup check
long long m_occtxtsz;
long long m_occtxtsz{0};
// First fs occup check ?
int m_occFirstCheck;
int m_occFirstCheck{1};
// Synonym groups. There is no strict reason that this has to be
// an Rcl::Db member, as it is only used when building each It
@ -538,32 +542,31 @@ private:
SynGroups m_syngroups;
// Aspell object if needed
Aspell *m_aspell = nullptr;
Aspell *m_aspell{nullptr};
/***************
* Parameters cached out of the configuration files. Logically const
* after init */
// Stop terms: those don't get indexed.
StopList m_stops;
// Truncation length for stored meta fields
int m_idxMetaStoredLen;
int m_idxMetaStoredLen{150};
// This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying
int m_idxAbsTruncLen;
int m_idxAbsTruncLen{250};
// Document text truncation length
int m_idxTextTruncateLen{0};
// This is the size of the abstract that we synthesize out of query
// term contexts at *query time*
int m_synthAbsLen;
int m_synthAbsLen{250};
// This is how many words (context size) we keep around query terms
// when building the abstract
int m_synthAbsWordCtxLen;
int m_synthAbsWordCtxLen{4};
// Flush threshold. Megabytes of text indexed before we flush.
int m_flushMb;
int m_flushMb{-1};
// Maximum file system occupation percentage
int m_maxFsOccupPc;
int m_maxFsOccupPc{0};
// Database directory
string m_basedir;
// When this is set, all documents are considered as needing a reindex.

View File

@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
LOGDEB("Db::TermMatch: syngroups out: " <<
term << " -> " << stringsToString(sg) << "\n");
for (const auto& synonym : sg) {
if (synonym.find_first_of(" ") != string::npos) {
if (synonym.find(' ') != string::npos) {
if (multiwords) {
multiwords->push_back(synonym);
}

View File

@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
}
// Push phrases for the multi-word expansions
for (vector<string>::const_iterator mwp = multiwords.begin();
mwp != multiwords.end(); mwp++) {
for (const auto& mw : multiwords) {
vector<string> phr;
// We just do a basic split to keep things a bit simpler here
// (no textsplit). This means though that no punctuation is
// allowed in multi-word synonyms.
stringToTokens(*mwp, phr);
stringToTokens(mw, phr);
if (!prefix.empty())
prefix_vector(phr, prefix);
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
TermProcQ *splitData,
int mods, void *pq,
bool useNear, int slack)
void SearchDataClauseSimple::processPhraseOrNear(
Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
bool useNear, int slack)
{
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
bool hadmultiple = false;
#endif
vector<vector<string> >groups;
bool useidxsynonyms =
db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
}
// Go through the list and perform stem/wildcard expansion for each element
vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
for (vector<string>::const_iterator it = splitData->terms().begin();
auto nxit = splitData->nostemexps().begin();
for (auto it = splitData->terms().begin();
it != splitData->terms().end(); it++, nxit++) {
LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
// Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian
// will accept only one OR clause inside NEAR.
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|| hadmultiple
#endif // single OR inside NEAR
;
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
int lmods = mods;
if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm;
vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
vector<string> multiwords;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
return;
// Note: because of how expandTerm works, the multiwords can
// only come from the synonyms expansion, which means that, if
// idxsynonyms is set, they have each been indexed as a single
// term. So, if idxsynonyms is set, and is the current active
// synonyms file, we just add them to the expansion.
if (!multiwords.empty() && useidxsynonyms) {
exp.insert(exp.end(), multiwords.begin(), multiwords.end());
}
LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
stringsToString(exp) << "\n");
// groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
noprefs.push_back(it->substr(prefix.size()));
for (const auto& prefterm : exp) {
noprefs.push_back(prefterm.substr(prefix.size()));
}
groups.push_back(noprefs);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
m_curcl += exp.size();
if (m_curcl >= getMaxCl())
return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {

View File

@ -19,12 +19,15 @@
#include <vector>
#include <string>
#include <set>
#include <list>
#include "textsplit.h"
#include "stoplist.h"
#include "smallut.h"
#include "utf8iter.h"
#include "unacpp.h"
#include "syngroups.h"
namespace Rcl {
@ -52,11 +55,13 @@ class TermProc {
public:
TermProc(TermProc* next) : m_next(next) {}
virtual ~TermProc() {}
/* Copyconst and assignment forbidden */
TermProc(const TermProc &) = delete;
TermProc& operator=(const TermProc &) = delete;
virtual bool takeword(const string &term, int pos, int bs, int be) {
if (m_next)
return m_next->takeword(term, pos, bs, be);
else
return true;
return true;
}
// newpage() is like takeword(), but for page breaks.
virtual void newpage(int pos) {
@ -66,16 +71,10 @@ public:
virtual bool flush() {
if (m_next)
return m_next->flush();
else
return true;
return true;
}
private:
TermProc *m_next;
/* Copyconst and assignment private and forbidden */
TermProc(const TermProc &) {}
TermProc& operator=(const TermProc &) {
return *this;
};
};
/**
@ -100,8 +99,7 @@ public:
virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_prc)
return m_prc->takeword(term, pos, bs, be);
else
return true;
return true;
}
virtual void newpage(int pos) {
@ -119,12 +117,9 @@ private:
class TermProcPrep : public TermProc {
public:
TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
{
}
: TermProc(nxt) {}
virtual bool takeword(const string& itrm, int pos, int bs, int be)
{
virtual bool takeword(const string& itrm, int pos, int bs, int be) {
m_totalterms++;
string otrm;
@ -179,49 +174,37 @@ public:
// change in here. This means that phrase searches and
// snippets will be wrong, but at least searching for the
// terms will work.
bool hasspace = false;
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
if (*it == ' ') {
hasspace=true;
break;
}
}
bool hasspace = otrm.find(' ') != std::string::npos;
if (hasspace) {
std::vector<std::string> terms;
stringToTokens(otrm, terms, " ", true);
for (std::vector<std::string>::const_iterator it = terms.begin();
it < terms.end(); it++) {
if (!TermProc::takeword(*it, pos, bs, be)) {
for (const auto& term : terms) {
if (!TermProc::takeword(term, pos, bs, be)) {
return false;
}
}
return true;
} else {
return TermProc::takeword(otrm, pos, bs, be);
}
return TermProc::takeword(otrm, pos, bs, be);
}
virtual bool flush()
{
virtual bool flush() {
m_totalterms = m_unacerrors = 0;
return TermProc::flush();
}
private:
int m_totalterms;
int m_unacerrors;
int m_totalterms{0};
int m_unacerrors{0};
};
/** Compare to stop words list and discard if match found */
class TermProcStop : public TermProc {
public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops)
{
}
: TermProc(nxt), m_stops(stops) {}
virtual bool takeword(const string& term, int pos, int bs, int be)
{
virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_stops.isStop(term)) {
return true;
}
@ -232,6 +215,53 @@ private:
const Rcl::StopList& m_stops;
};
/** Generate multiword terms for multiword synonyms. This allows
* NEAR/PHRASE searches for multiword synonyms. */
class TermProcMulti : public TermProc {
public:
TermProcMulti(TermProc *nxt, const SynGroups& sg)
: TermProc(nxt), m_groups(sg.getmultiwords()),
m_maxl(sg.getmultiwordsmaxlength()) {}
virtual bool takeword(const string& term, int pos, int bs, int be) {
if (m_maxl < 2) {
// Should not have been pushed??
return TermProc::takeword(term, pos, bs, be);
}
m_terms.push_back(term);
if (m_terms.size() > m_maxl) {
m_terms.pop_front();
}
string comp;
int gsz{1};
for (const auto& gterm : m_terms) {
if (comp.empty()) {
comp = gterm;
continue;
} else {
comp += " ";
comp += gterm;
gsz++;
// We could optimize by not testing m_groups for sizes
// which do not exist.
// if not gsz in sizes continue;
}
if (m_groups.find(comp) != m_groups.end()) {
LOGDEB1("Found multiword synonym: [" << comp << "]\n");
// TBD bs-be correct computation. Need to store the
// values in a parallel list
TermProc::takeword(comp, pos-gsz, bs-comp.size(), be);
}
}
return TermProc::takeword(term, pos, bs, be);
}
private:
const std::set<std::string>& m_groups;
size_t m_maxl{0};
std::list<std::string> m_terms;
};
/** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches.
* NOTE: This does not currently work because of bad interaction with the
@ -241,13 +271,11 @@ private:
class TermProcCommongrams : public TermProc {
public:
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
{
}
: TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
virtual bool takeword(const string& term, int pos, int bs, int be)
{
LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" );
virtual bool takeword(const string& term, int pos, int bs, int be) {
LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
be << " [" << term << "]\n");
bool isstop = m_stops.isStop(term);
bool twogramemit = false;
@ -287,8 +315,7 @@ public:
return true;
}
virtual bool flush()
{
virtual bool flush() {
if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
return false;
@ -297,8 +324,7 @@ public:
m_prevsent = true;
return TermProc::flush();
}
void onlygrams(bool on)
{
void onlygrams(bool on) {
m_onlygrams = on;
}
private:

View File

@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
# space issues.</descr></var>
#idxtexttruncatelen = 0
# <var name="idxsynonyms" type="fn">
#
# <brief>Name of the index-time synonyms file.</brief>
# <descr>This is used for indexing multiword synonyms as single terms,
# which in turn is only useful if you want to perform proximity searches
# with such terms.</descr></var>
#idxsynonyms = thereisnodefaultidxsynonyms
# <var name="aspellLanguage" type="string">
#
# <brief>Language definitions to use when creating the aspell