implemented multi-word terms indexing for phrase/prox search on multiword synonyms
This commit is contained in:
parent
cb13b8b6df
commit
aa2f0bfd73
@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
|
||||
return getConfdirPath("stoplistfile", "stoplist.txt");
|
||||
}
|
||||
|
||||
string RclConfig::getSynGroupsFile() const
|
||||
string RclConfig::getIdxSynGroupsFile() const
|
||||
{
|
||||
return getConfdirPath("syngroupsfile", "syngroups.txt");
|
||||
return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
|
||||
}
|
||||
|
||||
// The index status file is fast changing, so it's possible to put it outside
|
||||
|
||||
@ -72,8 +72,8 @@ private:
|
||||
|
||||
// Hold the description for an external metadata-gathering command
|
||||
struct MDReaper {
|
||||
string fieldname;
|
||||
vector<string> cmdv;
|
||||
string fieldname;
|
||||
vector<string> cmdv;
|
||||
};
|
||||
|
||||
// Data associated with an indexed field name:
|
||||
@ -90,7 +90,7 @@ struct FieldTraits {
|
||||
};
|
||||
|
||||
class RclConfig {
|
||||
public:
|
||||
public:
|
||||
|
||||
// Constructor: we normally look for a configuration file, except
|
||||
// if this was specified on the command line and passed through
|
||||
@ -100,7 +100,7 @@ class RclConfig {
|
||||
RclConfig(const RclConfig &r);
|
||||
|
||||
~RclConfig() {
|
||||
freeAll();
|
||||
freeAll();
|
||||
}
|
||||
|
||||
// Return a writable clone of the main config. This belongs to the
|
||||
@ -133,18 +133,16 @@ class RclConfig {
|
||||
string getKeyDir() const {return m_keydir;}
|
||||
|
||||
/** Get generic configuration parameter according to current keydir */
|
||||
bool getConfParam(const string &name, string &value,
|
||||
bool shallow=false) const
|
||||
{
|
||||
if (m_conf == 0)
|
||||
return false;
|
||||
return m_conf->get(name, value, m_keydir, shallow);
|
||||
bool getConfParam(const string& name, string& value,
|
||||
bool shallow=false) const {
|
||||
if (m_conf == 0)
|
||||
return false;
|
||||
return m_conf->get(name, value, m_keydir, shallow);
|
||||
}
|
||||
/** Variant with autoconversion to int */
|
||||
bool getConfParam(const string &name, int *value, bool shallow=false) const;
|
||||
/** Variant with autoconversion to bool */
|
||||
bool getConfParam(const string &name, bool *value,
|
||||
bool shallow=false) const;
|
||||
bool getConfParam(const string &name, bool *value, bool shallow=false) const;
|
||||
/** Variant with conversion to vector<string>
|
||||
* (stringToStrings). Can fail if the string is malformed. */
|
||||
bool getConfParam(const string &name, vector<string> *value,
|
||||
@ -164,18 +162,15 @@ class RclConfig {
|
||||
* Get list of config names under current sk, with possible
|
||||
* wildcard filtering
|
||||
*/
|
||||
vector<string> getConfNames(const char *pattern = 0) const
|
||||
{
|
||||
return m_conf->getNames(m_keydir, pattern);
|
||||
vector<string> getConfNames(const char *pattern = 0) const {
|
||||
return m_conf->getNames(m_keydir, pattern);
|
||||
}
|
||||
|
||||
/** Check if name exists anywhere in config */
|
||||
bool hasNameAnywhere(const string& nm) const
|
||||
{
|
||||
bool hasNameAnywhere(const string& nm) const {
|
||||
return m_conf? m_conf->hasNameAnywhere(nm) : false;
|
||||
}
|
||||
|
||||
|
||||
/** Get default charset for current keydir (was set during setKeydir)
|
||||
* filenames are handled differently */
|
||||
const string &getDefCharset(bool filename = false) const;
|
||||
@ -198,7 +193,7 @@ class RclConfig {
|
||||
/** Get stoplist file name */
|
||||
string getStopfile() const;
|
||||
/** Get synonym groups file name */
|
||||
string getSynGroupsFile() const;
|
||||
string getIdxSynGroupsFile() const;
|
||||
/** Get indexing pid file name */
|
||||
string getPidfile() const;
|
||||
/** Get indexing status file name */
|
||||
@ -207,7 +202,7 @@ class RclConfig {
|
||||
/** Do path translation according to the ptrans table */
|
||||
void urlrewrite(const string& dbdir, string& url) const;
|
||||
ConfSimple *getPTrans() {
|
||||
return m_ptrans;
|
||||
return m_ptrans;
|
||||
}
|
||||
/** Get Web Queue directory name */
|
||||
string getWebQueueDir() const;
|
||||
@ -215,13 +210,13 @@ class RclConfig {
|
||||
/** Get list of skipped file names for current keydir */
|
||||
vector<string>& getSkippedNames();
|
||||
/** Get list of file name filters for current keydir (only those
|
||||
names indexed) */
|
||||
names indexed) */
|
||||
vector<string>& getOnlyNames();
|
||||
|
||||
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
|
||||
vector<string> getSkippedPaths() const;
|
||||
/** Get list of skipped paths patterns, daemon version (may add some)
|
||||
Doesn't depend on the keydir */
|
||||
Doesn't depend on the keydir */
|
||||
vector<string> getDaemSkippedPaths() const;
|
||||
|
||||
/** Return list of no content suffixes. Used by confgui, indexing uses
|
||||
@ -260,7 +255,7 @@ class RclConfig {
|
||||
* @param whole the raw value. No way to escape a semi-colon in there.
|
||||
*/
|
||||
static bool valueSplitAttributes(const string& whole, string& value,
|
||||
ConfSimple& attrs) ;
|
||||
ConfSimple& attrs) ;
|
||||
|
||||
/** Compute difference between 'base' and 'changed', as elements to be
|
||||
* added and subtracted from base. Input and output strings are in
|
||||
@ -288,9 +283,9 @@ class RclConfig {
|
||||
bool getGuiFilter(const string& filtername, string& frag) const;
|
||||
|
||||
/** fields: get field prefix from field name. Use additional query
|
||||
aliases if isquery is set */
|
||||
aliases if isquery is set */
|
||||
bool getFieldTraits(const string& fldname, const FieldTraits **,
|
||||
bool isquery = false) const;
|
||||
bool isquery = false) const;
|
||||
|
||||
const set<string>& getStoredFields() const {return m_storedFields;}
|
||||
|
||||
@ -311,11 +306,11 @@ class RclConfig {
|
||||
*/
|
||||
vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
|
||||
bool getFieldConfParam(const string &name, const string &sk, string &value)
|
||||
const;
|
||||
const;
|
||||
|
||||
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */
|
||||
string getMimeViewerDef(const string &mimetype, const string& apptag,
|
||||
bool useall) const;
|
||||
bool useall) const;
|
||||
set<string> getMimeViewerAllEx() const;
|
||||
bool setMimeViewerAllEx(const set<string>& allex);
|
||||
bool getMimeViewerDefs(vector<pair<string, string> >&) const;
|
||||
@ -358,26 +353,25 @@ class RclConfig {
|
||||
string findFilter(const string& cmd) const;
|
||||
|
||||
/** Thread config init is not done automatically because not all
|
||||
programs need it and it uses the debug log so that it's better to
|
||||
call it after primary init */
|
||||
programs need it and it uses the debug log so that it's better to
|
||||
call it after primary init */
|
||||
void initThrConf();
|
||||
|
||||
const string& getOrigCwd()
|
||||
{
|
||||
return o_origcwd;
|
||||
const string& getOrigCwd() {
|
||||
return o_origcwd;
|
||||
}
|
||||
|
||||
RclConfig& operator=(const RclConfig &r) {
|
||||
if (this != &r) {
|
||||
freeAll();
|
||||
initFrom(r);
|
||||
}
|
||||
return *this;
|
||||
if (this != &r) {
|
||||
freeAll();
|
||||
initFrom(r);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend class ParamStale;
|
||||
|
||||
private:
|
||||
private:
|
||||
int m_ok;
|
||||
string m_reason; // Explanation for bad state
|
||||
string m_confdir; // User directory where the customized files are stored
|
||||
|
||||
@ -44,8 +44,7 @@ using namespace std;
|
||||
// groups anyway
|
||||
class SynGroups::Internal {
|
||||
public:
|
||||
Internal() : ok(false) {
|
||||
}
|
||||
Internal() {}
|
||||
void setpath(const string& fn) {
|
||||
path = path_canon(fn);
|
||||
stat(path.c_str(), &st);
|
||||
@ -61,16 +60,22 @@ public:
|
||||
}
|
||||
return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
|
||||
}
|
||||
bool ok;
|
||||
bool ok{false};
|
||||
// Term to group num
|
||||
std::unordered_map<string, unsigned int> terms;
|
||||
// Group num to group
|
||||
vector<vector<string> > groups;
|
||||
|
||||
// Aux: set of multiword synonyms used for generating multiword
|
||||
// terms while indexing
|
||||
std::set<std::string> multiwords;
|
||||
size_t multiwords_maxlen{0};
|
||||
|
||||
std::string path;
|
||||
struct stat st;
|
||||
};
|
||||
|
||||
bool SynGroups::ok()
|
||||
bool SynGroups::ok() const
|
||||
{
|
||||
return m && m->ok;
|
||||
}
|
||||
@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn)
|
||||
if (fn.empty()) {
|
||||
delete m;
|
||||
m = 0;
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (m->samefile(fn)) {
|
||||
@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn)
|
||||
ifstream input;
|
||||
input.open(fn.c_str(), ios::in);
|
||||
if (!input.is_open()) {
|
||||
LOGSYSERR("SynGroups:setfile", "open", fn);
|
||||
return false;
|
||||
LOGSYSERR("SynGroups:setfile", "open", fn);
|
||||
return false;
|
||||
}
|
||||
|
||||
string cline;
|
||||
@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn)
|
||||
string line;
|
||||
bool eof = false;
|
||||
int lnum = 0;
|
||||
|
||||
m->groups.clear();
|
||||
m->terms.clear();
|
||||
m->multiwords.clear();
|
||||
m->multiwords_maxlen = 0;
|
||||
for (;;) {
|
||||
cline.clear();
|
||||
getline(input, cline);
|
||||
if (!input.good()) {
|
||||
if (input.bad()) {
|
||||
getline(input, cline);
|
||||
if (!input.good()) {
|
||||
if (input.bad()) {
|
||||
LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
|
||||
return false;
|
||||
}
|
||||
// Must be eof ? But maybe we have a partial line which
|
||||
// must be processed. This happens if the last line before
|
||||
// eof ends with a backslash, or there is no final \n
|
||||
return false;
|
||||
}
|
||||
// Must be eof ? But maybe we have a partial line which
|
||||
// must be processed. This happens if the last line before
|
||||
// eof ends with a backslash, or there is no final \n
|
||||
eof = true;
|
||||
}
|
||||
lnum++;
|
||||
}
|
||||
lnum++;
|
||||
|
||||
{
|
||||
string::size_type pos = cline.find_last_not_of("\n\r");
|
||||
@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn)
|
||||
}
|
||||
}
|
||||
|
||||
if (appending)
|
||||
line += cline;
|
||||
else
|
||||
line = cline;
|
||||
if (appending)
|
||||
line += cline;
|
||||
else
|
||||
line = cline;
|
||||
|
||||
// Note that we trim whitespace before checking for backslash-eol
|
||||
// This avoids invisible whitespace problems.
|
||||
trimstring(line);
|
||||
if (line.empty() || line.at(0) == '#') {
|
||||
// Note that we trim whitespace before checking for backslash-eol
|
||||
// This avoids invisible whitespace problems.
|
||||
trimstring(line);
|
||||
if (line.empty() || line.at(0) == '#') {
|
||||
if (eof)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
if (line[line.length() - 1] == '\\') {
|
||||
line.erase(line.length() - 1);
|
||||
appending = true;
|
||||
continue;
|
||||
}
|
||||
appending = false;
|
||||
continue;
|
||||
}
|
||||
if (line[line.length() - 1] == '\\') {
|
||||
line.erase(line.length() - 1);
|
||||
appending = true;
|
||||
continue;
|
||||
}
|
||||
appending = false;
|
||||
|
||||
vector<string> words;
|
||||
if (!stringToStrings(line, words)) {
|
||||
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
|
||||
vector<string> words;
|
||||
if (!stringToStrings(line, words)) {
|
||||
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
|
||||
": " << line << "\n");
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (words.empty())
|
||||
continue;
|
||||
if (words.size() == 1) {
|
||||
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
|
||||
if (words.empty())
|
||||
continue;
|
||||
if (words.size() == 1) {
|
||||
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
|
||||
<< lnum << " ??\n");
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
m->groups.push_back(words);
|
||||
for (const auto& word : words) {
|
||||
m->terms[word] = m->groups.size()-1;
|
||||
}
|
||||
LOGDEB1("SynGroups::setfile: group: [" <<
|
||||
m->groups.push_back(words);
|
||||
for (const auto& word : words) {
|
||||
m->terms[word] = m->groups.size()-1;
|
||||
}
|
||||
LOGDEB1("SynGroups::setfile: group: [" <<
|
||||
stringsToString(m->groups.back()) << "]\n");
|
||||
}
|
||||
LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
|
||||
" distinct terms." << endl);
|
||||
|
||||
for (const auto& group : m->groups) {
|
||||
for (const auto& term : group) {
|
||||
std::vector<std::string> words;
|
||||
stringToTokens(term, words);
|
||||
if (words.size() > 1) {
|
||||
std::string multiword;
|
||||
for (const auto& word : words) {
|
||||
if (!multiword.empty()) {
|
||||
multiword += " ";
|
||||
}
|
||||
multiword += word;
|
||||
}
|
||||
m->multiwords.insert(multiword);
|
||||
if (m->multiwords_maxlen < words.size()) {
|
||||
m->multiwords_maxlen = words.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
|
||||
"Multiwords: " << stringsToString(m->multiwords) <<"\n");
|
||||
m->ok = true;
|
||||
m->setpath(fn);
|
||||
return true;
|
||||
}
|
||||
|
||||
vector<string> SynGroups::getgroup(const string& term)
|
||||
vector<string> SynGroups::getgroup(const string& term) const
|
||||
{
|
||||
vector<string> ret;
|
||||
if (!ok())
|
||||
return ret;
|
||||
return ret;
|
||||
|
||||
const auto it1 = m->terms.find(term);
|
||||
if (it1 == m->terms.end()) {
|
||||
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
|
||||
return ret;
|
||||
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
unsigned int idx = it1->second;
|
||||
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
|
||||
<< endl);
|
||||
return m->groups[idx];
|
||||
}
|
||||
|
||||
const std::set<std::string>& SynGroups::getmultiwords() const
|
||||
{
|
||||
return m->multiwords;
|
||||
}
|
||||
|
||||
size_t SynGroups::getmultiwordsmaxlength() const
|
||||
{
|
||||
return m->multiwords_maxlen;
|
||||
}
|
||||
|
||||
const std::string& SynGroups::getpath() const
|
||||
{
|
||||
return m->path;
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2015 J.F.Dockes
|
||||
/* Copyright (C) 2015-2021 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -20,6 +20,7 @@
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
// Manage synonym groups. This is very different from stemming and
|
||||
// case/diac expansion because there is no reference form: all terms
|
||||
@ -34,8 +35,11 @@ public:
|
||||
SynGroups& operator=(const SynGroups&&) = delete;
|
||||
|
||||
bool setfile(const std::string& fname);
|
||||
std::vector<std::string> getgroup(const std::string& term);
|
||||
bool ok();
|
||||
std::vector<std::string> getgroup(const std::string& term) const;
|
||||
const std::set<std::string>& getmultiwords() const;
|
||||
size_t getmultiwordsmaxlength() const;
|
||||
const std::string& getpath() const;
|
||||
bool ok() const;
|
||||
private:
|
||||
class Internal;
|
||||
Internal *m;
|
||||
|
||||
@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
|
||||
bool Db::o_inPlaceReset;
|
||||
|
||||
Db::Db(const RclConfig *cfp)
|
||||
: m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
|
||||
m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
|
||||
m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4),
|
||||
m_flushMb(-1), m_maxFsOccupPc(0)
|
||||
{
|
||||
m_config = new RclConfig(*cfp);
|
||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||
m_config->getConfParam("idxflushmb", &m_flushMb);
|
||||
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
|
||||
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
|
||||
if (start_of_field_term.empty()) {
|
||||
if (o_index_stripchars) {
|
||||
start_of_field_term = "XXST";
|
||||
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
|
||||
end_of_field_term = "XXND/";
|
||||
}
|
||||
}
|
||||
|
||||
m_ndb = new Native(this);
|
||||
if (m_config) {
|
||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||
m_config->getConfParam("idxflushmb", &m_flushMb);
|
||||
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
|
||||
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
|
||||
}
|
||||
}
|
||||
|
||||
Db::~Db()
|
||||
{
|
||||
LOGDEB2("Db::~Db\n");
|
||||
if (m_ndb == 0)
|
||||
if (nullptr == m_ndb)
|
||||
return;
|
||||
LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
|
||||
m_ndb->m_iswritable << "\n");
|
||||
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
bool Db::open(OpenMode mode, OpenError *error)
|
||||
{
|
||||
if (error)
|
||||
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
|
||||
if (!m_config->getStopfile().empty())
|
||||
m_stops.setFile(m_config->getStopfile());
|
||||
|
||||
if (isWriteMode(mode)) {
|
||||
// Check for an index-time synonyms file. We use this to
|
||||
// generate multiword terms for multiword synonyms
|
||||
string synfile = m_config->getIdxSynGroupsFile();
|
||||
if (path_exists(synfile)) {
|
||||
setSynGroupsFile(synfile);
|
||||
}
|
||||
}
|
||||
|
||||
string dir = m_config->getDbDir();
|
||||
string ermsg;
|
||||
try {
|
||||
switch (mode) {
|
||||
case DbUpd:
|
||||
case DbTrunc:
|
||||
if (isWriteMode(mode)) {
|
||||
m_ndb->openWrite(dir, mode);
|
||||
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
|
||||
// We used to open a readonly object in addition to the
|
||||
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
|
||||
// so the query db is now a clone of the update one.
|
||||
m_ndb->xrdb = m_ndb->xwdb;
|
||||
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
|
||||
break;
|
||||
case DbRO:
|
||||
default:
|
||||
} else {
|
||||
m_ndb->openRead(dir);
|
||||
for (auto& db : m_extraDbs) {
|
||||
if (error)
|
||||
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
|
||||
// but I can't see why
|
||||
m_ndb->xrdb.add_database(Xapian::Database(db));
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (error)
|
||||
*error = DbOpenMainDb;
|
||||
@ -1531,10 +1527,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||
|
||||
TermProcMulti tpmulti(nxt, m_syngroups);
|
||||
if (m_syngroups.getmultiwordsmaxlength() > 1) {
|
||||
nxt = &tpmulti;
|
||||
}
|
||||
|
||||
TermProcPrep tpprep(nxt);
|
||||
if (o_index_stripchars)
|
||||
nxt = &tpprep;
|
||||
|
||||
|
||||
TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt);
|
||||
tpidx.setTSD(&splitter);
|
||||
|
||||
|
||||
@ -114,14 +114,13 @@ public:
|
||||
|
||||
class DbStats {
|
||||
public:
|
||||
DbStats()
|
||||
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
|
||||
DbStats() {}
|
||||
// Index-wide stats
|
||||
unsigned int dbdoccount;
|
||||
double dbavgdoclen;
|
||||
size_t mindoclen;
|
||||
size_t maxdoclen;
|
||||
vector<string> failedurls; /* Only set if requested */
|
||||
unsigned int dbdoccount{0};
|
||||
double dbavgdoclen{0};
|
||||
size_t mindoclen{0};
|
||||
size_t maxdoclen{0};
|
||||
std::vector<std::string> failedurls; /* Only set if requested */
|
||||
};
|
||||
|
||||
inline bool has_prefix(const string& trm)
|
||||
@ -175,6 +174,9 @@ public:
|
||||
~Db();
|
||||
|
||||
enum OpenMode {DbRO, DbUpd, DbTrunc};
/**
 * Tell whether an open mode implies write access to the index.
 *
 * @param mode the requested open mode.
 * @return true for DbUpd and DbTrunc, false for anything else (DbRO).
 *
 * Declared static because the answer depends only on the argument, which
 * also lets const methods call it.
 */
static bool isWriteMode(OpenMode mode) {
    return mode == DbUpd || mode == DbTrunc;
}
|
||||
enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
|
||||
bool open(OpenMode mode, OpenError *error = 0);
|
||||
bool close();
|
||||
@ -342,7 +344,7 @@ public:
|
||||
bool setExtraQueryDbs(const std::vector<std::string>& dbs);
|
||||
|
||||
/** Check if document comes from the main index (this is used to
|
||||
decide if we can update the index for it */
|
||||
decide if we can update the index for it */
|
||||
bool fromMainIndex(const Doc& doc);
|
||||
|
||||
/** Retrieve the stored doc text. This returns false if the index does not
|
||||
@ -499,7 +501,8 @@ public:
|
||||
|
||||
// Use empty fn for no synonyms
|
||||
bool setSynGroupsFile(const std::string& fn);
|
||||
|
||||
const SynGroups& getSynGroups() {return m_syngroups;}
|
||||
|
||||
// Mark all documents with an UDI having input as prefix as
|
||||
// existing. Only works if the UDIs for the store are
|
||||
// hierarchical of course. Used by FsIndexer to avoid purging
|
||||
@ -508,25 +511,26 @@ public:
|
||||
bool udiTreeMarkExisting(const string& udi);
|
||||
|
||||
/* This has to be public for access by embedded Query::Native */
|
||||
Native *m_ndb;
|
||||
Native *m_ndb{nullptr};
|
||||
|
||||
private:
|
||||
const RclConfig *m_config;
|
||||
string m_reason; // Error explanation
|
||||
|
||||
// Xapian directories for additional databases to query
|
||||
vector<string> m_extraDbs;
|
||||
OpenMode m_mode;
|
||||
OpenMode m_mode{Db::DbRO};
|
||||
// File existence vector: this is filled during the indexing pass. Any
|
||||
// document whose bit is not set at the end is purged
|
||||
vector<bool> updated;
|
||||
// Text bytes indexed since beginning
|
||||
long long m_curtxtsz;
|
||||
long long m_curtxtsz{0};
|
||||
// Text bytes at last flush
|
||||
long long m_flushtxtsz;
|
||||
long long m_flushtxtsz{0};
|
||||
// Text bytes at last fsoccup check
|
||||
long long m_occtxtsz;
|
||||
long long m_occtxtsz{0};
|
||||
// First fs occup check ?
|
||||
int m_occFirstCheck;
|
||||
int m_occFirstCheck{1};
|
||||
|
||||
// Synonym groups. There is no strict reason that this has to be
|
||||
// an Rcl::Db member, as it is only used when building each It
|
||||
@ -538,32 +542,31 @@ private:
|
||||
SynGroups m_syngroups;
|
||||
|
||||
// Aspell object if needed
|
||||
Aspell *m_aspell = nullptr;
|
||||
|
||||
Aspell *m_aspell{nullptr};
|
||||
|
||||
/***************
|
||||
* Parameters cached out of the configuration files. Logically const
|
||||
* after init */
|
||||
// Stop terms: those don't get indexed.
|
||||
StopList m_stops;
|
||||
|
||||
// Truncation length for stored meta fields
|
||||
int m_idxMetaStoredLen;
|
||||
int m_idxMetaStoredLen{150};
|
||||
// This is how long an abstract we keep or build from beginning of
|
||||
// text when indexing. It only has an influence on the size of the
|
||||
// db as we are free to shorten it again when displaying
|
||||
int m_idxAbsTruncLen;
|
||||
int m_idxAbsTruncLen{250};
|
||||
// Document text truncation length
|
||||
int m_idxTextTruncateLen{0};
|
||||
// This is the size of the abstract that we synthesize out of query
|
||||
// term contexts at *query time*
|
||||
int m_synthAbsLen;
|
||||
int m_synthAbsLen{250};
|
||||
// This is how many words (context size) we keep around query terms
|
||||
// when building the abstract
|
||||
int m_synthAbsWordCtxLen;
|
||||
int m_synthAbsWordCtxLen{4};
|
||||
// Flush threshold. Megabytes of text indexed before we flush.
|
||||
int m_flushMb;
|
||||
int m_flushMb{-1};
|
||||
// Maximum file system occupation percentage
|
||||
int m_maxFsOccupPc;
|
||||
int m_maxFsOccupPc{0};
|
||||
// Database directory
|
||||
string m_basedir;
|
||||
// When this is set, all documents are considered as needing a reindex.
|
||||
|
||||
@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||
LOGDEB("Db::TermMatch: syngroups out: " <<
|
||||
term << " -> " << stringsToString(sg) << "\n");
|
||||
for (const auto& synonym : sg) {
|
||||
if (synonym.find_first_of(" ") != string::npos) {
|
||||
if (synonym.find(' ') != string::npos) {
|
||||
if (multiwords) {
|
||||
multiwords->push_back(synonym);
|
||||
}
|
||||
|
||||
@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
|
||||
}
|
||||
|
||||
// Push phrases for the multi-word expansions
|
||||
for (vector<string>::const_iterator mwp = multiwords.begin();
|
||||
mwp != multiwords.end(); mwp++) {
|
||||
for (const auto& mw : multiwords) {
|
||||
vector<string> phr;
|
||||
// We just do a basic split to keep things a bit simpler here
|
||||
// (no textsplit). This means though that no punctuation is
|
||||
// allowed in multi-word synonyms.
|
||||
stringToTokens(*mwp, phr);
|
||||
stringToTokens(mw, phr);
|
||||
if (!prefix.empty())
|
||||
prefix_vector(phr, prefix);
|
||||
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
||||
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
|
||||
// NEAR xapian query, the elements of which can themselves be OR
|
||||
// queries if the terms get expanded by stemming or wildcards (we
|
||||
// don't do stemming for PHRASE though)
|
||||
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
||||
TermProcQ *splitData,
|
||||
int mods, void *pq,
|
||||
bool useNear, int slack)
|
||||
void SearchDataClauseSimple::processPhraseOrNear(
|
||||
Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
|
||||
bool useNear, int slack)
|
||||
{
|
||||
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
||||
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
||||
Xapian::Query::OP_PHRASE;
|
||||
vector<Xapian::Query> orqueries;
|
||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||
bool hadmultiple = false;
|
||||
#endif
|
||||
vector<vector<string> >groups;
|
||||
|
||||
bool useidxsynonyms =
|
||||
db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
|
||||
|
||||
string prefix;
|
||||
const FieldTraits *ftp;
|
||||
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
|
||||
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
||||
}
|
||||
|
||||
// Go through the list and perform stem/wildcard expansion for each element
|
||||
vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
|
||||
for (vector<string>::const_iterator it = splitData->terms().begin();
|
||||
auto nxit = splitData->nostemexps().begin();
|
||||
for (auto it = splitData->terms().begin();
|
||||
it != splitData->terms().end(); it++, nxit++) {
|
||||
LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
|
||||
// Adjust when we do stem expansion. Not if disabled by
|
||||
// caller, not inside phrases, and some versions of xapian
|
||||
// will accept only one OR clause inside NEAR.
|
||||
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
|
||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||
|| hadmultiple
|
||||
#endif // single OR inside NEAR
|
||||
;
|
||||
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
|
||||
int lmods = mods;
|
||||
if (nostemexp)
|
||||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||
string sterm;
|
||||
vector<string> exp;
|
||||
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
|
||||
vector<string> multiwords;
|
||||
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
|
||||
return;
|
||||
|
||||
// Note: because of how expandTerm works, the multiwords can
|
||||
// only come from the synonyms expansion, which means that, if
|
||||
// idxsynonyms is set, they have each been indexed as a single
|
||||
// term. So, if idxsynonyms is set, and is the current active
|
||||
// synonyms file, we just add them to the expansion.
|
||||
if (!multiwords.empty() && useidxsynonyms) {
|
||||
exp.insert(exp.end(), multiwords.begin(), multiwords.end());
|
||||
}
|
||||
|
||||
LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
|
||||
stringsToString(exp) << "\n");
|
||||
// groups is used for highlighting, we don't want prefixes in there.
|
||||
vector<string> noprefs;
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
noprefs.push_back(it->substr(prefix.size()));
|
||||
for (const auto& prefterm : exp) {
|
||||
noprefs.push_back(prefterm.substr(prefix.size()));
|
||||
}
|
||||
groups.push_back(noprefs);
|
||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
||||
m_curcl += exp.size();
|
||||
if (m_curcl >= getMaxCl())
|
||||
return;
|
||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||
if (exp.size() > 1)
|
||||
hadmultiple = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
|
||||
|
||||
@ -19,12 +19,15 @@
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <list>
|
||||
|
||||
#include "textsplit.h"
|
||||
#include "stoplist.h"
|
||||
#include "smallut.h"
|
||||
#include "utf8iter.h"
|
||||
#include "unacpp.h"
|
||||
#include "syngroups.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
@ -52,11 +55,13 @@ class TermProc {
|
||||
public:
|
||||
TermProc(TermProc* next) : m_next(next) {}
|
||||
virtual ~TermProc() {}
|
||||
/* Copy construction and assignment are forbidden */
|
||||
TermProc(const TermProc &) = delete;
|
||||
TermProc& operator=(const TermProc &) = delete;
|
||||
virtual bool takeword(const string &term, int pos, int bs, int be) {
|
||||
if (m_next)
|
||||
return m_next->takeword(term, pos, bs, be);
|
||||
else
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
// newpage() is like takeword(), but for page breaks.
|
||||
virtual void newpage(int pos) {
|
||||
@ -66,16 +71,10 @@ public:
|
||||
virtual bool flush() {
|
||||
if (m_next)
|
||||
return m_next->flush();
|
||||
else
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
TermProc *m_next;
|
||||
/* Copyconst and assignment private and forbidden */
|
||||
TermProc(const TermProc &) {}
|
||||
TermProc& operator=(const TermProc &) {
|
||||
return *this;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
@ -100,8 +99,7 @@ public:
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||
if (m_prc)
|
||||
return m_prc->takeword(term, pos, bs, be);
|
||||
else
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void newpage(int pos) {
|
||||
@ -119,12 +117,9 @@ private:
|
||||
class TermProcPrep : public TermProc {
|
||||
public:
|
||||
TermProcPrep(TermProc *nxt)
|
||||
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
|
||||
{
|
||||
}
|
||||
: TermProc(nxt) {}
|
||||
|
||||
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
||||
{
|
||||
virtual bool takeword(const string& itrm, int pos, int bs, int be) {
|
||||
m_totalterms++;
|
||||
string otrm;
|
||||
|
||||
@ -179,49 +174,37 @@ public:
|
||||
// change in here. This means that phrase searches and
|
||||
// snippets will be wrong, but at least searching for the
|
||||
// terms will work.
|
||||
bool hasspace = false;
|
||||
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
|
||||
if (*it == ' ') {
|
||||
hasspace=true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
bool hasspace = otrm.find(' ') != std::string::npos;
|
||||
if (hasspace) {
|
||||
std::vector<std::string> terms;
|
||||
stringToTokens(otrm, terms, " ", true);
|
||||
for (std::vector<std::string>::const_iterator it = terms.begin();
|
||||
it < terms.end(); it++) {
|
||||
if (!TermProc::takeword(*it, pos, bs, be)) {
|
||||
for (const auto& term : terms) {
|
||||
if (!TermProc::takeword(term, pos, bs, be)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return TermProc::takeword(otrm, pos, bs, be);
|
||||
}
|
||||
return TermProc::takeword(otrm, pos, bs, be);
|
||||
}
|
||||
|
||||
virtual bool flush()
|
||||
{
|
||||
virtual bool flush() {
|
||||
m_totalterms = m_unacerrors = 0;
|
||||
return TermProc::flush();
|
||||
}
|
||||
|
||||
private:
|
||||
int m_totalterms;
|
||||
int m_unacerrors;
|
||||
int m_totalterms{0};
|
||||
int m_unacerrors{0};
|
||||
};
|
||||
|
||||
/** Compare to stop words list and discard if match found */
|
||||
class TermProcStop : public TermProc {
|
||||
public:
|
||||
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
||||
: TermProc(nxt), m_stops(stops)
|
||||
{
|
||||
}
|
||||
: TermProc(nxt), m_stops(stops) {}
|
||||
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||
{
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||
if (m_stops.isStop(term)) {
|
||||
return true;
|
||||
}
|
||||
@ -232,6 +215,53 @@ private:
|
||||
const Rcl::StopList& m_stops;
|
||||
};
|
||||
|
||||
/** Generate multiword terms for multiword synonyms. This allows
|
||||
* NEAR/PHRASE searches for multiword synonyms. */
|
||||
class TermProcMulti : public TermProc {
|
||||
public:
|
||||
TermProcMulti(TermProc *nxt, const SynGroups& sg)
|
||||
: TermProc(nxt), m_groups(sg.getmultiwords()),
|
||||
m_maxl(sg.getmultiwordsmaxlength()) {}
|
||||
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||
if (m_maxl < 2) {
|
||||
// Should not have been pushed??
|
||||
return TermProc::takeword(term, pos, bs, be);
|
||||
}
|
||||
m_terms.push_back(term);
|
||||
if (m_terms.size() > m_maxl) {
|
||||
m_terms.pop_front();
|
||||
}
|
||||
string comp;
|
||||
int gsz{1};
|
||||
for (const auto& gterm : m_terms) {
|
||||
if (comp.empty()) {
|
||||
comp = gterm;
|
||||
continue;
|
||||
} else {
|
||||
comp += " ";
|
||||
comp += gterm;
|
||||
gsz++;
|
||||
// We could optimize by not testing m_groups for sizes
|
||||
// which do not exist.
|
||||
// if not gsz in sizes continue;
|
||||
}
|
||||
if (m_groups.find(comp) != m_groups.end()) {
|
||||
LOGDEB1("Found multiword synonym: [" << comp << "]\n");
|
||||
// TBD bs-be correct computation. Need to store the
|
||||
// values in a parallel list
|
||||
TermProc::takeword(comp, pos-gsz, bs-comp.size(), be);
|
||||
}
|
||||
}
|
||||
return TermProc::takeword(term, pos, bs, be);
|
||||
}
|
||||
|
||||
private:
|
||||
const std::set<std::string>& m_groups;
|
||||
size_t m_maxl{0};
|
||||
std::list<std::string> m_terms;
|
||||
};
|
||||
|
||||
/** Handle common-gram generation: combine frequent terms with neighbours to
|
||||
* shorten the positions lists for phrase searches.
|
||||
* NOTE: This does not currently work because of bad interaction with the
|
||||
@ -241,13 +271,11 @@ private:
|
||||
class TermProcCommongrams : public TermProc {
|
||||
public:
|
||||
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
||||
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
|
||||
{
|
||||
}
|
||||
: TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
|
||||
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||
{
|
||||
LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" );
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||
LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
|
||||
be << " [" << term << "]\n");
|
||||
bool isstop = m_stops.isStop(term);
|
||||
bool twogramemit = false;
|
||||
|
||||
@ -287,8 +315,7 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual bool flush()
|
||||
{
|
||||
virtual bool flush() {
|
||||
if (!m_prevsent && !m_prevterm.empty())
|
||||
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
||||
return false;
|
||||
@ -297,8 +324,7 @@ public:
|
||||
m_prevsent = true;
|
||||
return TermProc::flush();
|
||||
}
|
||||
void onlygrams(bool on)
|
||||
{
|
||||
void onlygrams(bool on) {
|
||||
m_onlygrams = on;
|
||||
}
|
||||
private:
|
||||
|
||||
@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
|
||||
# space issues.</descr></var>
|
||||
#idxtexttruncatelen = 0
|
||||
|
||||
# <var name="idxsynonyms" type="fn">
|
||||
#
|
||||
# <brief>Name of the index-time synonyms file.</brief>
|
||||
# <descr>This is used for indexing multiword synonyms as single terms,
|
||||
# which in turn is only useful if you want to perform proximity searches
|
||||
# with such terms.</descr></var>
|
||||
#idxsynonyms = thereisnodefaultidxsynonyms
|
||||
|
||||
# <var name="aspellLanguage" type="string">
|
||||
#
|
||||
# <brief>Language definitions to use when creating the aspell
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user