implemented multi-word terms indexing for phrase/prox search on multiword synonyms
This commit is contained in:
parent
cb13b8b6df
commit
aa2f0bfd73
@ -1374,9 +1374,9 @@ string RclConfig::getStopfile() const
|
|||||||
return getConfdirPath("stoplistfile", "stoplist.txt");
|
return getConfdirPath("stoplistfile", "stoplist.txt");
|
||||||
}
|
}
|
||||||
|
|
||||||
string RclConfig::getSynGroupsFile() const
|
string RclConfig::getIdxSynGroupsFile() const
|
||||||
{
|
{
|
||||||
return getConfdirPath("syngroupsfile", "syngroups.txt");
|
return getConfdirPath("idxsynonyms", "thereisnodefaultidxsynonyms");
|
||||||
}
|
}
|
||||||
|
|
||||||
// The index status file is fast changing, so it's possible to put it outside
|
// The index status file is fast changing, so it's possible to put it outside
|
||||||
|
|||||||
@ -72,8 +72,8 @@ private:
|
|||||||
|
|
||||||
// Hold the description for an external metadata-gathering command
|
// Hold the description for an external metadata-gathering command
|
||||||
struct MDReaper {
|
struct MDReaper {
|
||||||
string fieldname;
|
string fieldname;
|
||||||
vector<string> cmdv;
|
vector<string> cmdv;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Data associated with an indexed field name:
|
// Data associated with an indexed field name:
|
||||||
@ -90,7 +90,7 @@ struct FieldTraits {
|
|||||||
};
|
};
|
||||||
|
|
||||||
class RclConfig {
|
class RclConfig {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// Constructor: we normally look for a configuration file, except
|
// Constructor: we normally look for a configuration file, except
|
||||||
// if this was specified on the command line and passed through
|
// if this was specified on the command line and passed through
|
||||||
@ -100,7 +100,7 @@ class RclConfig {
|
|||||||
RclConfig(const RclConfig &r);
|
RclConfig(const RclConfig &r);
|
||||||
|
|
||||||
~RclConfig() {
|
~RclConfig() {
|
||||||
freeAll();
|
freeAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return a writable clone of the main config. This belongs to the
|
// Return a writable clone of the main config. This belongs to the
|
||||||
@ -133,18 +133,16 @@ class RclConfig {
|
|||||||
string getKeyDir() const {return m_keydir;}
|
string getKeyDir() const {return m_keydir;}
|
||||||
|
|
||||||
/** Get generic configuration parameter according to current keydir */
|
/** Get generic configuration parameter according to current keydir */
|
||||||
bool getConfParam(const string &name, string &value,
|
bool getConfParam(const string& name, string& value,
|
||||||
bool shallow=false) const
|
bool shallow=false) const {
|
||||||
{
|
if (m_conf == 0)
|
||||||
if (m_conf == 0)
|
return false;
|
||||||
return false;
|
return m_conf->get(name, value, m_keydir, shallow);
|
||||||
return m_conf->get(name, value, m_keydir, shallow);
|
|
||||||
}
|
}
|
||||||
/** Variant with autoconversion to int */
|
/** Variant with autoconversion to int */
|
||||||
bool getConfParam(const string &name, int *value, bool shallow=false) const;
|
bool getConfParam(const string &name, int *value, bool shallow=false) const;
|
||||||
/** Variant with autoconversion to bool */
|
/** Variant with autoconversion to bool */
|
||||||
bool getConfParam(const string &name, bool *value,
|
bool getConfParam(const string &name, bool *value, bool shallow=false) const;
|
||||||
bool shallow=false) const;
|
|
||||||
/** Variant with conversion to vector<string>
|
/** Variant with conversion to vector<string>
|
||||||
* (stringToStrings). Can fail if the string is malformed. */
|
* (stringToStrings). Can fail if the string is malformed. */
|
||||||
bool getConfParam(const string &name, vector<string> *value,
|
bool getConfParam(const string &name, vector<string> *value,
|
||||||
@ -164,18 +162,15 @@ class RclConfig {
|
|||||||
* Get list of config names under current sk, with possible
|
* Get list of config names under current sk, with possible
|
||||||
* wildcard filtering
|
* wildcard filtering
|
||||||
*/
|
*/
|
||||||
vector<string> getConfNames(const char *pattern = 0) const
|
vector<string> getConfNames(const char *pattern = 0) const {
|
||||||
{
|
return m_conf->getNames(m_keydir, pattern);
|
||||||
return m_conf->getNames(m_keydir, pattern);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Check if name exists anywhere in config */
|
/** Check if name exists anywhere in config */
|
||||||
bool hasNameAnywhere(const string& nm) const
|
bool hasNameAnywhere(const string& nm) const {
|
||||||
{
|
|
||||||
return m_conf? m_conf->hasNameAnywhere(nm) : false;
|
return m_conf? m_conf->hasNameAnywhere(nm) : false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Get default charset for current keydir (was set during setKeydir)
|
/** Get default charset for current keydir (was set during setKeydir)
|
||||||
* filenames are handled differently */
|
* filenames are handled differently */
|
||||||
const string &getDefCharset(bool filename = false) const;
|
const string &getDefCharset(bool filename = false) const;
|
||||||
@ -198,7 +193,7 @@ class RclConfig {
|
|||||||
/** Get stoplist file name */
|
/** Get stoplist file name */
|
||||||
string getStopfile() const;
|
string getStopfile() const;
|
||||||
/** Get synonym groups file name */
|
/** Get synonym groups file name */
|
||||||
string getSynGroupsFile() const;
|
string getIdxSynGroupsFile() const;
|
||||||
/** Get indexing pid file name */
|
/** Get indexing pid file name */
|
||||||
string getPidfile() const;
|
string getPidfile() const;
|
||||||
/** Get indexing status file name */
|
/** Get indexing status file name */
|
||||||
@ -207,7 +202,7 @@ class RclConfig {
|
|||||||
/** Do path translation according to the ptrans table */
|
/** Do path translation according to the ptrans table */
|
||||||
void urlrewrite(const string& dbdir, string& url) const;
|
void urlrewrite(const string& dbdir, string& url) const;
|
||||||
ConfSimple *getPTrans() {
|
ConfSimple *getPTrans() {
|
||||||
return m_ptrans;
|
return m_ptrans;
|
||||||
}
|
}
|
||||||
/** Get Web Queue directory name */
|
/** Get Web Queue directory name */
|
||||||
string getWebQueueDir() const;
|
string getWebQueueDir() const;
|
||||||
@ -215,13 +210,13 @@ class RclConfig {
|
|||||||
/** Get list of skipped file names for current keydir */
|
/** Get list of skipped file names for current keydir */
|
||||||
vector<string>& getSkippedNames();
|
vector<string>& getSkippedNames();
|
||||||
/** Get list of file name filters for current keydir (only those
|
/** Get list of file name filters for current keydir (only those
|
||||||
names indexed) */
|
names indexed) */
|
||||||
vector<string>& getOnlyNames();
|
vector<string>& getOnlyNames();
|
||||||
|
|
||||||
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
|
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
|
||||||
vector<string> getSkippedPaths() const;
|
vector<string> getSkippedPaths() const;
|
||||||
/** Get list of skipped paths patterns, daemon version (may add some)
|
/** Get list of skipped paths patterns, daemon version (may add some)
|
||||||
Doesn't depend on the keydir */
|
Doesn't depend on the keydir */
|
||||||
vector<string> getDaemSkippedPaths() const;
|
vector<string> getDaemSkippedPaths() const;
|
||||||
|
|
||||||
/** Return list of no content suffixes. Used by confgui, indexing uses
|
/** Return list of no content suffixes. Used by confgui, indexing uses
|
||||||
@ -260,7 +255,7 @@ class RclConfig {
|
|||||||
* @param whole the raw value. No way to escape a semi-colon in there.
|
* @param whole the raw value. No way to escape a semi-colon in there.
|
||||||
*/
|
*/
|
||||||
static bool valueSplitAttributes(const string& whole, string& value,
|
static bool valueSplitAttributes(const string& whole, string& value,
|
||||||
ConfSimple& attrs) ;
|
ConfSimple& attrs) ;
|
||||||
|
|
||||||
/** Compute difference between 'base' and 'changed', as elements to be
|
/** Compute difference between 'base' and 'changed', as elements to be
|
||||||
* added and subtracted from base. Input and output strings are in
|
* added and subtracted from base. Input and output strings are in
|
||||||
@ -288,9 +283,9 @@ class RclConfig {
|
|||||||
bool getGuiFilter(const string& filtername, string& frag) const;
|
bool getGuiFilter(const string& filtername, string& frag) const;
|
||||||
|
|
||||||
/** fields: get field prefix from field name. Use additional query
|
/** fields: get field prefix from field name. Use additional query
|
||||||
aliases if isquery is set */
|
aliases if isquery is set */
|
||||||
bool getFieldTraits(const string& fldname, const FieldTraits **,
|
bool getFieldTraits(const string& fldname, const FieldTraits **,
|
||||||
bool isquery = false) const;
|
bool isquery = false) const;
|
||||||
|
|
||||||
const set<string>& getStoredFields() const {return m_storedFields;}
|
const set<string>& getStoredFields() const {return m_storedFields;}
|
||||||
|
|
||||||
@ -311,11 +306,11 @@ class RclConfig {
|
|||||||
*/
|
*/
|
||||||
vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
|
vector<string> getFieldSectNames(const string &sk, const char* = 0) const;
|
||||||
bool getFieldConfParam(const string &name, const string &sk, string &value)
|
bool getFieldConfParam(const string &name, const string &sk, string &value)
|
||||||
const;
|
const;
|
||||||
|
|
||||||
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */
|
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */
|
||||||
string getMimeViewerDef(const string &mimetype, const string& apptag,
|
string getMimeViewerDef(const string &mimetype, const string& apptag,
|
||||||
bool useall) const;
|
bool useall) const;
|
||||||
set<string> getMimeViewerAllEx() const;
|
set<string> getMimeViewerAllEx() const;
|
||||||
bool setMimeViewerAllEx(const set<string>& allex);
|
bool setMimeViewerAllEx(const set<string>& allex);
|
||||||
bool getMimeViewerDefs(vector<pair<string, string> >&) const;
|
bool getMimeViewerDefs(vector<pair<string, string> >&) const;
|
||||||
@ -358,26 +353,25 @@ class RclConfig {
|
|||||||
string findFilter(const string& cmd) const;
|
string findFilter(const string& cmd) const;
|
||||||
|
|
||||||
/** Thread config init is not done automatically because not all
|
/** Thread config init is not done automatically because not all
|
||||||
programs need it and it uses the debug log so that it's better to
|
programs need it and it uses the debug log so that it's better to
|
||||||
call it after primary init */
|
call it after primary init */
|
||||||
void initThrConf();
|
void initThrConf();
|
||||||
|
|
||||||
const string& getOrigCwd()
|
const string& getOrigCwd() {
|
||||||
{
|
return o_origcwd;
|
||||||
return o_origcwd;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
RclConfig& operator=(const RclConfig &r) {
|
RclConfig& operator=(const RclConfig &r) {
|
||||||
if (this != &r) {
|
if (this != &r) {
|
||||||
freeAll();
|
freeAll();
|
||||||
initFrom(r);
|
initFrom(r);
|
||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
friend class ParamStale;
|
friend class ParamStale;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int m_ok;
|
int m_ok;
|
||||||
string m_reason; // Explanation for bad state
|
string m_reason; // Explanation for bad state
|
||||||
string m_confdir; // User directory where the customized files are stored
|
string m_confdir; // User directory where the customized files are stored
|
||||||
|
|||||||
@ -44,8 +44,7 @@ using namespace std;
|
|||||||
// groups anyway
|
// groups anyway
|
||||||
class SynGroups::Internal {
|
class SynGroups::Internal {
|
||||||
public:
|
public:
|
||||||
Internal() : ok(false) {
|
Internal() {}
|
||||||
}
|
|
||||||
void setpath(const string& fn) {
|
void setpath(const string& fn) {
|
||||||
path = path_canon(fn);
|
path = path_canon(fn);
|
||||||
stat(path.c_str(), &st);
|
stat(path.c_str(), &st);
|
||||||
@ -61,16 +60,22 @@ public:
|
|||||||
}
|
}
|
||||||
return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
|
return st.st_mtime == st1.st_mtime && st.st_size == st1.st_size;
|
||||||
}
|
}
|
||||||
bool ok;
|
bool ok{false};
|
||||||
// Term to group num
|
// Term to group num
|
||||||
std::unordered_map<string, unsigned int> terms;
|
std::unordered_map<string, unsigned int> terms;
|
||||||
// Group num to group
|
// Group num to group
|
||||||
vector<vector<string> > groups;
|
vector<vector<string> > groups;
|
||||||
|
|
||||||
|
// Aux: set of multiword synonyms used for generating multiword
|
||||||
|
// terms while indexing
|
||||||
|
std::set<std::string> multiwords;
|
||||||
|
size_t multiwords_maxlen{0};
|
||||||
|
|
||||||
std::string path;
|
std::string path;
|
||||||
struct stat st;
|
struct stat st;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool SynGroups::ok()
|
bool SynGroups::ok() const
|
||||||
{
|
{
|
||||||
return m && m->ok;
|
return m && m->ok;
|
||||||
}
|
}
|
||||||
@ -99,7 +104,7 @@ bool SynGroups::setfile(const string& fn)
|
|||||||
if (fn.empty()) {
|
if (fn.empty()) {
|
||||||
delete m;
|
delete m;
|
||||||
m = 0;
|
m = 0;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m->samefile(fn)) {
|
if (m->samefile(fn)) {
|
||||||
@ -111,8 +116,8 @@ bool SynGroups::setfile(const string& fn)
|
|||||||
ifstream input;
|
ifstream input;
|
||||||
input.open(fn.c_str(), ios::in);
|
input.open(fn.c_str(), ios::in);
|
||||||
if (!input.is_open()) {
|
if (!input.is_open()) {
|
||||||
LOGSYSERR("SynGroups:setfile", "open", fn);
|
LOGSYSERR("SynGroups:setfile", "open", fn);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
string cline;
|
string cline;
|
||||||
@ -120,21 +125,24 @@ bool SynGroups::setfile(const string& fn)
|
|||||||
string line;
|
string line;
|
||||||
bool eof = false;
|
bool eof = false;
|
||||||
int lnum = 0;
|
int lnum = 0;
|
||||||
|
m->groups.clear();
|
||||||
|
m->terms.clear();
|
||||||
|
m->multiwords.clear();
|
||||||
|
m->multiwords_maxlen = 0;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
cline.clear();
|
cline.clear();
|
||||||
getline(input, cline);
|
getline(input, cline);
|
||||||
if (!input.good()) {
|
if (!input.good()) {
|
||||||
if (input.bad()) {
|
if (input.bad()) {
|
||||||
LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
|
LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Must be eof ? But maybe we have a partial line which
|
// Must be eof ? But maybe we have a partial line which
|
||||||
// must be processed. This happens if the last line before
|
// must be processed. This happens if the last line before
|
||||||
// eof ends with a backslash, or there is no final \n
|
// eof ends with a backslash, or there is no final \n
|
||||||
eof = true;
|
eof = true;
|
||||||
}
|
}
|
||||||
lnum++;
|
lnum++;
|
||||||
|
|
||||||
{
|
{
|
||||||
string::size_type pos = cline.find_last_not_of("\n\r");
|
string::size_type pos = cline.find_last_not_of("\n\r");
|
||||||
@ -145,65 +153,85 @@ bool SynGroups::setfile(const string& fn)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (appending)
|
if (appending)
|
||||||
line += cline;
|
line += cline;
|
||||||
else
|
else
|
||||||
line = cline;
|
line = cline;
|
||||||
|
|
||||||
// Note that we trim whitespace before checking for backslash-eol
|
// Note that we trim whitespace before checking for backslash-eol
|
||||||
// This avoids invisible whitespace problems.
|
// This avoids invisible whitespace problems.
|
||||||
trimstring(line);
|
trimstring(line);
|
||||||
if (line.empty() || line.at(0) == '#') {
|
if (line.empty() || line.at(0) == '#') {
|
||||||
if (eof)
|
if (eof)
|
||||||
break;
|
break;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (line[line.length() - 1] == '\\') {
|
if (line[line.length() - 1] == '\\') {
|
||||||
line.erase(line.length() - 1);
|
line.erase(line.length() - 1);
|
||||||
appending = true;
|
appending = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
appending = false;
|
appending = false;
|
||||||
|
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
if (!stringToStrings(line, words)) {
|
if (!stringToStrings(line, words)) {
|
||||||
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
|
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
|
||||||
": " << line << "\n");
|
": " << line << "\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (words.empty())
|
if (words.empty())
|
||||||
continue;
|
continue;
|
||||||
if (words.size() == 1) {
|
if (words.size() == 1) {
|
||||||
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
|
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
|
||||||
<< lnum << " ??\n");
|
<< lnum << " ??\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
m->groups.push_back(words);
|
m->groups.push_back(words);
|
||||||
for (const auto& word : words) {
|
for (const auto& word : words) {
|
||||||
m->terms[word] = m->groups.size()-1;
|
m->terms[word] = m->groups.size()-1;
|
||||||
}
|
}
|
||||||
LOGDEB1("SynGroups::setfile: group: [" <<
|
LOGDEB1("SynGroups::setfile: group: [" <<
|
||||||
stringsToString(m->groups.back()) << "]\n");
|
stringsToString(m->groups.back()) << "]\n");
|
||||||
}
|
}
|
||||||
LOGDEB("SynGroups::setfile: got " << m->groups.size() <<
|
|
||||||
" distinct terms." << endl);
|
for (const auto& group : m->groups) {
|
||||||
|
for (const auto& term : group) {
|
||||||
|
std::vector<std::string> words;
|
||||||
|
stringToTokens(term, words);
|
||||||
|
if (words.size() > 1) {
|
||||||
|
std::string multiword;
|
||||||
|
for (const auto& word : words) {
|
||||||
|
if (!multiword.empty()) {
|
||||||
|
multiword += " ";
|
||||||
|
}
|
||||||
|
multiword += word;
|
||||||
|
}
|
||||||
|
m->multiwords.insert(multiword);
|
||||||
|
if (m->multiwords_maxlen < words.size()) {
|
||||||
|
m->multiwords_maxlen = words.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOGDEB("SynGroups::setfile: got " << m->groups.size() << " distinct terms. "
|
||||||
|
"Multiwords: " << stringsToString(m->multiwords) <<"\n");
|
||||||
m->ok = true;
|
m->ok = true;
|
||||||
m->setpath(fn);
|
m->setpath(fn);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> SynGroups::getgroup(const string& term)
|
vector<string> SynGroups::getgroup(const string& term) const
|
||||||
{
|
{
|
||||||
vector<string> ret;
|
vector<string> ret;
|
||||||
if (!ok())
|
if (!ok())
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
const auto it1 = m->terms.find(term);
|
const auto it1 = m->terms.find(term);
|
||||||
if (it1 == m->terms.end()) {
|
if (it1 == m->terms.end()) {
|
||||||
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
|
LOGDEB0("SynGroups::getgroup: [" << term << "] not found in map\n");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int idx = it1->second;
|
unsigned int idx = it1->second;
|
||||||
@ -215,3 +243,18 @@ vector<string> SynGroups::getgroup(const string& term)
|
|||||||
<< endl);
|
<< endl);
|
||||||
return m->groups[idx];
|
return m->groups[idx];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const std::set<std::string>& SynGroups::getmultiwords() const
|
||||||
|
{
|
||||||
|
return m->multiwords;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t SynGroups::getmultiwordsmaxlength() const
|
||||||
|
{
|
||||||
|
return m->multiwords_maxlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string& SynGroups::getpath() const
|
||||||
|
{
|
||||||
|
return m->path;
|
||||||
|
}
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2015 J.F.Dockes
|
/* Copyright (C) 2015-2021 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -20,6 +20,7 @@
|
|||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
// Manage synonym groups. This is very different from stemming and
|
// Manage synonym groups. This is very different from stemming and
|
||||||
// case/diac expansion because there is no reference form: all terms
|
// case/diac expansion because there is no reference form: all terms
|
||||||
@ -34,8 +35,11 @@ public:
|
|||||||
SynGroups& operator=(const SynGroups&&) = delete;
|
SynGroups& operator=(const SynGroups&&) = delete;
|
||||||
|
|
||||||
bool setfile(const std::string& fname);
|
bool setfile(const std::string& fname);
|
||||||
std::vector<std::string> getgroup(const std::string& term);
|
std::vector<std::string> getgroup(const std::string& term) const;
|
||||||
bool ok();
|
const std::set<std::string>& getmultiwords() const;
|
||||||
|
size_t getmultiwordsmaxlength() const;
|
||||||
|
const std::string& getpath() const;
|
||||||
|
bool ok() const;
|
||||||
private:
|
private:
|
||||||
class Internal;
|
class Internal;
|
||||||
Internal *m;
|
Internal *m;
|
||||||
|
|||||||
@ -867,12 +867,12 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi,
|
|||||||
bool Db::o_inPlaceReset;
|
bool Db::o_inPlaceReset;
|
||||||
|
|
||||||
Db::Db(const RclConfig *cfp)
|
Db::Db(const RclConfig *cfp)
|
||||||
: m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0),
|
|
||||||
m_occtxtsz(0), m_occFirstCheck(1), m_idxMetaStoredLen(150),
|
|
||||||
m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4),
|
|
||||||
m_flushMb(-1), m_maxFsOccupPc(0)
|
|
||||||
{
|
{
|
||||||
m_config = new RclConfig(*cfp);
|
m_config = new RclConfig(*cfp);
|
||||||
|
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||||
|
m_config->getConfParam("idxflushmb", &m_flushMb);
|
||||||
|
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
|
||||||
|
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
|
||||||
if (start_of_field_term.empty()) {
|
if (start_of_field_term.empty()) {
|
||||||
if (o_index_stripchars) {
|
if (o_index_stripchars) {
|
||||||
start_of_field_term = "XXST";
|
start_of_field_term = "XXST";
|
||||||
@ -882,20 +882,13 @@ Db::Db(const RclConfig *cfp)
|
|||||||
end_of_field_term = "XXND/";
|
end_of_field_term = "XXND/";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
m_ndb = new Native(this);
|
m_ndb = new Native(this);
|
||||||
if (m_config) {
|
|
||||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
|
||||||
m_config->getConfParam("idxflushmb", &m_flushMb);
|
|
||||||
m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen);
|
|
||||||
m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Db::~Db()
|
Db::~Db()
|
||||||
{
|
{
|
||||||
LOGDEB2("Db::~Db\n");
|
LOGDEB2("Db::~Db\n");
|
||||||
if (m_ndb == 0)
|
if (nullptr == m_ndb)
|
||||||
return;
|
return;
|
||||||
LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
|
LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " <<
|
||||||
m_ndb->m_iswritable << "\n");
|
m_ndb->m_iswritable << "\n");
|
||||||
@ -913,7 +906,6 @@ vector<string> Db::getStemmerNames()
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool Db::open(OpenMode mode, OpenError *error)
|
bool Db::open(OpenMode mode, OpenError *error)
|
||||||
{
|
{
|
||||||
if (error)
|
if (error)
|
||||||
@ -934,12 +926,19 @@ bool Db::open(OpenMode mode, OpenError *error)
|
|||||||
if (!m_config->getStopfile().empty())
|
if (!m_config->getStopfile().empty())
|
||||||
m_stops.setFile(m_config->getStopfile());
|
m_stops.setFile(m_config->getStopfile());
|
||||||
|
|
||||||
|
if (isWriteMode(mode)) {
|
||||||
|
// Check for an index-time synonyms file. We use this to
|
||||||
|
// generate multiword terms for multiword synonyms
|
||||||
|
string synfile = m_config->getIdxSynGroupsFile();
|
||||||
|
if (path_exists(synfile)) {
|
||||||
|
setSynGroupsFile(synfile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
string dir = m_config->getDbDir();
|
string dir = m_config->getDbDir();
|
||||||
string ermsg;
|
string ermsg;
|
||||||
try {
|
try {
|
||||||
switch (mode) {
|
if (isWriteMode(mode)) {
|
||||||
case DbUpd:
|
|
||||||
case DbTrunc:
|
|
||||||
m_ndb->openWrite(dir, mode);
|
m_ndb->openWrite(dir, mode);
|
||||||
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
|
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
|
||||||
// We used to open a readonly object in addition to the
|
// We used to open a readonly object in addition to the
|
||||||
@ -951,9 +950,7 @@ bool Db::open(OpenMode mode, OpenError *error)
|
|||||||
// so the query db is now a clone of the update one.
|
// so the query db is now a clone of the update one.
|
||||||
m_ndb->xrdb = m_ndb->xwdb;
|
m_ndb->xrdb = m_ndb->xwdb;
|
||||||
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
|
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
|
||||||
break;
|
} else {
|
||||||
case DbRO:
|
|
||||||
default:
|
|
||||||
m_ndb->openRead(dir);
|
m_ndb->openRead(dir);
|
||||||
for (auto& db : m_extraDbs) {
|
for (auto& db : m_extraDbs) {
|
||||||
if (error)
|
if (error)
|
||||||
@ -963,7 +960,6 @@ bool Db::open(OpenMode mode, OpenError *error)
|
|||||||
// but I can't see why
|
// but I can't see why
|
||||||
m_ndb->xrdb.add_database(Xapian::Database(db));
|
m_ndb->xrdb.add_database(Xapian::Database(db));
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
if (error)
|
if (error)
|
||||||
*error = DbOpenMainDb;
|
*error = DbOpenMainDb;
|
||||||
@ -1531,6 +1527,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||||
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||||
|
|
||||||
|
TermProcMulti tpmulti(nxt, m_syngroups);
|
||||||
|
if (m_syngroups.getmultiwordsmaxlength() > 1) {
|
||||||
|
nxt = &tpmulti;
|
||||||
|
}
|
||||||
|
|
||||||
TermProcPrep tpprep(nxt);
|
TermProcPrep tpprep(nxt);
|
||||||
if (o_index_stripchars)
|
if (o_index_stripchars)
|
||||||
nxt = &tpprep;
|
nxt = &tpprep;
|
||||||
|
|||||||
@ -114,14 +114,13 @@ public:
|
|||||||
|
|
||||||
class DbStats {
|
class DbStats {
|
||||||
public:
|
public:
|
||||||
DbStats()
|
DbStats() {}
|
||||||
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
|
|
||||||
// Index-wide stats
|
// Index-wide stats
|
||||||
unsigned int dbdoccount;
|
unsigned int dbdoccount{0};
|
||||||
double dbavgdoclen;
|
double dbavgdoclen{0};
|
||||||
size_t mindoclen;
|
size_t mindoclen{0};
|
||||||
size_t maxdoclen;
|
size_t maxdoclen{0};
|
||||||
vector<string> failedurls; /* Only set if requested */
|
std::vector<std::string> failedurls; /* Only set if requested */
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool has_prefix(const string& trm)
|
inline bool has_prefix(const string& trm)
|
||||||
@ -175,6 +174,9 @@ public:
|
|||||||
~Db();
|
~Db();
|
||||||
|
|
||||||
enum OpenMode {DbRO, DbUpd, DbTrunc};
|
enum OpenMode {DbRO, DbUpd, DbTrunc};
|
||||||
|
/** Tell if the open mode allows writing to the index (update or
 *  truncate). This does not use any instance state, so it is static:
 *  existing unqualified calls and calls through an object still work. */
static bool isWriteMode(OpenMode mode) {
    return mode == DbUpd || mode == DbTrunc;
}
|
||||||
enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
|
enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
|
||||||
bool open(OpenMode mode, OpenError *error = 0);
|
bool open(OpenMode mode, OpenError *error = 0);
|
||||||
bool close();
|
bool close();
|
||||||
@ -342,7 +344,7 @@ public:
|
|||||||
bool setExtraQueryDbs(const std::vector<std::string>& dbs);
|
bool setExtraQueryDbs(const std::vector<std::string>& dbs);
|
||||||
|
|
||||||
/** Check if document comes from the main index (this is used to
|
/** Check if document comes from the main index (this is used to
|
||||||
decide if we can update the index for it) */
|
decide if we can update the index for it) */
|
||||||
bool fromMainIndex(const Doc& doc);
|
bool fromMainIndex(const Doc& doc);
|
||||||
|
|
||||||
/** Retrieve the stored doc text. This returns false if the index does not
|
/** Retrieve the stored doc text. This returns false if the index does not
|
||||||
@ -499,6 +501,7 @@ public:
|
|||||||
|
|
||||||
// Use empty fn for no synonyms
|
// Use empty fn for no synonyms
|
||||||
bool setSynGroupsFile(const std::string& fn);
|
bool setSynGroupsFile(const std::string& fn);
|
||||||
|
// Read-only access to the synonym groups object held by this Db
const SynGroups& getSynGroups() {return m_syngroups;}
|
||||||
|
|
||||||
// Mark all documents with an UDI having input as prefix as
|
// Mark all documents with an UDI having input as prefix as
|
||||||
// existing. Only works if the UDIs for the store are
|
// existing. Only works if the UDIs for the store are
|
||||||
@ -508,25 +511,26 @@ public:
|
|||||||
bool udiTreeMarkExisting(const string& udi);
|
bool udiTreeMarkExisting(const string& udi);
|
||||||
|
|
||||||
/* This has to be public for access by embedded Query::Native */
|
/* This has to be public for access by embedded Query::Native */
|
||||||
Native *m_ndb;
|
Native *m_ndb{nullptr};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const RclConfig *m_config;
|
const RclConfig *m_config;
|
||||||
string m_reason; // Error explanation
|
string m_reason; // Error explanation
|
||||||
|
|
||||||
// Xapian directories for additional databases to query
|
// Xapian directories for additional databases to query
|
||||||
vector<string> m_extraDbs;
|
vector<string> m_extraDbs;
|
||||||
OpenMode m_mode;
|
OpenMode m_mode{Db::DbRO};
|
||||||
// File existence vector: this is filled during the indexing pass. Any
|
// File existence vector: this is filled during the indexing pass. Any
|
||||||
// document whose bit is not set at the end is purged
|
// document whose bit is not set at the end is purged
|
||||||
vector<bool> updated;
|
vector<bool> updated;
|
||||||
// Text bytes indexed since beginning
|
// Text bytes indexed since beginning
|
||||||
long long m_curtxtsz;
|
long long m_curtxtsz{0};
|
||||||
// Text bytes at last flush
|
// Text bytes at last flush
|
||||||
long long m_flushtxtsz;
|
long long m_flushtxtsz{0};
|
||||||
// Text bytes at last fsoccup check
|
// Text bytes at last fsoccup check
|
||||||
long long m_occtxtsz;
|
long long m_occtxtsz{0};
|
||||||
// First fs occup check ?
|
// First fs occup check ?
|
||||||
int m_occFirstCheck;
|
int m_occFirstCheck{1};
|
||||||
|
|
||||||
// Synonym groups. There is no strict reason that this has to be
|
// Synonym groups. There is no strict reason that this has to be
|
||||||
// an Rcl::Db member, as it is only used when building each It
|
// an Rcl::Db member, as it is only used when building each It
|
||||||
@ -538,32 +542,31 @@ private:
|
|||||||
SynGroups m_syngroups;
|
SynGroups m_syngroups;
|
||||||
|
|
||||||
// Aspell object if needed
|
// Aspell object if needed
|
||||||
Aspell *m_aspell = nullptr;
|
Aspell *m_aspell{nullptr};
|
||||||
|
|
||||||
/***************
|
/***************
|
||||||
* Parameters cached out of the configuration files. Logically const
|
* Parameters cached out of the configuration files. Logically const
|
||||||
* after init */
|
* after init */
|
||||||
// Stop terms: those don't get indexed.
|
// Stop terms: those don't get indexed.
|
||||||
StopList m_stops;
|
StopList m_stops;
|
||||||
|
|
||||||
// Truncation length for stored meta fields
|
// Truncation length for stored meta fields
|
||||||
int m_idxMetaStoredLen;
|
int m_idxMetaStoredLen{150};
|
||||||
// This is how long an abstract we keep or build from beginning of
|
// This is how long an abstract we keep or build from beginning of
|
||||||
// text when indexing. It only has an influence on the size of the
|
// text when indexing. It only has an influence on the size of the
|
||||||
// db as we are free to shorten it again when displaying
|
// db as we are free to shorten it again when displaying
|
||||||
int m_idxAbsTruncLen;
|
int m_idxAbsTruncLen{250};
|
||||||
// Document text truncation length
|
// Document text truncation length
|
||||||
int m_idxTextTruncateLen{0};
|
int m_idxTextTruncateLen{0};
|
||||||
// This is the size of the abstract that we synthetize out of query
|
// This is the size of the abstract that we synthetize out of query
|
||||||
// term contexts at *query time*
|
// term contexts at *query time*
|
||||||
int m_synthAbsLen;
|
int m_synthAbsLen{250};
|
||||||
// This is how many words (context size) we keep around query terms
|
// This is how many words (context size) we keep around query terms
|
||||||
// when building the abstract
|
// when building the abstract
|
||||||
int m_synthAbsWordCtxLen;
|
int m_synthAbsWordCtxLen{4};
|
||||||
// Flush threshold. Megabytes of text indexed before we flush.
|
// Flush threshold. Megabytes of text indexed before we flush.
|
||||||
int m_flushMb;
|
int m_flushMb{-1};
|
||||||
// Maximum file system occupation percentage
|
// Maximum file system occupation percentage
|
||||||
int m_maxFsOccupPc;
|
int m_maxFsOccupPc{0};
|
||||||
// Database directory
|
// Database directory
|
||||||
string m_basedir;
|
string m_basedir;
|
||||||
// When this is set, all documents are considered as needing a reindex.
|
// When this is set, all documents are considered as needing a reindex.
|
||||||
|
|||||||
@ -283,7 +283,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
|||||||
LOGDEB("Db::TermMatch: syngroups out: " <<
|
LOGDEB("Db::TermMatch: syngroups out: " <<
|
||||||
term << " -> " << stringsToString(sg) << "\n");
|
term << " -> " << stringsToString(sg) << "\n");
|
||||||
for (const auto& synonym : sg) {
|
for (const auto& synonym : sg) {
|
||||||
if (synonym.find_first_of(" ") != string::npos) {
|
if (synonym.find(' ') != string::npos) {
|
||||||
if (multiwords) {
|
if (multiwords) {
|
||||||
multiwords->push_back(synonym);
|
multiwords->push_back(synonym);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -602,13 +602,12 @@ void SearchDataClauseSimple::processSimpleSpan(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Push phrases for the multi-word expansions
|
// Push phrases for the multi-word expansions
|
||||||
for (vector<string>::const_iterator mwp = multiwords.begin();
|
for (const auto& mw : multiwords) {
|
||||||
mwp != multiwords.end(); mwp++) {
|
|
||||||
vector<string> phr;
|
vector<string> phr;
|
||||||
// We just do a basic split to keep things a bit simpler here
|
// We just do a basic split to keep things a bit simpler here
|
||||||
// (no textsplit). This means though that no punctuation is
|
// (no textsplit). This means though that no punctuation is
|
||||||
// allowed in multi-word synonyms.
|
// allowed in multi-word synonyms.
|
||||||
stringToTokens(*mwp, phr);
|
stringToTokens(mw, phr);
|
||||||
if (!prefix.empty())
|
if (!prefix.empty())
|
||||||
prefix_vector(phr, prefix);
|
prefix_vector(phr, prefix);
|
||||||
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
||||||
@ -624,20 +623,19 @@ void SearchDataClauseSimple::processSimpleSpan(
|
|||||||
// NEAR xapian query, the elements of which can themselves be OR
|
// NEAR xapian query, the elements of which can themselves be OR
|
||||||
// queries if the terms get expanded by stemming or wildcards (we
|
// queries if the terms get expanded by stemming or wildcards (we
|
||||||
// don't do stemming for PHRASE though)
|
// don't do stemming for PHRASE though)
|
||||||
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
void SearchDataClauseSimple::processPhraseOrNear(
|
||||||
TermProcQ *splitData,
|
Rcl::Db &db, string& ermsg, TermProcQ *splitData, int mods, void *pq,
|
||||||
int mods, void *pq,
|
bool useNear, int slack)
|
||||||
bool useNear, int slack)
|
|
||||||
{
|
{
|
||||||
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
||||||
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
||||||
Xapian::Query::OP_PHRASE;
|
Xapian::Query::OP_PHRASE;
|
||||||
vector<Xapian::Query> orqueries;
|
vector<Xapian::Query> orqueries;
|
||||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
||||||
bool hadmultiple = false;
|
|
||||||
#endif
|
|
||||||
vector<vector<string> >groups;
|
vector<vector<string> >groups;
|
||||||
|
|
||||||
|
bool useidxsynonyms =
|
||||||
|
db.getSynGroups().getpath() == db.getConf()->getIdxSynGroupsFile();
|
||||||
|
|
||||||
string prefix;
|
string prefix;
|
||||||
const FieldTraits *ftp;
|
const FieldTraits *ftp;
|
||||||
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
|
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
|
||||||
@ -650,32 +648,38 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Go through the list and perform stem/wildcard expansion for each element
|
// Go through the list and perform stem/wildcard expansion for each element
|
||||||
vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
|
auto nxit = splitData->nostemexps().begin();
|
||||||
for (vector<string>::const_iterator it = splitData->terms().begin();
|
for (auto it = splitData->terms().begin();
|
||||||
it != splitData->terms().end(); it++, nxit++) {
|
it != splitData->terms().end(); it++, nxit++) {
|
||||||
LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
|
LOGDEB0("ProcessPhrase: processing [" << *it << "]\n");
|
||||||
// Adjust when we do stem expansion. Not if disabled by
|
// Adjust when we do stem expansion. Not if disabled by
|
||||||
// caller, not inside phrases, and some versions of xapian
|
// caller, not inside phrases, and some versions of xapian
|
||||||
// will accept only one OR clause inside NEAR.
|
// will accept only one OR clause inside NEAR.
|
||||||
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
|
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE);
|
||||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
||||||
|| hadmultiple
|
|
||||||
#endif // single OR inside NEAR
|
|
||||||
;
|
|
||||||
int lmods = mods;
|
int lmods = mods;
|
||||||
if (nostemexp)
|
if (nostemexp)
|
||||||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||||
string sterm;
|
string sterm;
|
||||||
vector<string> exp;
|
vector<string> exp;
|
||||||
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
|
vector<string> multiwords;
|
||||||
|
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix, &multiwords))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
// Note: because of how expandTerm works, the multiwords can
|
||||||
|
// only come from the synonyms expansion, which means that, if
|
||||||
|
// idxsynonyms is set, they have each been indexed as a single
|
||||||
|
// term. So, if idxsynonyms is set, and is the current active
|
||||||
|
// synonyms file, we just add them to the expansion.
|
||||||
|
if (!multiwords.empty() && useidxsynonyms) {
|
||||||
|
exp.insert(exp.end(), multiwords.begin(), multiwords.end());
|
||||||
|
}
|
||||||
|
|
||||||
LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
|
LOGDEB0("ProcessPhraseOrNear: exp size " << exp.size() << ", exp: " <<
|
||||||
stringsToString(exp) << "\n");
|
stringsToString(exp) << "\n");
|
||||||
// groups is used for highlighting, we don't want prefixes in there.
|
// groups is used for highlighting, we don't want prefixes in there.
|
||||||
vector<string> noprefs;
|
vector<string> noprefs;
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
for (const auto& prefterm : exp) {
|
||||||
it != exp.end(); it++) {
|
noprefs.push_back(prefterm.substr(prefix.size()));
|
||||||
noprefs.push_back(it->substr(prefix.size()));
|
|
||||||
}
|
}
|
||||||
groups.push_back(noprefs);
|
groups.push_back(noprefs);
|
||||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||||
@ -683,10 +687,6 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
|||||||
m_curcl += exp.size();
|
m_curcl += exp.size();
|
||||||
if (m_curcl >= getMaxCl())
|
if (m_curcl >= getMaxCl())
|
||||||
return;
|
return;
|
||||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
||||||
if (exp.size() > 1)
|
|
||||||
hadmultiple = true;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
|
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
|
||||||
|
|||||||
@ -19,12 +19,15 @@
|
|||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <set>
|
||||||
|
#include <list>
|
||||||
|
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
#include "stoplist.h"
|
#include "stoplist.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
|
#include "syngroups.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
@ -52,11 +55,13 @@ class TermProc {
|
|||||||
public:
|
public:
|
||||||
TermProc(TermProc* next) : m_next(next) {}
|
TermProc(TermProc* next) : m_next(next) {}
|
||||||
virtual ~TermProc() {}
|
virtual ~TermProc() {}
|
||||||
|
/* Copyconst and assignment forbidden */
|
||||||
|
TermProc(const TermProc &) = delete;
|
||||||
|
TermProc& operator=(const TermProc &) = delete;
|
||||||
virtual bool takeword(const string &term, int pos, int bs, int be) {
|
virtual bool takeword(const string &term, int pos, int bs, int be) {
|
||||||
if (m_next)
|
if (m_next)
|
||||||
return m_next->takeword(term, pos, bs, be);
|
return m_next->takeword(term, pos, bs, be);
|
||||||
else
|
return true;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
// newpage() is like takeword(), but for page breaks.
|
// newpage() is like takeword(), but for page breaks.
|
||||||
virtual void newpage(int pos) {
|
virtual void newpage(int pos) {
|
||||||
@ -66,16 +71,10 @@ public:
|
|||||||
virtual bool flush() {
|
virtual bool flush() {
|
||||||
if (m_next)
|
if (m_next)
|
||||||
return m_next->flush();
|
return m_next->flush();
|
||||||
else
|
return true;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
TermProc *m_next;
|
TermProc *m_next;
|
||||||
/* Copyconst and assignment private and forbidden */
|
|
||||||
TermProc(const TermProc &) {}
|
|
||||||
TermProc& operator=(const TermProc &) {
|
|
||||||
return *this;
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -100,8 +99,7 @@ public:
|
|||||||
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||||
if (m_prc)
|
if (m_prc)
|
||||||
return m_prc->takeword(term, pos, bs, be);
|
return m_prc->takeword(term, pos, bs, be);
|
||||||
else
|
return true;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void newpage(int pos) {
|
virtual void newpage(int pos) {
|
||||||
@ -119,12 +117,9 @@ private:
|
|||||||
class TermProcPrep : public TermProc {
|
class TermProcPrep : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcPrep(TermProc *nxt)
|
TermProcPrep(TermProc *nxt)
|
||||||
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
|
: TermProc(nxt) {}
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
virtual bool takeword(const string& itrm, int pos, int bs, int be) {
|
||||||
{
|
|
||||||
m_totalterms++;
|
m_totalterms++;
|
||||||
string otrm;
|
string otrm;
|
||||||
|
|
||||||
@ -179,49 +174,37 @@ public:
|
|||||||
// change in here. This means that phrase searches and
|
// change in here. This means that phrase searches and
|
||||||
// snippets will be wrong, but at least searching for the
|
// snippets will be wrong, but at least searching for the
|
||||||
// terms will work.
|
// terms will work.
|
||||||
bool hasspace = false;
|
bool hasspace = otrm.find(' ') != std::string::npos;
|
||||||
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
|
|
||||||
if (*it == ' ') {
|
|
||||||
hasspace=true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (hasspace) {
|
if (hasspace) {
|
||||||
std::vector<std::string> terms;
|
std::vector<std::string> terms;
|
||||||
stringToTokens(otrm, terms, " ", true);
|
stringToTokens(otrm, terms, " ", true);
|
||||||
for (std::vector<std::string>::const_iterator it = terms.begin();
|
for (const auto& term : terms) {
|
||||||
it < terms.end(); it++) {
|
if (!TermProc::takeword(term, pos, bs, be)) {
|
||||||
if (!TermProc::takeword(*it, pos, bs, be)) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
} else {
|
|
||||||
return TermProc::takeword(otrm, pos, bs, be);
|
|
||||||
}
|
}
|
||||||
|
return TermProc::takeword(otrm, pos, bs, be);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool flush()
|
virtual bool flush() {
|
||||||
{
|
|
||||||
m_totalterms = m_unacerrors = 0;
|
m_totalterms = m_unacerrors = 0;
|
||||||
return TermProc::flush();
|
return TermProc::flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int m_totalterms;
|
int m_totalterms{0};
|
||||||
int m_unacerrors;
|
int m_unacerrors{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Compare to stop words list and discard if match found */
|
/** Compare to stop words list and discard if match found */
|
||||||
class TermProcStop : public TermProc {
|
class TermProcStop : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
: TermProc(nxt), m_stops(stops)
|
: TermProc(nxt), m_stops(stops) {}
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||||
{
|
|
||||||
if (m_stops.isStop(term)) {
|
if (m_stops.isStop(term)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -232,6 +215,53 @@ private:
|
|||||||
const Rcl::StopList& m_stops;
|
const Rcl::StopList& m_stops;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Generate multiword terms for multiword synonyms. This allows
|
||||||
|
* NEAR/PHRASE searches for multiword synonyms. */
|
||||||
|
class TermProcMulti : public TermProc {
|
||||||
|
public:
|
||||||
|
TermProcMulti(TermProc *nxt, const SynGroups& sg)
|
||||||
|
: TermProc(nxt), m_groups(sg.getmultiwords()),
|
||||||
|
m_maxl(sg.getmultiwordsmaxlength()) {}
|
||||||
|
|
||||||
|
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||||
|
if (m_maxl < 2) {
|
||||||
|
// Should not have been pushed??
|
||||||
|
return TermProc::takeword(term, pos, bs, be);
|
||||||
|
}
|
||||||
|
m_terms.push_back(term);
|
||||||
|
if (m_terms.size() > m_maxl) {
|
||||||
|
m_terms.pop_front();
|
||||||
|
}
|
||||||
|
string comp;
|
||||||
|
int gsz{1};
|
||||||
|
for (const auto& gterm : m_terms) {
|
||||||
|
if (comp.empty()) {
|
||||||
|
comp = gterm;
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
comp += " ";
|
||||||
|
comp += gterm;
|
||||||
|
gsz++;
|
||||||
|
// We could optimize by not testing m_groups for sizes
|
||||||
|
// which do not exist.
|
||||||
|
// if not gsz in sizes continue;
|
||||||
|
}
|
||||||
|
if (m_groups.find(comp) != m_groups.end()) {
|
||||||
|
LOGDEB1("Found multiword synonym: [" << comp << "]\n");
|
||||||
|
// TBD bs-be correct computation. Need to store the
|
||||||
|
// values in a parallel list
|
||||||
|
TermProc::takeword(comp, pos-gsz, bs-comp.size(), be);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return TermProc::takeword(term, pos, bs, be);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const std::set<std::string>& m_groups;
|
||||||
|
size_t m_maxl{0};
|
||||||
|
std::list<std::string> m_terms;
|
||||||
|
};
|
||||||
|
|
||||||
/** Handle common-gram generation: combine frequent terms with neighbours to
|
/** Handle common-gram generation: combine frequent terms with neighbours to
|
||||||
* shorten the positions lists for phrase searches.
|
* shorten the positions lists for phrase searches.
|
||||||
* NOTE: This does not currently work because of bad interaction with the
|
* NOTE: This does not currently work because of bad interaction with the
|
||||||
@ -241,13 +271,11 @@ private:
|
|||||||
class TermProcCommongrams : public TermProc {
|
class TermProcCommongrams : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
|
: TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||||
{
|
LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
|
||||||
LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" );
|
be << " [" << term << "]\n");
|
||||||
bool isstop = m_stops.isStop(term);
|
bool isstop = m_stops.isStop(term);
|
||||||
bool twogramemit = false;
|
bool twogramemit = false;
|
||||||
|
|
||||||
@ -287,8 +315,7 @@ public:
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool flush()
|
virtual bool flush() {
|
||||||
{
|
|
||||||
if (!m_prevsent && !m_prevterm.empty())
|
if (!m_prevsent && !m_prevterm.empty())
|
||||||
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
||||||
return false;
|
return false;
|
||||||
@ -297,8 +324,7 @@ public:
|
|||||||
m_prevsent = true;
|
m_prevsent = true;
|
||||||
return TermProc::flush();
|
return TermProc::flush();
|
||||||
}
|
}
|
||||||
void onlygrams(bool on)
|
void onlygrams(bool on) {
|
||||||
{
|
|
||||||
m_onlygrams = on;
|
m_onlygrams = on;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -767,6 +767,14 @@ checkneedretryindexscript = rclcheckneedretry.sh
|
|||||||
# space issues.</descr></var>
|
# space issues.</descr></var>
|
||||||
#idxtexttruncatelen = 0
|
#idxtexttruncatelen = 0
|
||||||
|
|
||||||
|
# <var name="idxsynonyms" type="fn">
|
||||||
|
#
|
||||||
|
# <brief>Name of the index-time synonyms file.</brief>
|
||||||
|
# <descr>This is used for indexing multiword synonyms as single terms,
|
||||||
|
# which in turn is only useful if you want to perform proximity searches
|
||||||
|
# with such terms.</descr></var>
|
||||||
|
#idxsynonyms = thereisnodefaultidxsynonyms
|
||||||
|
|
||||||
# <var name="aspellLanguage" type="string">
|
# <var name="aspellLanguage" type="string">
|
||||||
#
|
#
|
||||||
# <brief>Language definitions to use when creating the aspell
|
# <brief>Language definitions to use when creating the aspell
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user