Support multi-word synonyms and add a modifier to turn off synonym expansion
This commit is contained in:
parent
766a34a8db
commit
e7a669b668
@ -3344,6 +3344,11 @@ dir:recoll dir:src -dir:utils -dir:common
|
||||
stemming is off by default for phrases).</para>
|
||||
</listitem>
|
||||
|
||||
<listitem><para><literal>s</literal> can be used to turn off
|
||||
synonym expansion, if a synonyms file is in place (only for
|
||||
&RCL; 1.22 and later).</para>
|
||||
</listitem>
|
||||
|
||||
<listitem><para><literal>o</literal> can be used to specify a
|
||||
"slack" for phrase and proximity searches: the number of
|
||||
additional terms that may be found between the specified
|
||||
|
||||
@ -282,6 +282,11 @@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
|
||||
//cerr << "set slack " << cl->getslack() << " done" << endl;
|
||||
}
|
||||
break;
|
||||
case 's':
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
|
||||
break;
|
||||
case 'S':
|
||||
break;
|
||||
case '.':case '0':case '1':case '2':case '3':case '4':
|
||||
case '5':case '6':case '7':case '8':case '9':
|
||||
{
|
||||
|
||||
@ -341,8 +341,9 @@ class Db {
|
||||
* Stem expansion is performed if lang is not empty
|
||||
*
|
||||
* @param typ_sens defines the kind of expansion: none, wildcard,
|
||||
* regexp or stemming. "none" will still expand case and
|
||||
* diacritics depending on the casesens and diacsens flags.
|
||||
* regexp or stemming. "none" may still expand case,
|
||||
* diacritics and synonyms, depending on the casesens, diacsens and
|
||||
* synexp flags.
|
||||
* @param lang sets the stemming language(s). Can be a space-separated list
|
||||
* @param term is the term to expand
|
||||
* @param result is the main output
|
||||
@ -354,14 +355,14 @@ class Db {
|
||||
* in the TermMatchResult header
|
||||
*/
|
||||
// Expansion type for termMatch(): one exclusive kind held in the 3 low
// bits (retrieved with matchTypeTp()), or'ed with optional flag bits.
enum MatchType {
    // Exclusive expansion kinds
    ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
    // Flag bits: diacritics sensitivity, case sensitivity, synonym expansion
    ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32
};
|
||||
/** Extract the expansion kind (ET_NONE, ET_WILD, ET_REGEXP or ET_STEM)
 *  from a combined kind/flags value, dropping the flag bits. */
int matchTypeTp(int tp)
{
    // The kinds fit in the 3 low bits, the flag values start at 8.
    const int kindmask = 7;
    return tp & kindmask;
}
|
||||
bool termMatch(int typ_sens, const string &lang, const string &term,
|
||||
TermMatchResult& result, int max = -1,
|
||||
const string& field = cstr_null);
|
||||
TermMatchResult& result, int max = -1,
|
||||
const string& field = "", vector<string> *multiwords = 0);
|
||||
bool dbStats(DbStats& stats);
|
||||
/** Return min and max years for doc mod times in db */
|
||||
bool maxYearSpan(int *minyear, int *maxyear);
|
||||
|
||||
@ -164,7 +164,8 @@ static const char *tmtptostr(int typ)
|
||||
// using the main index terms (filtering, retrieving stats, expansion
|
||||
// in some cases).
|
||||
bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||
TermMatchResult& res, int max, const string& field)
|
||||
TermMatchResult& res, int max, const string& field,
|
||||
vector<string>* multiwords)
|
||||
{
|
||||
int matchtyp = matchTypeTp(typ_sens);
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
@ -256,7 +257,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||
synac.synExpand(term, lexp);
|
||||
}
|
||||
|
||||
if (matchTypeTp(typ_sens) == ET_STEM) {
|
||||
if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
@ -266,45 +267,60 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
StemDb sdb(xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(lang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
lexp.clear();
|
||||
if (matchtyp == ET_STEM) {
|
||||
StemDb sdb(xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(lang, *it, exp1);
|
||||
}
|
||||
exp1.swap(lexp);
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
LOGDEB(("ExpTerm: stemexp: %s\n",
|
||||
stringsToString(lexp).c_str()));
|
||||
}
|
||||
|
||||
// Expand the result for synonyms. Multi-word synonyms
// (e.g. stakhanovist -> "hard at work") are not added to the
// term list: when the caller supplies a multiwords vector,
// they are returned there so that they can be added to the
// query as phrases.
|
||||
if (m_syngroups.ok()) {
|
||||
if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
|
||||
LOGDEB(("ExpTerm: got syngroups\n"));
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
vector<string> exp1(lexp);
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
vector<string> sg = m_syngroups.getgroup(*it);
|
||||
if (!sg.empty()) {
|
||||
LOGDEB(("ExpTerm: syns: %s -> %s\n",
|
||||
it->c_str(), stringsToString(sg).c_str()));
|
||||
lexp.insert(lexp.end(), sg.begin(), sg.end());
|
||||
// Dispatch each member of the synonym group: single words join
// the term expansion, multi-word entries are handed back
// separately (if the caller asked for them) for phrase processing.
for (vector<string>::const_iterator it1 = sg.begin();
     it1 != sg.end(); it1++) {
    if (it1->find_first_of(" ") != string::npos) {
        if (multiwords)
            multiwords->push_back(*it1);
    } else {
        // Push the synonym (*it1), not the term being expanded
        // (*it), which is already in exp1.
        exp1.push_back(*it1);
    }
}
|
||||
}
|
||||
}
|
||||
lexp.swap(exp1);
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
// Keep result in exp1 for next step
|
||||
exp1.swap(lexp);
|
||||
}
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
synac.synExpand(*it, exp1);
|
||||
}
|
||||
exp1.swap(lexp);
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
}
|
||||
|
||||
@ -224,9 +224,10 @@ private:
|
||||
|
||||
class SearchDataClause {
|
||||
public:
|
||||
// Clause modifier flags. Each value is a single bit, tested with '&'.
enum Modifier {
    SDCM_NONE = 0,
    SDCM_NOSTEMMING = 0x1,
    SDCM_ANCHORSTART = 0x2,
    SDCM_ANCHOREND = 0x4,
    SDCM_CASESENS = 0x8,
    SDCM_DIACSENS = 0x10,
    SDCM_NOTERMS = 0x20, // Don't include terms for highlighting
    SDCM_NOSYNS = 0x40   // Don't perform synonym expansion
};
|
||||
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
|
||||
|
||||
@ -382,7 +383,8 @@ protected:
|
||||
bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods,
|
||||
const std::string& term,
|
||||
std::vector<std::string>& exp,
|
||||
std::string& sterm, const std::string& prefix);
|
||||
std::string& sterm, const std::string& prefix,
|
||||
std::vector<std::string>* multiwords = 0);
|
||||
// After splitting entry on whitespace: process non-phrase element
|
||||
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
|
||||
int mods, void *pq);
|
||||
|
||||
@ -379,17 +379,6 @@ private:
|
||||
};
|
||||
|
||||
|
||||
#if 1
|
||||
static void listVector(const string& what, const vector<string>&l)
|
||||
{
|
||||
string a;
|
||||
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
||||
a = a + *it + " ";
|
||||
}
|
||||
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
||||
* diacritics...
|
||||
*
|
||||
@ -400,12 +389,16 @@ static void listVector(const string& what, const vector<string>&l)
|
||||
* @param prefix field prefix in index. We could recompute it, but the caller
|
||||
* has it already. Used in the simple case where there is nothing to expand,
|
||||
* and we just return the prefixed term (else Db::termMatch deals with it).
|
||||
* @param multiwords it may happen that synonym processing results in multi-word
|
||||
* expansions which should be processed as phrases.
|
||||
*/
|
||||
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
string& ermsg, int mods,
|
||||
const string& term,
|
||||
vector<string>& oexp, string &sterm,
|
||||
const string& prefix)
|
||||
const string& prefix,
|
||||
vector<string>* multiwords
|
||||
)
|
||||
{
|
||||
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
||||
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
|
||||
@ -436,13 +429,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
nostemexp = true;
|
||||
}
|
||||
|
||||
// noexpansion can be modified further down by possible case/diac expansion
|
||||
bool noexpansion = nostemexp && !haswild;
|
||||
|
||||
int termmatchsens = 0;
|
||||
|
||||
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
|
||||
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
|
||||
bool synonyms = (mods & SDCM_NOSYNS) == 0;
|
||||
|
||||
// noexpansion can be modified further down by possible case/diac expansion
|
||||
bool noexpansion = nostemexp && !haswild && !synonyms;
|
||||
|
||||
if (o_index_stripchars) {
|
||||
diac_sensitive = case_sensitive = false;
|
||||
@ -480,10 +472,6 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
noexpansion = false;
|
||||
}
|
||||
|
||||
if (case_sensitive)
|
||||
termmatchsens |= Db::ET_CASESENS;
|
||||
if (diac_sensitive)
|
||||
termmatchsens |= Db::ET_DIACSENS;
|
||||
|
||||
if (noexpansion) {
|
||||
oexp.push_back(prefix + term);
|
||||
@ -493,11 +481,19 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
return true;
|
||||
}
|
||||
|
||||
int termmatchsens = 0;
|
||||
if (case_sensitive)
|
||||
termmatchsens |= Db::ET_CASESENS;
|
||||
if (diac_sensitive)
|
||||
termmatchsens |= Db::ET_DIACSENS;
|
||||
if (synonyms)
|
||||
termmatchsens |= Db::ET_SYNEXP;
|
||||
|
||||
Db::MatchType mtyp = haswild ? Db::ET_WILD :
|
||||
nostemexp ? Db::ET_NONE : Db::ET_STEM;
|
||||
TermMatchResult res;
|
||||
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
|
||||
m_field)) {
|
||||
if (!db.termMatch(mtyp | termmatchsens, getStemLang(),
|
||||
term, res, maxexpand, m_field, multiwords)) {
|
||||
// Let it go through
|
||||
}
|
||||
|
||||
@ -560,9 +556,17 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
||||
}
|
||||
}
|
||||
|
||||
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
|
||||
const string& span,
|
||||
int mods, void * pq)
|
||||
/** Prepend prefix to every element of v, in place. */
static void prefix_vector(vector<string>& v, const string& prefix)
{
    // An empty prefix leaves every element unchanged
    if (prefix.empty())
        return;
    for (vector<string>::iterator it = v.begin(); it != v.end(); ++it) {
        // Insert at the front rather than building prefix + *it and
        // assigning the temporary back.
        it->insert(0, prefix);
    }
}
|
||||
|
||||
void SearchDataClauseSimple::
|
||||
processSimpleSpan(Rcl::Db &db, string& ermsg,
|
||||
const string& span,
|
||||
int mods, void * pq)
|
||||
{
|
||||
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
|
||||
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
|
||||
@ -574,11 +578,12 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
|
||||
const FieldTraits *ftp;
|
||||
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
|
||||
if (ftp->noterms)
|
||||
addModifier(SDCM_NOTERMS);
|
||||
addModifier(SDCM_NOTERMS); // Don't add terms to highlight data
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
|
||||
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
|
||||
vector<string> multiwords;
|
||||
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords))
|
||||
return;
|
||||
|
||||
// Set up the highlight data. No prefix should go in there
|
||||
@ -608,6 +613,23 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
|
||||
Xapian::Query(prefix+sterm,
|
||||
original_term_wqf_booster));
|
||||
}
|
||||
|
||||
// Push phrases for the multi-word expansions
|
||||
for (vector<string>::const_iterator mwp = multiwords.begin();
|
||||
mwp != multiwords.end(); mwp++) {
|
||||
vector<string> phr;
|
||||
// We just do a basic split to keep things a bit simpler here
|
||||
// (no textsplit). This means though that no punctuation is
|
||||
// allowed in multi-word synonyms.
|
||||
stringToTokens(*mwp, phr);
|
||||
if (!prefix.empty())
|
||||
prefix_vector(phr, prefix);
|
||||
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
||||
Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||
phr.begin(), phr.end()));
|
||||
m_curcl++;
|
||||
}
|
||||
|
||||
pqueries.push_back(xq);
|
||||
}
|
||||
|
||||
@ -660,8 +682,8 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
||||
vector<string> exp;
|
||||
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
|
||||
return;
|
||||
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
||||
listVector("", exp);
|
||||
LOGDEB0(("ProcessPhraseOrNear: exp size %d, exp: %s\n", exp.size(),
|
||||
stringsToString(exp).c_str()));
|
||||
// groups is used for highlighting, we don't want prefixes in there.
|
||||
vector<string> noprefs;
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
@ -957,8 +979,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
|
||||
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
|
||||
return false;
|
||||
}
|
||||
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
|
||||
listVector("", exp);
|
||||
LOGDEB0(("SDataPath::toNative: exp size %d. Exp: %s\n", exp.size(),
|
||||
stringsToString(exp).c_str()));
|
||||
if (exp.size() == 1)
|
||||
orqueries.push_back(Xapian::Query(exp[0]));
|
||||
else
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user