Support multi-word synonyms and add a modifier to turn off synonym expansion

This commit is contained in:
Jean-Francois Dockes 2015-08-23 12:15:52 +02:00
parent 766a34a8db
commit e7a669b668
6 changed files with 112 additions and 61 deletions

View File

@ -3344,6 +3344,11 @@ dir:recoll dir:src -dir:utils -dir:common
stemming is off by default for phrases).</para> stemming is off by default for phrases).</para>
</listitem> </listitem>
<listitem><para><literal>s</literal> can be used to turn off
synonym expansion, if a synonyms file is in place (only for
&RCL; 1.22 and later).</para>
</listitem>
<listitem><para><literal>o</literal> can be used to specify a <listitem><para><literal>o</literal> can be used to specify a
"slack" for phrase and proximity searches: the number of "slack" for phrase and proximity searches: the number of
additional terms that may be found between the specified additional terms that may be found between the specified

View File

@ -282,6 +282,11 @@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
//cerr << "set slack " << cl->getslack() << " done" << endl; //cerr << "set slack " << cl->getslack() << " done" << endl;
} }
break; break;
case 's':
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
break;
case 'S':
break;
case '.':case '0':case '1':case '2':case '3':case '4': case '.':case '0':case '1':case '2':case '3':case '4':
case '5':case '6':case '7':case '8':case '9': case '5':case '6':case '7':case '8':case '9':
{ {

View File

@ -341,8 +341,9 @@ class Db {
* Stem expansion is performed if lang is not empty * Stem expansion is performed if lang is not empty
* *
* @param typ_sens defines the kind of expansion: none, wildcard, * @param typ_sens defines the kind of expansion: none, wildcard,
* regexp or stemming. "none" will still expand case and * regexp or stemming. "none" may still expand case,
* diacritics depending on the casesens and diacsens flags. * diacritics and synonyms, depending on the casesens, diacsens and
* synexp flags.
* @param lang sets the stemming language(s). Can be a space-separated list * @param lang sets the stemming language(s). Can be a space-separated list
* @param term is the term to expand * @param term is the term to expand
* @param result is the main output * @param result is the main output
@ -354,14 +355,14 @@ class Db {
* in the TermMatchResult header * in the TermMatchResult header
*/ */
enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3, enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
ET_DIACSENS=8, ET_CASESENS=16}; ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32};
int matchTypeTp(int tp) int matchTypeTp(int tp)
{ {
return tp & 7; return tp & 7;
} }
bool termMatch(int typ_sens, const string &lang, const string &term, bool termMatch(int typ_sens, const string &lang, const string &term,
TermMatchResult& result, int max = -1, TermMatchResult& result, int max = -1,
const string& field = cstr_null); const string& field = "", vector<string> *multiwords = 0);
bool dbStats(DbStats& stats); bool dbStats(DbStats& stats);
/** Return min and max years for doc mod times in db */ /** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear); bool maxYearSpan(int *minyear, int *maxyear);

View File

@ -164,7 +164,8 @@ static const char *tmtptostr(int typ)
// using the main index terms (filtering, retrieving stats, expansion // using the main index terms (filtering, retrieving stats, expansion
// in some cases). // in some cases).
bool Db::termMatch(int typ_sens, const string &lang, const string &_term, bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
TermMatchResult& res, int max, const string& field) TermMatchResult& res, int max, const string& field,
vector<string>* multiwords)
{ {
int matchtyp = matchTypeTp(typ_sens); int matchtyp = matchTypeTp(typ_sens);
if (!m_ndb || !m_ndb->m_isopen) if (!m_ndb || !m_ndb->m_isopen)
@ -256,7 +257,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
synac.synExpand(term, lexp); synac.synExpand(term, lexp);
} }
if (matchTypeTp(typ_sens) == ET_STEM) { if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
// Need stem expansion. Lowercase the result of accent and case // Need stem expansion. Lowercase the result of accent and case
// expansion for input to stemdb. // expansion for input to stemdb.
for (unsigned int i = 0; i < lexp.size(); i++) { for (unsigned int i = 0; i < lexp.size(); i++) {
@ -266,45 +267,60 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
} }
sort(lexp.begin(), lexp.end()); sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
StemDb sdb(xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
sdb.stemExpand(lang, *it, exp1);
}
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
lexp.clear(); if (matchtyp == ET_STEM) {
StemDb sdb(xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
sdb.stemExpand(lang, *it, exp1);
}
exp1.swap(lexp);
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
LOGDEB(("ExpTerm: stemexp: %s\n",
stringsToString(lexp).c_str()));
}
// Expand the result for synonyms. Note that doing it here // Expand the result for synonyms. Note that doing it here
// means that multi-term synonyms will not work // means that multi-term synonyms will not work
// (e.g. stakhanovist -> "hard at work". We would have to // (e.g. stakhanovist -> "hard at work". We would have to
// separate the multi-word expansions for our caller to // separate the multi-word expansions for our caller to
// add them as phrases to the query. Not impossible, but // add them as phrases to the query. Not impossible, but
// let's keep it at single words for now. // let's keep it at single words for now.
if (m_syngroups.ok()) { if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
LOGDEB(("ExpTerm: got syngroups\n")); LOGDEB(("ExpTerm: got syngroups\n"));
for (vector<string>::const_iterator it = exp1.begin(); vector<string> exp1(lexp);
it != exp1.end(); it++) { for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
vector<string> sg = m_syngroups.getgroup(*it); vector<string> sg = m_syngroups.getgroup(*it);
if (!sg.empty()) { if (!sg.empty()) {
LOGDEB(("ExpTerm: syns: %s -> %s\n", LOGDEB(("ExpTerm: syns: %s -> %s\n",
it->c_str(), stringsToString(sg).c_str())); it->c_str(), stringsToString(sg).c_str()));
lexp.insert(lexp.end(), sg.begin(), sg.end()); for (vector<string>::const_iterator it1 = sg.begin();
it1 != sg.end(); it1++) {
if (it1->find_first_of(" ") != string::npos) {
if (multiwords)
multiwords->push_back(*it1);
} else {
exp1.push_back(*it1);
}
}
} }
} }
lexp.swap(exp1);
sort(lexp.begin(), lexp.end()); sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
// Keep result in exp1 for next step
exp1.swap(lexp);
} }
// Expand the resulting list for case (all stemdb content // Expand the resulting list for case (all stemdb content
// is lowercase) // is lowercase)
lexp.clear(); vector<string> exp1;
for (vector<string>::const_iterator it = exp1.begin(); for (vector<string>::const_iterator it = lexp.begin();
it != exp1.end(); it++) { it != lexp.end(); it++) {
synac.synExpand(*it, lexp); synac.synExpand(*it, exp1);
} }
exp1.swap(lexp);
sort(lexp.begin(), lexp.end()); sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
} }

View File

@ -224,9 +224,10 @@ private:
class SearchDataClause { class SearchDataClause {
public: public:
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=0x1, SDCM_ANCHORSTART=0x2,
SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16, SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10,
SDCM_NOTERMS=32 // Don't include terms for highlighting SDCM_NOTERMS=0x20, // Don't include terms for highlighting
SDCM_NOSYNS = 0x40, // Don't perform synonym expansion
}; };
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE}; enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
@ -382,7 +383,8 @@ protected:
bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods, bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods,
const std::string& term, const std::string& term,
std::vector<std::string>& exp, std::vector<std::string>& exp,
std::string& sterm, const std::string& prefix); std::string& sterm, const std::string& prefix,
std::vector<std::string>* multiwords = 0);
// After splitting entry on whitespace: process non-phrase element // After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span, void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
int mods, void *pq); int mods, void *pq);

View File

@ -379,17 +379,6 @@ private:
}; };
#if 1
static void listVector(const string& what, const vector<string>&l)
{
string a;
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
a = a + *it + " ";
}
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
}
#endif
/** Expand term into term list, using appropriate mode: stem, wildcards, /** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics... * diacritics...
* *
@ -400,12 +389,16 @@ static void listVector(const string& what, const vector<string>&l)
* @param prefix field prefix in index. We could recompute it, but the caller * @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand, * has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it). * and we just return the prefixed term (else Db::termMatch deals with it).
* @param multiwords if not null, receives the multi-word synonym expansions,
* which the caller should process as phrases.
*/ */
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
string& ermsg, int mods, string& ermsg, int mods,
const string& term, const string& term,
vector<string>& oexp, string &sterm, vector<string>& oexp, string &sterm,
const string& prefix) const string& prefix,
vector<string>* multiwords
)
{ {
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n", LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
mods, m_field.c_str(), term.c_str(), getStemLang().c_str())); mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
@ -436,13 +429,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
nostemexp = true; nostemexp = true;
} }
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild;
int termmatchsens = 0;
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0; bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SDCM_CASESENS) != 0; bool case_sensitive = (mods & SDCM_CASESENS) != 0;
bool synonyms = (mods & SDCM_NOSYNS) == 0;
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild && !synonyms;
if (o_index_stripchars) { if (o_index_stripchars) {
diac_sensitive = case_sensitive = false; diac_sensitive = case_sensitive = false;
@ -480,10 +472,6 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
noexpansion = false; noexpansion = false;
} }
if (case_sensitive)
termmatchsens |= Db::ET_CASESENS;
if (diac_sensitive)
termmatchsens |= Db::ET_DIACSENS;
if (noexpansion) { if (noexpansion) {
oexp.push_back(prefix + term); oexp.push_back(prefix + term);
@ -493,11 +481,19 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
return true; return true;
} }
int termmatchsens = 0;
if (case_sensitive)
termmatchsens |= Db::ET_CASESENS;
if (diac_sensitive)
termmatchsens |= Db::ET_DIACSENS;
if (synonyms)
termmatchsens |= Db::ET_SYNEXP;
Db::MatchType mtyp = haswild ? Db::ET_WILD : Db::MatchType mtyp = haswild ? Db::ET_WILD :
nostemexp ? Db::ET_NONE : Db::ET_STEM; nostemexp ? Db::ET_NONE : Db::ET_STEM;
TermMatchResult res; TermMatchResult res;
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand, if (!db.termMatch(mtyp | termmatchsens, getStemLang(),
m_field)) { term, res, maxexpand, m_field, multiwords)) {
// Let it go through // Let it go through
} }
@ -560,9 +556,17 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
} }
} }
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, static void prefix_vector(vector<string>& v, const string& prefix)
const string& span, {
int mods, void * pq) for (vector<string>::iterator it = v.begin(); it != v.end(); it++) {
*it = prefix + *it;
}
}
void SearchDataClauseSimple::
processSimpleSpan(Rcl::Db &db, string& ermsg,
const string& span,
int mods, void * pq)
{ {
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq); vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n", LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
@ -574,11 +578,12 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
const FieldTraits *ftp; const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
if (ftp->noterms) if (ftp->noterms)
addModifier(SDCM_NOTERMS); addModifier(SDCM_NOTERMS); // Don't add terms to highlight data
prefix = wrap_prefix(ftp->pfx); prefix = wrap_prefix(ftp->pfx);
} }
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix)) vector<string> multiwords;
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords))
return; return;
// Set up the highlight data. No prefix should go in there // Set up the highlight data. No prefix should go in there
@ -608,6 +613,23 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
Xapian::Query(prefix+sterm, Xapian::Query(prefix+sterm,
original_term_wqf_booster)); original_term_wqf_booster));
} }
// Push phrases for the multi-word expansions
for (vector<string>::const_iterator mwp = multiwords.begin();
mwp != multiwords.end(); mwp++) {
vector<string> phr;
// We just do a basic split to keep things a bit simpler here
// (no textsplit). This means though that no punctuation is
// allowed in multi-word synonyms.
stringToTokens(*mwp, phr);
if (!prefix.empty())
prefix_vector(phr, prefix);
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(Xapian::Query::OP_PHRASE,
phr.begin(), phr.end()));
m_curcl++;
}
pqueries.push_back(xq); pqueries.push_back(xq);
} }
@ -660,8 +682,8 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
vector<string> exp; vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
return; return;
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); LOGDEB0(("ProcessPhraseOrNear: exp size %d, exp: %s\n", exp.size(),
listVector("", exp); stringsToString(exp).c_str()));
// groups is used for highlighting, we don't want prefixes in there. // groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs; vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin(); for (vector<string>::const_iterator it = exp.begin();
@ -957,8 +979,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) { *pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
return false; return false;
} }
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size())); LOGDEB0(("SDataPath::toNative: exp size %d. Exp: %s\n", exp.size(),
listVector("", exp); stringsToString(exp).c_str()));
if (exp.size() == 1) if (exp.size() == 1)
orqueries.push_back(Xapian::Query(exp[0])); orqueries.push_back(Xapian::Query(exp[0]));
else else