Support multi-word synonyms and add a modifier to turn off synonym expansion

This commit is contained in:
Jean-Francois Dockes 2015-08-23 12:15:52 +02:00
parent 766a34a8db
commit e7a669b668
6 changed files with 112 additions and 61 deletions

View File

@ -3344,6 +3344,11 @@ dir:recoll dir:src -dir:utils -dir:common
stemming is off by default for phrases).</para>
</listitem>
<listitem><para><literal>s</literal> can be used to turn off
synonym expansion, if a synonyms file is in place (only for
&RCL; 1.22 and later).</para>
</listitem>
<listitem><para><literal>o</literal> can be used to specify a
"slack" for phrase and proximity searches: the number of
additional terms that may be found between the specified

View File

@ -282,6 +282,11 @@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
//cerr << "set slack " << cl->getslack() << " done" << endl;
}
break;
case 's':
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
break;
case 'S':
break;
case '.':case '0':case '1':case '2':case '3':case '4':
case '5':case '6':case '7':case '8':case '9':
{

View File

@ -341,8 +341,9 @@ class Db {
* Stem expansion is performed if lang is not empty
*
* @param typ_sens defines the kind of expansion: none, wildcard,
* regexp or stemming. "none" will still expand case and
* diacritics depending on the casesens and diacsens flags.
* regexp or stemming. "none" may still expand case,
* diacritics and synonyms, depending on the casesens, diacsens and
* synexp flags.
* @param lang sets the stemming language(s). Can be a space-separated list
* @param term is the term to expand
* @param result is the main output
@ -354,14 +355,14 @@ class Db {
* in the TermMatchResult header
*/
enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
ET_DIACSENS=8, ET_CASESENS=16};
ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32};
/** Extract the base expansion type (ET_NONE, ET_WILD, ET_REGEXP or ET_STEM)
 * from a combined type/flags value by keeping only its 3 low bits. The
 * flag bits (values 8 and above) are masked out.
 * @param tp combined match type and flags value
 * @return tp with everything but the 3 low bits cleared
 */
int matchTypeTp(int tp)
{
    const int basetypemask = 7;
    return tp & basetypemask;
}
bool termMatch(int typ_sens, const string &lang, const string &term,
TermMatchResult& result, int max = -1,
const string& field = cstr_null);
TermMatchResult& result, int max = -1,
const string& field = "", vector<string> *multiwords = 0);
bool dbStats(DbStats& stats);
/** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear);

View File

@ -164,7 +164,8 @@ static const char *tmtptostr(int typ)
// using the main index terms (filtering, retrieving stats, expansion
// in some cases).
bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
TermMatchResult& res, int max, const string& field)
TermMatchResult& res, int max, const string& field,
vector<string>* multiwords)
{
int matchtyp = matchTypeTp(typ_sens);
if (!m_ndb || !m_ndb->m_isopen)
@ -256,7 +257,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
synac.synExpand(term, lexp);
}
if (matchTypeTp(typ_sens) == ET_STEM) {
if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
// Need stem expansion. Lowercase the result of accent and case
// expansion for input to stemdb.
for (unsigned int i = 0; i < lexp.size(); i++) {
@ -266,45 +267,60 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
}
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
StemDb sdb(xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
sdb.stemExpand(lang, *it, exp1);
}
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
lexp.clear();
if (matchtyp == ET_STEM) {
StemDb sdb(xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
sdb.stemExpand(lang, *it, exp1);
}
exp1.swap(lexp);
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
LOGDEB(("ExpTerm: stemexp: %s\n",
stringsToString(lexp).c_str()));
}
// Expand the result for synonyms. Multi-word synonyms
// (e.g. stakhanovist -> "hard at work") cannot be used as
// single terms: they are separated from the single-word
// expansions and returned through 'multiwords', so that our
// caller can add them as phrases to the query.
if (m_syngroups.ok()) {
if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
LOGDEB(("ExpTerm: got syngroups\n"));
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
vector<string> exp1(lexp);
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
vector<string> sg = m_syngroups.getgroup(*it);
if (!sg.empty()) {
LOGDEB(("ExpTerm: syns: %s -> %s\n",
it->c_str(), stringsToString(sg).c_str()));
lexp.insert(lexp.end(), sg.begin(), sg.end());
// Dispatch each synonym of the current term: multi-word synonyms
// cannot be used as single index terms, so they are handed back to
// the caller through 'multiwords' (when supplied); single-word
// synonyms are added to the expansion list.
for (vector<string>::const_iterator it1 = sg.begin();
     it1 != sg.end(); it1++) {
    if (it1->find_first_of(" ") != string::npos) {
        if (multiwords)
            multiwords->push_back(*it1);
    } else {
        // Push the synonym itself (*it1), not the term being
        // expanded (*it), which exp1 already holds.
        exp1.push_back(*it1);
    }
}
}
}
lexp.swap(exp1);
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
// Keep result in exp1 for next step
exp1.swap(lexp);
}
// Expand the resulting list for case (all stemdb content
// is lowercase)
lexp.clear();
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
synac.synExpand(*it, lexp);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
synac.synExpand(*it, exp1);
}
exp1.swap(lexp);
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
}

View File

@ -224,9 +224,10 @@ private:
class SearchDataClause {
public:
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16,
SDCM_NOTERMS=32 // Don't include terms for highlighting
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=0x1, SDCM_ANCHORSTART=0x2,
SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10,
SDCM_NOTERMS=0x20, // Don't include terms for highlighting
SDCM_NOSYNS = 0x40, // Don't perform synonym expansion
};
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
@ -382,7 +383,8 @@ protected:
bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods,
const std::string& term,
std::vector<std::string>& exp,
std::string& sterm, const std::string& prefix);
std::string& sterm, const std::string& prefix,
std::vector<std::string>* multiwords = 0);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
int mods, void *pq);

View File

@ -379,17 +379,6 @@ private:
};
#if 1
// Debug helper: emit the elements of 'l' through LOGDEB0 as a single
// line, each element followed by a space, with 'what' as a label.
static void listVector(const string& what, const vector<string>&l)
{
    string joined;
    for (vector<string>::size_type i = 0; i < l.size(); i++) {
        joined += l[i];
        joined += " ";
    }
    LOGDEB0(("%s: %s\n", what.c_str(), joined.c_str()));
}
#endif
/** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics...
*
@ -400,12 +389,16 @@ static void listVector(const string& what, const vector<string>&l)
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it).
* @param multiwords it may happen that synonym processing results in multi-word
* expansions which should be processed as phrases.
*/
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
string& ermsg, int mods,
const string& term,
vector<string>& oexp, string &sterm,
const string& prefix)
const string& prefix,
vector<string>* multiwords
)
{
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
@ -436,13 +429,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
nostemexp = true;
}
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild;
int termmatchsens = 0;
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
bool synonyms = (mods & SDCM_NOSYNS) == 0;
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild && !synonyms;
if (o_index_stripchars) {
diac_sensitive = case_sensitive = false;
@ -480,10 +472,6 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
noexpansion = false;
}
if (case_sensitive)
termmatchsens |= Db::ET_CASESENS;
if (diac_sensitive)
termmatchsens |= Db::ET_DIACSENS;
if (noexpansion) {
oexp.push_back(prefix + term);
@ -493,11 +481,19 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
return true;
}
int termmatchsens = 0;
if (case_sensitive)
termmatchsens |= Db::ET_CASESENS;
if (diac_sensitive)
termmatchsens |= Db::ET_DIACSENS;
if (synonyms)
termmatchsens |= Db::ET_SYNEXP;
Db::MatchType mtyp = haswild ? Db::ET_WILD :
nostemexp ? Db::ET_NONE : Db::ET_STEM;
TermMatchResult res;
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
m_field)) {
if (!db.termMatch(mtyp | termmatchsens, getStemLang(),
term, res, maxexpand, m_field, multiwords)) {
// Let it go through
}
@ -560,9 +556,17 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
}
}
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
const string& span,
int mods, void * pq)
// Prepend 'prefix' to every element of 'v', in place. An empty vector is
// left untouched.
static void prefix_vector(vector<string>& v, const string& prefix)
{
    for (vector<string>::size_type idx = 0; idx < v.size(); ++idx) {
        v[idx] = prefix + v[idx];
    }
}
void SearchDataClauseSimple::
processSimpleSpan(Rcl::Db &db, string& ermsg,
const string& span,
int mods, void * pq)
{
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
@ -574,11 +578,12 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
if (ftp->noterms)
addModifier(SDCM_NOTERMS);
addModifier(SDCM_NOTERMS); // Don't add terms to highlight data
prefix = wrap_prefix(ftp->pfx);
}
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
vector<string> multiwords;
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords))
return;
// Set up the highlight data. No prefix should go in there
@ -608,6 +613,23 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
Xapian::Query(prefix+sterm,
original_term_wqf_booster));
}
// Push phrases for the multi-word expansions
for (vector<string>::const_iterator mwp = multiwords.begin();
mwp != multiwords.end(); mwp++) {
vector<string> phr;
// We just do a basic split to keep things a bit simpler here
// (no textsplit). This means though that no punctuation is
// allowed in multi-word synonyms.
stringToTokens(*mwp, phr);
if (!prefix.empty())
prefix_vector(phr, prefix);
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(Xapian::Query::OP_PHRASE,
phr.begin(), phr.end()));
m_curcl++;
}
pqueries.push_back(xq);
}
@ -660,8 +682,8 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
return;
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
listVector("", exp);
LOGDEB0(("ProcessPhraseOrNear: exp size %d, exp: %s\n", exp.size(),
stringsToString(exp).c_str()));
// groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin();
@ -957,8 +979,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
return false;
}
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
listVector("", exp);
LOGDEB0(("SDataPath::toNative: exp size %d. Exp: %s\n", exp.size(),
stringsToString(exp).c_str()));
if (exp.size() == 1)
orqueries.push_back(Xapian::Query(exp[0]));
else