From e7a669b668470e5099513ee2c443a2a271abc58d Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 23 Aug 2015 12:15:52 +0200 Subject: [PATCH] Support multi-word synonyms and add modifier to turn-off synonyms expansion --- src/doc/user/usermanual.xml | 5 +++ src/query/wasaparse.ypp | 5 +++ src/rcldb/rcldb.h | 11 ++--- src/rcldb/rclterms.cpp | 56 +++++++++++++++--------- src/rcldb/searchdata.h | 10 +++-- src/rcldb/searchdatatox.cpp | 86 +++++++++++++++++++++++-------------- 6 files changed, 112 insertions(+), 61 deletions(-) diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index b382bc3b..07b6529e 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -3344,6 +3344,11 @@ dir:recoll dir:src -dir:utils -dir:common stemming is off by default for phrases). + s can be used to turn off + synonym expansion, if a synonyms file is in place (only for + &RCL; 1.22 and later). + + o can be used to specify a "slack" for phrase and proximity searches: the number of additional terms that may be found between the specified diff --git a/src/query/wasaparse.ypp b/src/query/wasaparse.ypp index 50f38422..92fad1a1 100644 --- a/src/query/wasaparse.ypp +++ b/src/query/wasaparse.ypp @@ -282,6 +282,11 @@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals) //cerr << "set slack " << cl->getslack() << " done" << endl; } break; + case 's': + cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS); + break; + case 'S': + break; case '.':case '0':case '1':case '2':case '3':case '4': case '5':case '6':case '7':case '8':case '9': { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 70e87dc6..2d679648 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -341,8 +341,9 @@ class Db { * Stem expansion is performed if lang is not empty * * @param typ_sens defines the kind of expansion: none, wildcard, - * regexp or stemming. "none" will still expand case and - * diacritics depending on the casesens and diacsens flags. 
+ * regexp or stemming. "none" may still expand case, + * diacritics and synonyms, depending on the casesens, diacsens and + * synexp flags. * @param lang sets the stemming language(s). Can be a space-separated list * @param term is the term to expand * @param result is the main output @@ -354,14 +355,14 @@ class Db { * in the TermMatchResult header */ enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3, - ET_DIACSENS=8, ET_CASESENS=16}; + ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32}; int matchTypeTp(int tp) { return tp & 7; } bool termMatch(int typ_sens, const string &lang, const string &term, - TermMatchResult& result, int max = -1, - const string& field = cstr_null); + TermMatchResult& result, int max = -1, + const string& field = "", vector *multiwords = 0); bool dbStats(DbStats& stats); /** Return min and max years for doc mod times in db */ bool maxYearSpan(int *minyear, int *maxyear); diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp index 8d0e4abd..df52bfac 100644 --- a/src/rcldb/rclterms.cpp +++ b/src/rcldb/rclterms.cpp @@ -164,7 +164,8 @@ static const char *tmtptostr(int typ) // using the main index terms (filtering, retrieving stats, expansion // in some cases). bool Db::termMatch(int typ_sens, const string &lang, const string &_term, - TermMatchResult& res, int max, const string& field) + TermMatchResult& res, int max, const string& field, + vector* multiwords) { int matchtyp = matchTypeTp(typ_sens); if (!m_ndb || !m_ndb->m_isopen) @@ -256,7 +257,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term, synac.synExpand(term, lexp); } - if (matchTypeTp(typ_sens) == ET_STEM) { + if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) { // Need stem expansion. Lowercase the result of accent and case // expansion for input to stemdb. 
for (unsigned int i = 0; i < lexp.size(); i++) { @@ -266,45 +267,60 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term, } sort(lexp.begin(), lexp.end()); lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); - StemDb sdb(xrdb); - vector exp1; - for (vector::const_iterator it = lexp.begin(); - it != lexp.end(); it++) { - sdb.stemExpand(lang, *it, exp1); - } - LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str())); - lexp.clear(); + if (matchtyp == ET_STEM) { + StemDb sdb(xrdb); + vector exp1; + for (vector::const_iterator it = lexp.begin(); + it != lexp.end(); it++) { + sdb.stemExpand(lang, *it, exp1); + } + exp1.swap(lexp); + sort(lexp.begin(), lexp.end()); + lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); + LOGDEB(("ExpTerm: stemexp: %s\n", + stringsToString(lexp).c_str())); + } + // Expand the result for synonyms. Note that doing it here // means that multi-term synonyms will not work // (e.g. stakhanovist -> "hard at work". We would have to // separate the multi-word expansions for our caller to // add them as phrases to the query. Not impossible, but // let's keep it at single words for now. 
+ if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) { LOGDEB(("ExpTerm: got syngroups\n")); - for (vector::const_iterator it = exp1.begin(); - it != exp1.end(); it++) { + vector exp1(lexp); + for (vector::const_iterator it = lexp.begin(); + it != lexp.end(); it++) { vector sg = m_syngroups.getgroup(*it); if (!sg.empty()) { LOGDEB(("ExpTerm: syns: %s -> %s\n", it->c_str(), stringsToString(sg).c_str())); - lexp.insert(lexp.end(), sg.begin(), sg.end()); + for (vector::const_iterator it1 = sg.begin(); + it1 != sg.end(); it1++) { + if (it1->find_first_of(" ") != string::npos) { + if (multiwords) + multiwords->push_back(*it1); + } else { + exp1.push_back(*it1); // add the synonym, not the source term + } + } } } + lexp.swap(exp1); sort(lexp.begin(), lexp.end()); lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); - // Keep result in exp1 for next step - exp1.swap(lexp); } // Expand the resulting list for case (all stemdb content // is lowercase) - lexp.clear(); - for (vector::const_iterator it = exp1.begin(); - it != exp1.end(); it++) { - synac.synExpand(*it, lexp); + vector exp1; + for (vector::const_iterator it = lexp.begin(); + it != lexp.end(); it++) { + synac.synExpand(*it, exp1); } + exp1.swap(lexp); sort(lexp.begin(), lexp.end()); lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); } diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 2e938d04..df478a80 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -224,9 +224,10 @@ private: class SearchDataClause { public: - enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, - SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16, - SDCM_NOTERMS=32 // Don't include terms for highlighting + enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=0x1, SDCM_ANCHORSTART=0x2, + SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10, + SDCM_NOTERMS=0x20, // Don't include terms for highlighting + SDCM_NOSYNS = 0x40, // Don't perform synonym expansion }; enum Relation {REL_CONTAINS, REL_EQUALS, 
REL_LT, REL_LTE, REL_GT, REL_GTE}; @@ -382,7 +383,8 @@ protected: bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods, const std::string& term, std::vector& exp, - std::string& sterm, const std::string& prefix); + std::string& sterm, const std::string& prefix, + std::vector* multiwords = 0); // After splitting entry on whitespace: process non-phrase element void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span, int mods, void *pq); diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index 543d55e5..0da1d770 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -379,17 +379,6 @@ private: }; -#if 1 -static void listVector(const string& what, const vector&l) -{ - string a; - for (vector::const_iterator it = l.begin(); it != l.end(); it++) { - a = a + *it + " "; - } - LOGDEB0(("%s: %s\n", what.c_str(), a.c_str())); -} -#endif - /** Expand term into term list, using appropriate mode: stem, wildcards, * diacritics... * @@ -400,12 +389,16 @@ static void listVector(const string& what, const vector&l) * @param prefix field prefix in index. We could recompute it, but the caller * has it already. Used in the simple case where there is nothing to expand, * and we just return the prefixed term (else Db::termMatch deals with it). + * @param multiwords it may happen that synonym processing results in multi-word + * expansions which should be processed as phrases. 
*/ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, string& ermsg, int mods, const string& term, vector& oexp, string &sterm, - const string& prefix) + const string& prefix, + vector* multiwords + ) { LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n", mods, m_field.c_str(), term.c_str(), getStemLang().c_str())); @@ -436,13 +429,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, nostemexp = true; } - // noexpansion can be modified further down by possible case/diac expansion - bool noexpansion = nostemexp && !haswild; - - int termmatchsens = 0; - bool diac_sensitive = (mods & SDCM_DIACSENS) != 0; bool case_sensitive = (mods & SDCM_CASESENS) != 0; + bool synonyms = (mods & SDCM_NOSYNS) == 0; + + // noexpansion can be modified further down by possible case/diac expansion + bool noexpansion = nostemexp && !haswild && !synonyms; if (o_index_stripchars) { diac_sensitive = case_sensitive = false; @@ -480,10 +472,6 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, noexpansion = false; } - if (case_sensitive) - termmatchsens |= Db::ET_CASESENS; - if (diac_sensitive) - termmatchsens |= Db::ET_DIACSENS; if (noexpansion) { oexp.push_back(prefix + term); @@ -493,11 +481,19 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, return true; } + int termmatchsens = 0; + if (case_sensitive) + termmatchsens |= Db::ET_CASESENS; + if (diac_sensitive) + termmatchsens |= Db::ET_DIACSENS; + if (synonyms) + termmatchsens |= Db::ET_SYNEXP; + Db::MatchType mtyp = haswild ? Db::ET_WILD : nostemexp ? 
Db::ET_NONE : Db::ET_STEM; TermMatchResult res; - if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand, - m_field)) { + if (!db.termMatch(mtyp | termmatchsens, getStemLang(), + term, res, maxexpand, m_field, multiwords)) { // Let it go through } @@ -560,9 +556,17 @@ void multiply_groups(vector >::const_iterator vvit, } } -void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, - const string& span, - int mods, void * pq) +static void prefix_vector(vector& v, const string& prefix) +{ + for (vector::iterator it = v.begin(); it != v.end(); it++) { + *it = prefix + *it; + } +} + +void SearchDataClauseSimple:: +processSimpleSpan(Rcl::Db &db, string& ermsg, + const string& span, + int mods, void * pq) { vector& pqueries(*(vector*)pq); LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n", @@ -574,11 +578,12 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, const FieldTraits *ftp; if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { if (ftp->noterms) - addModifier(SDCM_NOTERMS); + addModifier(SDCM_NOTERMS); // Don't add terms to highlight data prefix = wrap_prefix(ftp->pfx); } - if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix)) + vector multiwords; + if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords)) return; // Set up the highlight data. No prefix should go in there @@ -608,6 +613,23 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, Xapian::Query(prefix+sterm, original_term_wqf_booster)); } + + // Push phrases for the multi-word expansions + for (vector::const_iterator mwp = multiwords.begin(); + mwp != multiwords.end(); mwp++) { + vector phr; + // We just do a basic split to keep things a bit simpler here + // (no textsplit). This means though that no punctuation is + // allowed in multi-word synonyms. 
+ stringToTokens(*mwp, phr); + if (!prefix.empty()) + prefix_vector(phr, prefix); + xq = Xapian::Query(Xapian::Query::OP_OR, xq, + Xapian::Query(Xapian::Query::OP_PHRASE, + phr.begin(), phr.end())); + m_curcl++; + } + pqueries.push_back(xq); } @@ -660,8 +682,8 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, vector exp; if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) return; - LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); - listVector("", exp); + LOGDEB0(("ProcessPhraseOrNear: exp size %d, exp: %s\n", exp.size(), + stringsToString(exp).c_str())); // groups is used for highlighting, we don't want prefixes in there. vector noprefs; for (vector::const_iterator it = exp.begin(); @@ -957,8 +979,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) *pit, exp, sterm, wrap_prefix(pathelt_prefix))) { return false; } - LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size())); - listVector("", exp); + LOGDEB0(("SDataPath::toNative: exp size %d. Exp: %s\n", exp.size(), + stringsToString(exp).c_str())); if (exp.size() == 1) orqueries.push_back(Xapian::Query(exp[0])); else