diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index b382bc3b..07b6529e 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -3344,6 +3344,11 @@ dir:recoll dir:src -dir:utils -dir:common
stemming is off by default for phrases).
+ s can be used to turn off
+ synonym expansion, if a synonyms file is in place (only for
+ &RCL; 1.22 and later).
+
+
o can be used to specify a
"slack" for phrase and proximity searches: the number of
additional terms that may be found between the specified
diff --git a/src/query/wasaparse.ypp b/src/query/wasaparse.ypp
index 50f38422..92fad1a1 100644
--- a/src/query/wasaparse.ypp
+++ b/src/query/wasaparse.ypp
@@ -282,6 +282,11 @@ static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
//cerr << "set slack " << cl->getslack() << " done" << endl;
}
break;
+ case 's':
+ cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
+ break;
+ case 'S':
+ break;
case '.':case '0':case '1':case '2':case '3':case '4':
case '5':case '6':case '7':case '8':case '9':
{
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 70e87dc6..2d679648 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -341,8 +341,9 @@ class Db {
* Stem expansion is performed if lang is not empty
*
* @param typ_sens defines the kind of expansion: none, wildcard,
- * regexp or stemming. "none" will still expand case and
- * diacritics depending on the casesens and diacsens flags.
+ * regexp or stemming. "none" may still expand case,
+ * diacritics and synonyms, depending on the casesens, diacsens and
+ * synexp flags.
* @param lang sets the stemming language(s). Can be a space-separated list
* @param term is the term to expand
* @param result is the main output
@@ -354,14 +355,14 @@ class Db {
* in the TermMatchResult header
*/
enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
- ET_DIACSENS=8, ET_CASESENS=16};
+ ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32};
int matchTypeTp(int tp)
{
return tp & 7;
}
bool termMatch(int typ_sens, const string &lang, const string &term,
- TermMatchResult& result, int max = -1,
- const string& field = cstr_null);
+ TermMatchResult& result, int max = -1,
+ const string& field = "", vector *multiwords = 0);
bool dbStats(DbStats& stats);
/** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear);
diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp
index 8d0e4abd..df52bfac 100644
--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@@ -164,7 +164,8 @@ static const char *tmtptostr(int typ)
// using the main index terms (filtering, retrieving stats, expansion
// in some cases).
bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
- TermMatchResult& res, int max, const string& field)
+ TermMatchResult& res, int max, const string& field,
+ vector* multiwords)
{
int matchtyp = matchTypeTp(typ_sens);
if (!m_ndb || !m_ndb->m_isopen)
@@ -256,7 +257,7 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
synac.synExpand(term, lexp);
}
- if (matchTypeTp(typ_sens) == ET_STEM) {
+ if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
// Need stem expansion. Lowercase the result of accent and case
// expansion for input to stemdb.
for (unsigned int i = 0; i < lexp.size(); i++) {
@@ -266,45 +267,60 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
}
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
- StemDb sdb(xrdb);
- vector exp1;
- for (vector::const_iterator it = lexp.begin();
- it != lexp.end(); it++) {
- sdb.stemExpand(lang, *it, exp1);
- }
- LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
- lexp.clear();
+ if (matchtyp == ET_STEM) {
+ StemDb sdb(xrdb);
+ vector exp1;
+ for (vector::const_iterator it = lexp.begin();
+ it != lexp.end(); it++) {
+ sdb.stemExpand(lang, *it, exp1);
+ }
+ exp1.swap(lexp);
+ sort(lexp.begin(), lexp.end());
+ lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
+ LOGDEB(("ExpTerm: stemexp: %s\n",
+ stringsToString(lexp).c_str()));
+ }
+
// Expand the result for synonyms. Note that doing it here
// means that multi-term synonyms will not work
// (e.g. stakhanovist -> "hard at work". We would have to
// separate the multi-word expansions for our caller to
// add them as phrases to the query. Not impossible, but
// let's keep it at single words for now.
- if (m_syngroups.ok()) {
+ if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
LOGDEB(("ExpTerm: got syngroups\n"));
- for (vector::const_iterator it = exp1.begin();
- it != exp1.end(); it++) {
+ vector exp1(lexp);
+ for (vector::const_iterator it = lexp.begin();
+ it != lexp.end(); it++) {
vector sg = m_syngroups.getgroup(*it);
if (!sg.empty()) {
LOGDEB(("ExpTerm: syns: %s -> %s\n",
it->c_str(), stringsToString(sg).c_str()));
- lexp.insert(lexp.end(), sg.begin(), sg.end());
+ for (vector::const_iterator it1 = sg.begin();
+ it1 != sg.end(); it1++) {
+ if (it1->find_first_of(" ") != string::npos) { // multi-word synonym: cannot be a single index term
+ if (multiwords) // optional output: handed back to the caller instead of being expanded here
+ multiwords->push_back(*it1);
+ } else {
+ exp1.push_back(*it1); // add the synonym itself (*it1), not the source term (*it) which exp1 already holds
+ }
+ }
}
}
+ lexp.swap(exp1);
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
- // Keep result in exp1 for next step
- exp1.swap(lexp);
}
// Expand the resulting list for case (all stemdb content
// is lowercase)
- lexp.clear();
- for (vector::const_iterator it = exp1.begin();
- it != exp1.end(); it++) {
- synac.synExpand(*it, lexp);
+ vector exp1;
+ for (vector::const_iterator it = lexp.begin();
+ it != lexp.end(); it++) {
+ synac.synExpand(*it, exp1);
}
+ exp1.swap(lexp);
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
}
diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h
index 2e938d04..df478a80 100644
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@@ -224,9 +224,10 @@ private:
class SearchDataClause {
public:
- enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
- SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16,
- SDCM_NOTERMS=32 // Don't include terms for highlighting
+ enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=0x1, SDCM_ANCHORSTART=0x2,
+ SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10,
+ SDCM_NOTERMS=0x20, // Don't include terms for highlighting
+ SDCM_NOSYNS = 0x40, // Don't perform synonym expansion
};
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
@@ -382,7 +383,8 @@ protected:
bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods,
const std::string& term,
std::vector& exp,
- std::string& sterm, const std::string& prefix);
+ std::string& sterm, const std::string& prefix,
+ std::vector* multiwords = 0);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
int mods, void *pq);
diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp
index 543d55e5..0da1d770 100644
--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@@ -379,17 +379,6 @@ private:
};
-#if 1
-static void listVector(const string& what, const vector&l)
-{
- string a;
- for (vector::const_iterator it = l.begin(); it != l.end(); it++) {
- a = a + *it + " ";
- }
- LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
-}
-#endif
-
/** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics...
*
@@ -400,12 +389,16 @@ static void listVector(const string& what, const vector&l)
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it).
+ * @param multiwords if not null, receives the multi-word synonym expansions,
+ * which the caller should process as phrases.
*/
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
string& ermsg, int mods,
const string& term,
vector& oexp, string &sterm,
- const string& prefix)
+ const string& prefix,
+ vector* multiwords
+ )
{
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
@@ -436,13 +429,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
nostemexp = true;
}
- // noexpansion can be modified further down by possible case/diac expansion
- bool noexpansion = nostemexp && !haswild;
-
- int termmatchsens = 0;
-
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
+ bool synonyms = (mods & SDCM_NOSYNS) == 0;
+
+ // noexpansion can be modified further down by possible case/diac expansion
+ bool noexpansion = nostemexp && !haswild && !synonyms;
if (o_index_stripchars) {
diac_sensitive = case_sensitive = false;
@@ -480,10 +472,6 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
noexpansion = false;
}
- if (case_sensitive)
- termmatchsens |= Db::ET_CASESENS;
- if (diac_sensitive)
- termmatchsens |= Db::ET_DIACSENS;
if (noexpansion) {
oexp.push_back(prefix + term);
@@ -493,11 +481,19 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
return true;
}
+ int termmatchsens = 0;
+ if (case_sensitive)
+ termmatchsens |= Db::ET_CASESENS;
+ if (diac_sensitive)
+ termmatchsens |= Db::ET_DIACSENS;
+ if (synonyms)
+ termmatchsens |= Db::ET_SYNEXP;
+
Db::MatchType mtyp = haswild ? Db::ET_WILD :
nostemexp ? Db::ET_NONE : Db::ET_STEM;
TermMatchResult res;
- if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
- m_field)) {
+ if (!db.termMatch(mtyp | termmatchsens, getStemLang(),
+ term, res, maxexpand, m_field, multiwords)) {
// Let it go through
}
@@ -560,9 +556,17 @@ void multiply_groups(vector >::const_iterator vvit,
}
}
-void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
- const string& span,
- int mods, void * pq)
+static void prefix_vector(vector& v, const string& prefix) // prepend prefix to every element of v, in place
+{
+ for (size_t idx = 0; idx < v.size(); ++idx) {
+ v[idx] = prefix + v[idx];
+ }
+}
+
+void SearchDataClauseSimple::
+processSimpleSpan(Rcl::Db &db, string& ermsg,
+ const string& span,
+ int mods, void * pq)
{
vector& pqueries(*(vector*)pq);
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
@@ -574,11 +578,12 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
if (ftp->noterms)
- addModifier(SDCM_NOTERMS);
+ addModifier(SDCM_NOTERMS); // Don't add terms to highlight data
prefix = wrap_prefix(ftp->pfx);
}
- if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
+ vector multiwords;
+ if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords))
return;
// Set up the highlight data. No prefix should go in there
@@ -608,6 +613,23 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
Xapian::Query(prefix+sterm,
original_term_wqf_booster));
}
+
+ // Push phrases for the multi-word expansions
+ for (vector::const_iterator mwp = multiwords.begin();
+ mwp != multiwords.end(); mwp++) {
+ vector phr;
+ // We just do a basic split to keep things a bit simpler here
+ // (no textsplit). This means though that no punctuation is
+ // allowed in multi-word synonyms.
+ stringToTokens(*mwp, phr);
+ if (!prefix.empty())
+ prefix_vector(phr, prefix);
+ xq = Xapian::Query(Xapian::Query::OP_OR, xq,
+ Xapian::Query(Xapian::Query::OP_PHRASE,
+ phr.begin(), phr.end()));
+ m_curcl++;
+ }
+
pqueries.push_back(xq);
}
@@ -660,8 +682,8 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
vector exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
return;
- LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
- listVector("", exp);
+ LOGDEB0(("ProcessPhraseOrNear: exp size %d, exp: %s\n", exp.size(),
+ stringsToString(exp).c_str()));
// groups is used for highlighting, we don't want prefixes in there.
vector noprefs;
for (vector::const_iterator it = exp.begin();
@@ -957,8 +979,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
return false;
}
- LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
- listVector("", exp);
+ LOGDEB0(("SDataPath::toNative: exp size %d. Exp: %s\n", exp.size(),
+ stringsToString(exp).c_str()));
if (exp.size() == 1)
orqueries.push_back(Xapian::Query(exp[0]));
else