perform case/diac expansion when processing wildcards

This commit is contained in:
Jean-Francois Dockes 2013-01-04 13:34:26 +01:00
parent e1276f5d98
commit 9b55eb1cda
3 changed files with 197 additions and 88 deletions

View File

@ -545,7 +545,7 @@ static void listVector(const string& what, const vector<string>&l)
*
* @param mods stem expansion, case and diacritics sensitivity control.
* @param term input single word
* @param exp output expansion list
* @param oexp output expansion list
* @param sterm output original input term if there were no wildcards
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
@ -578,15 +578,15 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
if (!haswild)
m_hldata.uterms.insert(term);
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
// No stem expansion if there are wildcards or if prevented by caller
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
if (haswild || getStemLang().empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true;
}
bool noexpansion = nostemexp && !haswild;
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild;
#ifndef RCL_INDEX_STRIPCHARS
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
@ -637,20 +637,42 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
return true;
}
// Make objects before the goto jungle to avoid compiler complaints
// The case/diac expansion db
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
&unacfoldtrans);
// This will hold the result of case and diacritics expansion as input
// to stem expansion.
vector<string> lexp;
TermMatchResult res;
if (haswild) {
// Note that if there are wildcards, we do a direct from-index
// expansion, which means that we are case- and diacritics-sensitive.
// Nothing would prevent us from expanding from the case/diacritics
// synonyms first. To be done later.
#ifndef RCL_INDEX_STRIPCHARS
if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
// Perform case/diac expansion on the exp as appropriate and
// expand the result.
vector<string> exp;
if (diac_sensitive) {
// Expand for diacritics and case, filtering for same diacritics
SynTermTransUnac foldtrans(UNACOP_FOLD);
synac.keyWildExpand(term, exp, &foldtrans);
} else if (case_sensitive) {
// Expand for diacritics and case, filtering for same case
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.keyWildExpand(term, exp, &unactrans);
} else {
// Expand for diacritics and case, no filtering
synac.keyWildExpand(term, exp);
}
// There are no wildcards in the result from above but
// calling termMatch gets the result into the right form
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
maxexpand, m_field);
}
}
#endif // RCL_INDEX_STRIPCHARS
// Expand the original wildcard expression even if we did the
// case/diac expansion above.
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
maxexpand, m_field);
goto termmatchtoresult;
@ -670,76 +692,61 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
// nostemexp is unset and we just need stem expansion.
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
maxexpand, m_field);
goto termmatchtoresult;
}
// No stem expansion when diacritic or case sensitivity is set, it
// makes no sense (it would mess with the diacritics anyway if
// they are not in the stem part). In these 3 cases, perform
// appropriate expansion from the charstripping db, and do a bogus
// wildcard expansion (there is no wild card) to generate the
// result:
if (diac_sensitive && case_sensitive) {
// No expansion whatsoever.
lexp.push_back(term);
goto exptotermatch;
} else if (diac_sensitive) {
// Expand for accents and case, filtering for same accents,
SynTermTransUnac foldtrans(UNACOP_FOLD);
synac.synExpand(term, lexp, &foldtrans);
goto exptotermatch;
} else if (case_sensitive) {
// Expand for accents and case, filtering for same case
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.synExpand(term, lexp, &unactrans);
goto exptotermatch;
} else {
// We are neither accent- nor case- sensitive and may need stem
// expansion or not. Expand for accents and case
synac.synExpand(term, lexp);
if (nostemexp)
goto exptotermatch;
}
vector<string> lexp;
if (diac_sensitive && case_sensitive) {
// No expansion whatsoever.
lexp.push_back(term);
} else if (diac_sensitive) {
// Expand for accents and case, filtering for same accents,
SynTermTransUnac foldtrans(UNACOP_FOLD);
synac.synExpand(term, lexp, &foldtrans);
} else if (case_sensitive) {
// Expand for accents and case, filtering for same case
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.synExpand(term, lexp, &unactrans);
} else {
// We are neither accent- nor case- sensitive and may need stem
// expansion or not. Expand for accents and case
synac.synExpand(term, lexp);
}
// Need stem expansion. Lowercase the result of accent and case
// expansion for input to stemdb.
for (unsigned int i = 0; i < lexp.size(); i++) {
string lower;
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
lexp[i] = lower;
}
sort(lexp.begin(), lexp.end());
{
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
lexp.resize(uit - lexp.begin());
StemDb sdb(db.m_ndb->xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
if (!nostemexp) {
// Need stem expansion. Lowercase the result of accent and case
// expansion for input to stemdb.
for (unsigned int i = 0; i < lexp.size(); i++) {
string lower;
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
lexp[i] = lower;
}
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
StemDb sdb(db.m_ndb->xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
sdb.stemExpand(getStemLang(), *it, exp1);
}
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
// Expand the resulting list for case (all stemdb content
// is lowercase)
lexp.clear();
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
synac.synExpand(*it, lexp);
}
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
}
// Bogus wildcard expand to generate the result (possibly add prefixes)
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
sdb.stemExpand(getStemLang(), *it, exp1);
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
maxexpand, m_field);
}
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
// Expand the resulting list for case (all stemdb content
// is lowercase)
lexp.clear();
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
synac.synExpand(*it, lexp);
}
sort(lexp.begin(), lexp.end());
uit = unique(lexp.begin(), lexp.end());
lexp.resize(uit - lexp.begin());
}
// Bogus wildcard expand to generate the result (possibly add prefixes)
exptotermatch:
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
maxexpand, m_field);
}
#endif

View File

@ -18,10 +18,13 @@
#include "autoconfig.h"
#include <fnmatch.h>
#include <iostream>
#include <algorithm>
#include "debuglog.h"
#include "cstr.h"
#include "xmacros.h"
#include "synfamily.h"
#include "smallut.h"
@ -140,8 +143,6 @@ bool XapComputableSynFamMember::synExpand(const string& term,
if (filtertrans)
filter_root = (*filtertrans)(term);
/* We could call XapSynFamily::synExpand() here instead of doing it
ourselves... */
string key = m_prefix + root;
LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n",
@ -181,8 +182,99 @@ bool XapComputableSynFamMember::synExpand(const string& term,
return true;
}
bool XapComputableSynFamMember::keyWildExpand(const string& inexp,
vector<string>& result,
SynTermTrans *filtertrans)
{
LOGDEB(("XapCompSynFam::keyWildExpand: [%s]\n", inexp.c_str()));
// Transform input into our key format (e.g.: case-folded + diac-stripped)
string stripped_exp = (*m_trans)(inexp);
// If set, compute filtering term (e.g.: only case-folded)
string filter_exp;
if (filtertrans)
filter_exp = (*filtertrans)(inexp);
// Find the initial section before any special chars
string::size_type es = stripped_exp.find_first_of(cstr_wildSpecStChars);
string is; // Initial section
switch (es) {
case string::npos:
// No special chars, no expansion.
result.push_back(inexp);
return true;
break;
case 0:
// Input starts with special char: start at bottom
is = m_prefix;
break;
default:
// Compute initial section
is = m_prefix + stripped_exp.substr(0, es);
break;
}
// Input to matching: prefix + transformed input
string matchin = m_prefix + stripped_exp;
string::size_type preflen = m_prefix.size();
string ermsg;
try {
for (Xapian::TermIterator xit = m_family.getdb().synonym_keys_begin(is);
xit != m_family.getdb().synonym_keys_end(is); xit++) {
LOGDEB((" Checking1 [%s] against [%s]\n", (*xit).c_str(),
matchin.c_str()));
if (fnmatch(matchin.c_str(), (*xit).c_str(), 0) == FNM_NOMATCH)
continue;
// Push all the synonyms if they match the secondary filter
for (Xapian::TermIterator xit1 =
m_family.getdb().synonyms_begin(*xit);
xit1 != m_family.getdb().synonyms_end(*xit); xit1++) {
string term = *xit1;
if (filtertrans) {
string term1 = (*filtertrans)(term);
LOGDEB((" Testing [%s] against [%s]\n",
term1.c_str(), filter_exp.c_str()));
if (fnmatch(filter_exp.c_str(),
term1.c_str(), 0) == FNM_NOMATCH) {
continue;
}
}
LOGDEB(("XapCompSynFam::keyWildExpand: Pushing %s\n",
(*xit1).c_str()));
result.push_back(*xit1);
}
// Same with key itself
string term = (*xit).substr(preflen);
if (filtertrans) {
string term1 = (*filtertrans)(term);
LOGDEB((" Testing [%s] against [%s]\n",
term1.c_str(), filter_exp.c_str()));
if (fnmatch(filter_exp.c_str(),
term1.c_str(), 0) == FNM_NOMATCH) {
continue;
}
}
LOGDEB(("XapCompSynFam::keyWildExpand: Pushing [%s]\n",
term.c_str()));
result.push_back(term);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("XapCompSynFam::keyWildExpand: error: term [%s]\n",
inexp.c_str()));
result.push_back(inexp);
return false;
}
return true;
}
} // Namespace Rcl
#else // TEST_SYNFAMILY
#include "autoconfig.h"

View File

@ -134,10 +134,17 @@ public:
}
/** Expand a term to its list of synonyms. If filtertrans is set we
* keep only the results which transform to the same value as the input */
* keep only the results which transform to the same value as the input
* This is used for example for filtering the result of case+diac
* expansion when only either case or diac expansion is desired.
*/
bool synExpand(const std::string& term, std::vector<std::string>& result,
SynTermTrans *filtertrans = 0);
/** Expand a wildcard expression to the matching keys and their synonyms.
 *
 * The input is transformed to the key format, then matched with fnmatch()
 * against the synonym keys of this member. For each matching key, its
 * synonyms and the key itself (without the member prefix) are appended
 * to the result.
 *
 * @param in input expression. If its transformed form holds no wildcard
 *   special characters, it is appended to the result unchanged.
 * @param result output list; terms are appended to the existing content.
 * @param filtertrans if set, only the terms whose transformed value
 *   matches the transformed input expression are kept.
 * @return false if an error was caught while walking the synonym data
 *   (the input is then appended to the result), else true.
 */
bool keyWildExpand(const std::string& in, std::vector<std::string>& result,
SynTermTrans *filtertrans = 0);
private:
XapSynFamily m_family;
std::string m_membername;
@ -199,15 +206,18 @@ private:
//
// Prefixes are centrally defined here to avoid collisions
//
// Stem expansion family prefix. The family member name is the
// language ("all" for Dia and Cse)
// Lowercase accented stem to expansion
// Lowercase accented stem to expansion. Family member name: language
static const std::string synFamStem("Stm");
// Lowercase unaccented stem to expansion
// Lowercase unaccented stem to expansion. Family member name: language
static const std::string synFamStemUnac("StU");
// Lowercase unaccented term to case and accent variations
// Lowercase unaccented term to case and accent variations. Only one
// member, named "all". This set is used for separate case/diac
// expansion by post-filtering the results of dual expansion.
static const std::string synFamDiCa("DCa");
}
} // end namespace Rcl
#endif /* _SYNFAMILY_H_INCLUDED_ */