Perform case/diacritics expansion when processing wildcards
This commit is contained in:
parent
e1276f5d98
commit
9b55eb1cda
@ -545,7 +545,7 @@ static void listVector(const string& what, const vector<string>&l)
|
||||
*
|
||||
* @param mods stem expansion, case and diacritics sensitivity control.
|
||||
* @param term input single word
|
||||
* @param exp output expansion list
|
||||
* @param oexp output expansion list
|
||||
* @param sterm output original input term if there were no wildcards
|
||||
* @param prefix field prefix in index. We could recompute it, but the caller
|
||||
* has it already. Used in the simple case where there is nothing to expand,
|
||||
@ -578,15 +578,15 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
if (!haswild)
|
||||
m_hldata.uterms.insert(term);
|
||||
|
||||
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
||||
|
||||
// No stem expansion if there are wildcards or if prevented by caller
|
||||
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
||||
if (haswild || getStemLang().empty()) {
|
||||
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
||||
nostemexp = true;
|
||||
}
|
||||
|
||||
bool noexpansion = nostemexp && !haswild;
|
||||
// noexpansion can be modified further down by possible case/diac expansion
|
||||
bool noexpansion = nostemexp && !haswild;
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
||||
@ -637,20 +637,42 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Make objects before the goto jungle to avoid compiler complaints
|
||||
// The case/diac expansion db
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
|
||||
&unacfoldtrans);
|
||||
// This will hold the result of case and diacritics expansion as input
|
||||
// to stem expansion.
|
||||
vector<string> lexp;
|
||||
|
||||
TermMatchResult res;
|
||||
|
||||
if (haswild) {
|
||||
// Note that if there are wildcards, we do a direct from-index
|
||||
// expansion, which means that we are casediac-sensitive. There
|
||||
// would be nothing to prevent us to expand from the casediac
|
||||
// synonyms first. To be done later
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
|
||||
// Perform case/diac expansion on the exp as appropriate and
|
||||
// expand the result.
|
||||
vector<string> exp;
|
||||
if (diac_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same diacritics
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.keyWildExpand(term, exp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.keyWildExpand(term, exp, &unactrans);
|
||||
} else {
|
||||
// Expand for diacritics and case, no filtering
|
||||
synac.keyWildExpand(term, exp);
|
||||
}
|
||||
// There are no wildcards in the result from above but
|
||||
// calling termMatch gets the result into the right form
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
||||
maxexpand, m_field);
|
||||
}
|
||||
}
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
// Expand the original wildcard expression even if we did the
|
||||
// case/diac dance above,
|
||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
|
||||
maxexpand, m_field);
|
||||
goto termmatchtoresult;
|
||||
@ -670,76 +692,61 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||
// nostemexp is unset and we just need stem expansion.
|
||||
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
||||
maxexpand, m_field);
|
||||
goto termmatchtoresult;
|
||||
}
|
||||
|
||||
// No stem expansion when diacritic or case sensitivity is set, it
|
||||
// makes no sense (it would mess with the diacritics anyway if
|
||||
// they are not in the stem part). In these 3 cases, perform
|
||||
// appropriate expansion from the charstripping db, and do a bogus
|
||||
// wildcard expansion (there is no wild card) to generate the
|
||||
// result:
|
||||
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No expansion whatsoever.
|
||||
lexp.push_back(term);
|
||||
goto exptotermatch;
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
goto exptotermatch;
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
goto exptotermatch;
|
||||
} else {
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not. Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
if (nostemexp)
|
||||
goto exptotermatch;
|
||||
}
|
||||
vector<string> lexp;
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No expansion whatsoever.
|
||||
lexp.push_back(term);
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
} else {
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not. Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
}
|
||||
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
{
|
||||
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
||||
lexp.resize(uit - lexp.begin());
|
||||
StemDb sdb(db.m_ndb->xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
if (!nostemexp) {
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
StemDb sdb(db.m_ndb->xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(getStemLang(), *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
}
|
||||
|
||||
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(getStemLang(), *it, exp1);
|
||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
||||
maxexpand, m_field);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
uit = unique(lexp.begin(), lexp.end());
|
||||
lexp.resize(uit - lexp.begin());
|
||||
}
|
||||
|
||||
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
||||
exptotermatch:
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
||||
maxexpand, m_field);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -18,10 +18,13 @@
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <fnmatch.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "cstr.h"
|
||||
#include "xmacros.h"
|
||||
#include "synfamily.h"
|
||||
#include "smallut.h"
|
||||
@ -140,8 +143,6 @@ bool XapComputableSynFamMember::synExpand(const string& term,
|
||||
if (filtertrans)
|
||||
filter_root = (*filtertrans)(term);
|
||||
|
||||
/* We could call XapSynFamily::synExpand() here instead of doing it
|
||||
ourselves... */
|
||||
string key = m_prefix + root;
|
||||
|
||||
LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n",
|
||||
@ -181,8 +182,99 @@ bool XapComputableSynFamMember::synExpand(const string& term,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool XapComputableSynFamMember::keyWildExpand(const string& inexp,
|
||||
vector<string>& result,
|
||||
SynTermTrans *filtertrans)
|
||||
{
|
||||
LOGDEB(("XapCompSynFam::keyWildExpand: [%s]\n", inexp.c_str()));
|
||||
|
||||
// Transform input into our key format (e.g.: case-folded + diac-stripped)
|
||||
string stripped_exp = (*m_trans)(inexp);
|
||||
|
||||
// If set, compute filtering term (e.g.: only case-folded)
|
||||
string filter_exp;
|
||||
if (filtertrans)
|
||||
filter_exp = (*filtertrans)(inexp);
|
||||
|
||||
// Find the initial section before any special chars
|
||||
string::size_type es = stripped_exp.find_first_of(cstr_wildSpecStChars);
|
||||
string is; // Initial section
|
||||
switch (es) {
|
||||
case string::npos:
|
||||
// No special chars, no expansion.
|
||||
result.push_back(inexp);
|
||||
return true;
|
||||
break;
|
||||
case 0:
|
||||
// Input starts with special char: start at bottom
|
||||
is = m_prefix;
|
||||
break;
|
||||
default:
|
||||
// Compute initial section
|
||||
is = m_prefix + stripped_exp.substr(0, es);
|
||||
break;
|
||||
}
|
||||
|
||||
// Input to matching: prefix + transformed input
|
||||
string matchin = m_prefix + stripped_exp;
|
||||
string::size_type preflen = m_prefix.size();
|
||||
|
||||
string ermsg;
|
||||
try {
|
||||
for (Xapian::TermIterator xit = m_family.getdb().synonym_keys_begin(is);
|
||||
xit != m_family.getdb().synonym_keys_end(is); xit++) {
|
||||
LOGDEB((" Checking1 [%s] against [%s]\n", (*xit).c_str(),
|
||||
matchin.c_str()));
|
||||
if (fnmatch(matchin.c_str(), (*xit).c_str(), 0) == FNM_NOMATCH)
|
||||
continue;
|
||||
|
||||
// Push all the synonyms if they match the secondary filter
|
||||
for (Xapian::TermIterator xit1 =
|
||||
m_family.getdb().synonyms_begin(*xit);
|
||||
xit1 != m_family.getdb().synonyms_end(*xit); xit1++) {
|
||||
string term = *xit1;
|
||||
if (filtertrans) {
|
||||
string term1 = (*filtertrans)(term);
|
||||
LOGDEB((" Testing [%s] against [%s]\n",
|
||||
term1.c_str(), filter_exp.c_str()));
|
||||
if (fnmatch(filter_exp.c_str(),
|
||||
term1.c_str(), 0) == FNM_NOMATCH) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOGDEB(("XapCompSynFam::keyWildExpand: Pushing %s\n",
|
||||
(*xit1).c_str()));
|
||||
result.push_back(*xit1);
|
||||
}
|
||||
// Same with key itself
|
||||
string term = (*xit).substr(preflen);
|
||||
if (filtertrans) {
|
||||
string term1 = (*filtertrans)(term);
|
||||
LOGDEB((" Testing [%s] against [%s]\n",
|
||||
term1.c_str(), filter_exp.c_str()));
|
||||
if (fnmatch(filter_exp.c_str(),
|
||||
term1.c_str(), 0) == FNM_NOMATCH) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOGDEB(("XapCompSynFam::keyWildExpand: Pushing [%s]\n",
|
||||
term.c_str()));
|
||||
result.push_back(term);
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapCompSynFam::keyWildExpand: error: term [%s]\n",
|
||||
inexp.c_str()));
|
||||
result.push_back(inexp);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
} // Namespace Rcl
|
||||
|
||||
#else // TEST_SYNFAMILY
|
||||
#include "autoconfig.h"
|
||||
|
||||
|
||||
@ -134,10 +134,17 @@ public:
|
||||
}
|
||||
|
||||
/** Expand a term to its list of synonyms. If filtertrans is set we
|
||||
* keep only the results which transform to the same value as the input */
|
||||
* keep only the results which transform to the same value as the input
|
||||
* This is used for example for filtering the result of case+diac
|
||||
* expansion when only either case or diac expansion is desired.
|
||||
*/
|
||||
bool synExpand(const std::string& term, std::vector<std::string>& result,
|
||||
SynTermTrans *filtertrans = 0);
|
||||
|
||||
/** Expand key to wildcard/regexp matching keys */
|
||||
bool keyWildExpand(const std::string& in, std::vector<std::string>& result,
|
||||
SynTermTrans *filtertrans = 0);
|
||||
|
||||
private:
|
||||
XapSynFamily m_family;
|
||||
std::string m_membername;
|
||||
@ -199,15 +206,18 @@ private:
|
||||
//
|
||||
// Prefixes are centrally defined here to avoid collisions
|
||||
//
|
||||
// Stem expansion family prefix. The family member name is the
|
||||
// language ("all" for Dia and Cse)
|
||||
|
||||
// Lowercase accented stem to expansion
|
||||
// Lowercase accented stem to expansion. Family member name: language
|
||||
static const std::string synFamStem("Stm");
|
||||
// Lowercase unaccented stem to expansion
|
||||
|
||||
// Lowercase unaccented stem to expansion. Family member name: language
|
||||
static const std::string synFamStemUnac("StU");
|
||||
// Lowercase unaccented term to case and accent variations
|
||||
|
||||
// Lowercase unaccented term to case and accent variations. Only one
|
||||
// member, named "all". This set is used for separate case/diac
|
||||
// expansion by post-filtering the results of dual expansion.
|
||||
static const std::string synFamDiCa("DCa");
|
||||
}
|
||||
|
||||
} // end namespace Rcl
|
||||
|
||||
#endif /* _SYNFAMILY_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user