diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 3cebd245..98e42e6f 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -545,7 +545,7 @@ static void listVector(const string& what, const vector&l) * * @param mods stem expansion, case and diacritics sensitivity control. * @param term input single word - * @param exp output expansion list + * @param oexp output expansion list * @param sterm output original input term if there were no wildcards * @param prefix field prefix in index. We could recompute it, but the caller * has it already. Used in the simple case where there is nothing to expand, @@ -578,15 +578,15 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, if (!haswild) m_hldata.uterms.insert(term); - bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0; - // No stem expansion if there are wildcards or if prevented by caller + bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0; if (haswild || getStemLang().empty()) { LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n")); nostemexp = true; } - bool noexpansion = nostemexp && !haswild; + // noexpansion can be modified further down by possible case/diac expansion + bool noexpansion = nostemexp && !haswild; #ifndef RCL_INDEX_STRIPCHARS bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0; @@ -637,20 +637,42 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, return true; } - // Make objects before the goto jungle to avoid compiler complaints + // The case/diac expansion db SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD); XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all", &unacfoldtrans); - // This will hold the result of case and diacritics expansion as input - // to stem expansion. - vector lexp; - TermMatchResult res; + if (haswild) { - // Note that if there are wildcards, we do a direct from-index - // expansion, which means that we are casediac-sensitive. 
There - // would be nothing to prevent us to expand from the casediac - // synonyms first. To be done later +#ifndef RCL_INDEX_STRIPCHARS + if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) { + // Perform case/diac expansion on the expression as appropriate and + // expand the result. + vector exp; + if (diac_sensitive) { + // Expand for diacritics and case, filtering for same diacritics + SynTermTransUnac foldtrans(UNACOP_FOLD); + synac.keyWildExpand(term, exp, &foldtrans); + } else if (case_sensitive) { + // Expand for diacritics and case, filtering for same case + SynTermTransUnac unactrans(UNACOP_UNAC); + synac.keyWildExpand(term, exp, &unactrans); + } else { + // Expand for diacritics and case, no filtering + synac.keyWildExpand(term, exp); + } + // There are no wildcards in the result from above but + // calling termMatch gets the result into the right form + for (vector::const_iterator it = exp.begin(); + it != exp.end(); it++) { + db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res, + maxexpand, m_field); + } + } +#endif // RCL_INDEX_STRIPCHARS + + // Expand the original wildcard expression even if we did the + // case/diac dance above. db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res, maxexpand, m_field); goto termmatchtoresult; @@ -670,76 +692,61 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, // nostemexp is unset and we just need stem expansion. db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res, maxexpand, m_field); - goto termmatchtoresult; - } - - // No stem expansion when diacritic or case sensitivity is set, it - // makes no sense (it would mess with the diacritics anyway if - // they are not in the stem part). In these 3 cases, perform - // appropriate expansion from the charstripping db, and do a bogus - // wildcard expansion (there is no wild card) to generate the - // result: - - if (diac_sensitive && case_sensitive) { - // No expansion whatsoever. 
- lexp.push_back(term); - goto exptotermatch; - } else if (diac_sensitive) { - // Expand for accents and case, filtering for same accents, - SynTermTransUnac foldtrans(UNACOP_FOLD); - synac.synExpand(term, lexp, &foldtrans); - goto exptotermatch; - } else if (case_sensitive) { - // Expand for accents and case, filtering for same case - SynTermTransUnac unactrans(UNACOP_UNAC); - synac.synExpand(term, lexp, &unactrans); - goto exptotermatch; } else { - // We are neither accent- nor case- sensitive and may need stem - // expansion or not. Expand for accents and case - synac.synExpand(term, lexp); - if (nostemexp) - goto exptotermatch; - } + vector lexp; + if (diac_sensitive && case_sensitive) { + // No expansion whatsoever. + lexp.push_back(term); + } else if (diac_sensitive) { + // Expand for accents and case, filtering for same accents, + SynTermTransUnac foldtrans(UNACOP_FOLD); + synac.synExpand(term, lexp, &foldtrans); + } else if (case_sensitive) { + // Expand for accents and case, filtering for same case + SynTermTransUnac unactrans(UNACOP_UNAC); + synac.synExpand(term, lexp, &unactrans); + } else { + // We are neither accent- nor case- sensitive and may need stem + // expansion or not. Expand for accents and case + synac.synExpand(term, lexp); + } - // Need stem expansion. Lowercase the result of accent and case - // expansion for input to stemdb. - for (unsigned int i = 0; i < lexp.size(); i++) { - string lower; - unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD); - lexp[i] = lower; - } - sort(lexp.begin(), lexp.end()); - { - vector::iterator uit = unique(lexp.begin(), lexp.end()); - lexp.resize(uit - lexp.begin()); - StemDb sdb(db.m_ndb->xrdb); - vector exp1; - for (vector::const_iterator it = lexp.begin(); + if (!nostemexp) { + // Need stem expansion. Lowercase the result of accent and case + // expansion for input to stemdb. 
+ for (unsigned int i = 0; i < lexp.size(); i++) { + string lower; + unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD); + lexp[i] = lower; + } + sort(lexp.begin(), lexp.end()); + lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); + StemDb sdb(db.m_ndb->xrdb); + vector exp1; + for (vector::const_iterator it = lexp.begin(); + it != lexp.end(); it++) { + sdb.stemExpand(getStemLang(), *it, exp1); + } + LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str())); + + // Expand the resulting list for case (all stemdb content + // is lowercase) + lexp.clear(); + for (vector::const_iterator it = exp1.begin(); + it != exp1.end(); it++) { + synac.synExpand(*it, lexp); + } + sort(lexp.begin(), lexp.end()); + lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end()); + } + + // Bogus wildcard expand to generate the result (possibly add prefixes) + LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str())); + for (vector::const_iterator it = lexp.begin(); it != lexp.end(); it++) { - sdb.stemExpand(getStemLang(), *it, exp1); + db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res, + maxexpand, m_field); } - LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str())); - - // Expand the resulting list for case (all stemdb content - // is lowercase) - lexp.clear(); - for (vector::const_iterator it = exp1.begin(); - it != exp1.end(); it++) { - synac.synExpand(*it, lexp); - } - sort(lexp.begin(), lexp.end()); - uit = unique(lexp.begin(), lexp.end()); - lexp.resize(uit - lexp.begin()); - } - - // Bogus wildcard expand to generate the result (possibly add prefixes) -exptotermatch: - LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str())); - for (vector::const_iterator it = lexp.begin(); - it != lexp.end(); it++) { - db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res, - maxexpand, m_field); } #endif diff --git a/src/rcldb/synfamily.cpp b/src/rcldb/synfamily.cpp index 9e857342..060f9628 100644 --- a/src/rcldb/synfamily.cpp +++ 
b/src/rcldb/synfamily.cpp @@ -18,10 +18,13 @@ #include "autoconfig.h" +#include + #include #include #include "debuglog.h" +#include "cstr.h" #include "xmacros.h" #include "synfamily.h" #include "smallut.h" @@ -140,8 +143,6 @@ bool XapComputableSynFamMember::synExpand(const string& term, if (filtertrans) filter_root = (*filtertrans)(term); - /* We could call XapSynFamily::synExpand() here instead of doing it - ourselves... */ string key = m_prefix + root; LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n", @@ -181,8 +182,99 @@ bool XapComputableSynFamMember::synExpand(const string& term, return true; } + +bool XapComputableSynFamMember::keyWildExpand(const string& inexp, + vector& result, + SynTermTrans *filtertrans) +{ + LOGDEB(("XapCompSynFam::keyWildExpand: [%s]\n", inexp.c_str())); + + // Transform input into our key format (e.g.: case-folded + diac-stripped) + string stripped_exp = (*m_trans)(inexp); + + // If set, compute filtering term (e.g.: only case-folded) + string filter_exp; + if (filtertrans) + filter_exp = (*filtertrans)(inexp); + + // Find the initial section before any special chars + string::size_type es = stripped_exp.find_first_of(cstr_wildSpecStChars); + string is; // Initial section + switch (es) { + case string::npos: + // No special chars, no expansion. 
+ result.push_back(inexp); + return true; + break; + case 0: + // Input starts with a special char: no literal initial section, scan from the bare prefix + is = m_prefix; + break; + default: + // Literal initial section (text before the first special char), used to start the key scan + is = m_prefix + stripped_exp.substr(0, es); + break; + } + + // Input to matching: prefix + transformed input + string matchin = m_prefix + stripped_exp; + string::size_type preflen = m_prefix.size(); + + string ermsg; + try { + for (Xapian::TermIterator xit = m_family.getdb().synonym_keys_begin(is); + xit != m_family.getdb().synonym_keys_end(is); xit++) { + LOGDEB((" Checking1 [%s] against [%s]\n", (*xit).c_str(), + matchin.c_str())); + if (fnmatch(matchin.c_str(), (*xit).c_str(), 0) == FNM_NOMATCH) + continue; + + // Push all the synonyms if they match the secondary filter + for (Xapian::TermIterator xit1 = + m_family.getdb().synonyms_begin(*xit); + xit1 != m_family.getdb().synonyms_end(*xit); xit1++) { + string term = *xit1; + if (filtertrans) { + string term1 = (*filtertrans)(term); + LOGDEB((" Testing [%s] against [%s]\n", + term1.c_str(), filter_exp.c_str())); + if (fnmatch(filter_exp.c_str(), + term1.c_str(), 0) == FNM_NOMATCH) { + continue; + } + } + LOGDEB(("XapCompSynFam::keyWildExpand: Pushing %s\n", + (*xit1).c_str())); + result.push_back(*xit1); + } + // Same with key itself + string term = (*xit).substr(preflen); + if (filtertrans) { + string term1 = (*filtertrans)(term); + LOGDEB((" Testing [%s] against [%s]\n", + term1.c_str(), filter_exp.c_str())); + if (fnmatch(filter_exp.c_str(), + term1.c_str(), 0) == FNM_NOMATCH) { + continue; + } + } + LOGDEB(("XapCompSynFam::keyWildExpand: Pushing [%s]\n", + term.c_str())); + result.push_back(term); + } + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("XapCompSynFam::keyWildExpand: error: term [%s]\n", + inexp.c_str())); + result.push_back(inexp); + return false; + } + return true; } + +} // Namespace Rcl + #else // TEST_SYNFAMILY #include "autoconfig.h" diff --git a/src/rcldb/synfamily.h b/src/rcldb/synfamily.h index
e203d351..c3689a8e 100644 --- a/src/rcldb/synfamily.h +++ b/src/rcldb/synfamily.h @@ -134,10 +134,17 @@ public: } /** Expand a term to its list of synonyms. If filtertrans is set we - * keep only the results which transform to the same value as the input */ + * keep only the results which transform to the same value as the input. + * This is used for example for filtering the result of case+diac + * expansion when only one of case or diac expansion is desired. + */ bool synExpand(const std::string& term, std::vector& result, SynTermTrans *filtertrans = 0); + /** Expand a wildcard expression: return the keys matching it, and their synonyms. If filtertrans is set, keep only results whose transformed value matches the transformed expression */ + bool keyWildExpand(const std::string& in, std::vector& result, + SynTermTrans *filtertrans = 0); + private: XapSynFamily m_family; std::string m_membername; @@ -199,15 +206,18 @@ private: // // Prefixes are centrally defined here to avoid collisions // -// Stem expansion family prefix. The family member name is the -// language ("all" for Dia and Cse) -// Lowercase accented stem to expansion +// Lowercase accented stem to expansion. Family member name: language static const std::string synFamStem("Stm"); -// Lowercase unaccented stem to expansion + +// Lowercase unaccented stem to expansion. Family member name: language static const std::string synFamStemUnac("StU"); -// Lowercase unaccented term to case and accent variations + +// Lowercase unaccented term to case and accent variations. Only one +// member, named "all". This set is used for separate case/diac +// expansion by post-filtering the results of dual expansion. static const std::string synFamDiCa("DCa"); -} + +} // end namespace Rcl #endif /* _SYNFAMILY_H_INCLUDED_ */