Avoid repeated dereferencing of the Xapian term iterator

This commit is contained in:
Jean-Francois Dockes 2019-02-01 09:07:28 +01:00
parent 17a8d72227
commit 04f3449f99
2 changed files with 21 additions and 17 deletions

View File

@ -18,7 +18,10 @@
#include "autoconfig.h" #include "autoconfig.h"
#include "expansiondbs.h"
#include <memory> #include <memory>
#include <string>
#include "log.h" #include "log.h"
#include "utf8iter.h" #include "utf8iter.h"
@ -28,7 +31,6 @@
#include "xmacros.h" #include "xmacros.h"
#include "rcldb.h" #include "rcldb.h"
#include "stemdb.h" #include "stemdb.h"
#include "expansiondbs.h"
using namespace std; using namespace std;
@ -41,7 +43,7 @@ namespace Rcl {
bool createExpansionDbs(Xapian::WritableDatabase& wdb, bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const vector<string>& langs) const vector<string>& langs)
{ {
LOGDEB("StemDb::createExpansionDbs: languages: " << (stringsToString(langs)) << "\n" ); LOGDEB("StemDb::createExpansionDbs: languages: " <<stringsToString(langs) << "\n");
Chrono cron; Chrono cron;
// Erase and recreate all the expansion groups // Erase and recreate all the expansion groups
@ -93,32 +95,33 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
// skip the rest one by one. // skip the rest one by one.
it.skip_to(wrap_prefix("Z")); it.skip_to(wrap_prefix("Z"));
for ( ;it != wdb.allterms_end(); it++) { for ( ;it != wdb.allterms_end(); it++) {
if (has_prefix(*it)) const string term{*it};
if (has_prefix(term))
continue; continue;
// Detect and skip CJK terms. // Detect and skip CJK terms.
Utf8Iter utfit(*it); Utf8Iter utfit(term);
if (utfit.eof()) // Empty term?? Seems to happen. if (utfit.eof()) // Empty term?? Seems to happen.
continue; continue;
if (TextSplit::isCJK(*utfit)) { if (TextSplit::isCJK(*utfit)) {
// LOGDEB("stemskipped: Skipping CJK\n" ); // LOGDEB("stemskipped: Skipping CJK\n");
continue; continue;
} }
string lower = *it; string lower = term;
// If the index is raw, compute the case-folded term which // If the index is raw, compute the case-folded term which
// is the input to the stem db, and add a synonym from the // is the input to the stem db, and add a synonym from the
// stripped term to the cased and accented one, for accent // stripped term to the cased and accented one, for accent
// and case expansion at query time // and case expansion at query time
if (!o_index_stripchars) { if (!o_index_stripchars) {
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD); unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
diacasedb.addSynonym(*it); diacasedb.addSynonym(term);
} }
// Don't apply stemming to terms which don't look like // Don't apply stemming to terms which don't look like
// natural language words. // natural language words.
if (!Db::isSpellingCandidate(*it)) { if (!Db::isSpellingCandidate(term)) {
LOGDEB1("createExpansionDbs: skipped: [" << ((*it)) << "]\n" ); LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n");
continue; continue;
} }
@ -144,11 +147,11 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
} }
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR("Db::createStemDb: map build failed: " << (ermsg) << "\n" ); LOGERR("Db::createStemDb: map build failed: " << ermsg << "\n");
return false; return false;
} }
LOGDEB("StemDb::createExpansionDbs: done: " << (cron.secs()) << " S\n" ); LOGDEB("StemDb::createExpansionDbs: done: " << cron.secs() << " S\n");
return true; return true;
} }

View File

@ -400,9 +400,10 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
if (!is.empty()) if (!is.empty())
it.skip_to(is.c_str()); it.skip_to(is.c_str());
for (int rcnt = 0; it != xdb.allterms_end(); it++) { for (int rcnt = 0; it != xdb.allterms_end(); it++) {
const string ixterm{*it};
// If we're beyond the terms matching the initial // If we're beyond the terms matching the initial
// section, end // section, end
if (!is.empty() && (*it).find(is) != 0) if (!is.empty() && ixterm.find(is) != 0)
break; break;
// Else try to match the term. The matcher content // Else try to match the term. The matcher content
@ -411,19 +412,19 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
// the prefix. // the prefix.
string term; string term;
if (!prefix.empty()) { if (!prefix.empty()) {
term = (*it).substr(prefix.length()); term = ixterm.substr(prefix.length());
} else { } else {
if (has_prefix(*it)) { if (has_prefix(ixterm)) {
continue; continue;
} }
term = *it; term = ixterm;
} }
if (matcher && !matcher->match(term)) if (matcher && !matcher->match(term))
continue; continue;
res.entries.push_back( res.entries.push_back(
TermMatchEntry(*it, xdb.get_collection_freq(*it), TermMatchEntry(ixterm, xdb.get_collection_freq(ixterm),
it.get_termfreq())); it.get_termfreq()));
// The problem with truncating here is that this is done // The problem with truncating here is that this is done