Avoid multiple expansion of xapian term iterator

This commit is contained in:
Jean-Francois Dockes 2019-02-01 09:07:28 +01:00
parent 17a8d72227
commit 04f3449f99
2 changed files with 21 additions and 17 deletions

View File

@ -18,7 +18,10 @@
#include "autoconfig.h"
#include "expansiondbs.h"
#include <memory>
#include <string>
#include "log.h"
#include "utf8iter.h"
@ -28,7 +31,6 @@
#include "xmacros.h"
#include "rcldb.h"
#include "stemdb.h"
#include "expansiondbs.h"
using namespace std;
@ -41,7 +43,7 @@ namespace Rcl {
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const vector<string>& langs)
{
LOGDEB("StemDb::createExpansionDbs: languages: " << (stringsToString(langs)) << "\n" );
LOGDEB("StemDb::createExpansionDbs: languages: " <<stringsToString(langs) << "\n");
Chrono cron;
// Erase and recreate all the expansion groups
@ -93,32 +95,33 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
// skip the rest one by one.
it.skip_to(wrap_prefix("Z"));
for ( ;it != wdb.allterms_end(); it++) {
if (has_prefix(*it))
const string term{*it};
if (has_prefix(term))
continue;
// Detect and skip CJK terms.
Utf8Iter utfit(*it);
Utf8Iter utfit(term);
if (utfit.eof()) // Empty term?? Seems to happen.
continue;
if (TextSplit::isCJK(*utfit)) {
// LOGDEB("stemskipped: Skipping CJK\n" );
// LOGDEB("stemskipped: Skipping CJK\n");
continue;
}
string lower = *it;
string lower = term;
// If the index is raw, compute the case-folded term which
// is the input to the stem db, and add a synonym from the
// stripped term to the cased and accented one, for accent
// and case expansion at query time
if (!o_index_stripchars) {
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
diacasedb.addSynonym(*it);
unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
diacasedb.addSynonym(term);
}
// Don't apply stemming to terms which don't look like
// natural language words.
if (!Db::isSpellingCandidate(*it)) {
LOGDEB1("createExpansionDbs: skipped: [" << ((*it)) << "]\n" );
if (!Db::isSpellingCandidate(term)) {
LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n");
continue;
}
@ -144,11 +147,11 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR("Db::createStemDb: map build failed: " << (ermsg) << "\n" );
LOGERR("Db::createStemDb: map build failed: " << ermsg << "\n");
return false;
}
LOGDEB("StemDb::createExpansionDbs: done: " << (cron.secs()) << " S\n" );
LOGDEB("StemDb::createExpansionDbs: done: " << cron.secs() << " S\n");
return true;
}

View File

@ -400,9 +400,10 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
if (!is.empty())
it.skip_to(is.c_str());
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
const string ixterm{*it};
// If we're beyond the terms matching the initial
// section, end
if (!is.empty() && (*it).find(is) != 0)
if (!is.empty() && ixterm.find(is) != 0)
break;
// Else try to match the term. The matcher content
@ -411,19 +412,19 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
// the prefix.
string term;
if (!prefix.empty()) {
term = (*it).substr(prefix.length());
term = ixterm.substr(prefix.length());
} else {
if (has_prefix(*it)) {
if (has_prefix(ixterm)) {
continue;
}
term = *it;
term = ixterm;
}
if (matcher && !matcher->match(term))
continue;
res.entries.push_back(
TermMatchEntry(*it, xdb.get_collection_freq(*it),
TermMatchEntry(ixterm, xdb.get_collection_freq(ixterm),
it.get_termfreq()));
// The problem with truncating here is that this is done