Avoid multiple expansion of xapian term iterator
This commit is contained in:
parent
17a8d72227
commit
04f3449f99
@ -18,7 +18,10 @@
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "expansiondbs.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "log.h"
|
||||
#include "utf8iter.h"
|
||||
@ -28,7 +31,6 @@
|
||||
#include "xmacros.h"
|
||||
#include "rcldb.h"
|
||||
#include "stemdb.h"
|
||||
#include "expansiondbs.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -41,7 +43,7 @@ namespace Rcl {
|
||||
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
const vector<string>& langs)
|
||||
{
|
||||
LOGDEB("StemDb::createExpansionDbs: languages: " << (stringsToString(langs)) << "\n" );
|
||||
LOGDEB("StemDb::createExpansionDbs: languages: " <<stringsToString(langs) << "\n");
|
||||
Chrono cron;
|
||||
|
||||
// Erase and recreate all the expansion groups
|
||||
@ -93,32 +95,33 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
// skip the rest one by one.
|
||||
it.skip_to(wrap_prefix("Z"));
|
||||
for ( ;it != wdb.allterms_end(); it++) {
|
||||
if (has_prefix(*it))
|
||||
const string term{*it};
|
||||
if (has_prefix(term))
|
||||
continue;
|
||||
|
||||
// Detect and skip CJK terms.
|
||||
Utf8Iter utfit(*it);
|
||||
Utf8Iter utfit(term);
|
||||
if (utfit.eof()) // Empty term?? Seems to happen.
|
||||
continue;
|
||||
if (TextSplit::isCJK(*utfit)) {
|
||||
// LOGDEB("stemskipped: Skipping CJK\n" );
|
||||
// LOGDEB("stemskipped: Skipping CJK\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
string lower = *it;
|
||||
string lower = term;
|
||||
// If the index is raw, compute the case-folded term which
|
||||
// is the input to the stem db, and add a synonym from the
|
||||
// stripped term to the cased and accented one, for accent
|
||||
// and case expansion at query time
|
||||
if (!o_index_stripchars) {
|
||||
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
||||
diacasedb.addSynonym(*it);
|
||||
unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
|
||||
diacasedb.addSynonym(term);
|
||||
}
|
||||
|
||||
// Don't apply stemming to terms which don't look like
|
||||
// natural language words.
|
||||
if (!Db::isSpellingCandidate(*it)) {
|
||||
LOGDEB1("createExpansionDbs: skipped: [" << ((*it)) << "]\n" );
|
||||
if (!Db::isSpellingCandidate(term)) {
|
||||
LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -144,11 +147,11 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR("Db::createStemDb: map build failed: " << (ermsg) << "\n" );
|
||||
LOGERR("Db::createStemDb: map build failed: " << ermsg << "\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGDEB("StemDb::createExpansionDbs: done: " << (cron.secs()) << " S\n" );
|
||||
LOGDEB("StemDb::createExpansionDbs: done: " << cron.secs() << " S\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -400,9 +400,10 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
|
||||
if (!is.empty())
|
||||
it.skip_to(is.c_str());
|
||||
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
||||
const string ixterm{*it};
|
||||
// If we're beyond the terms matching the initial
|
||||
// section, end
|
||||
if (!is.empty() && (*it).find(is) != 0)
|
||||
if (!is.empty() && ixterm.find(is) != 0)
|
||||
break;
|
||||
|
||||
// Else try to match the term. The matcher content
|
||||
@ -411,19 +412,19 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
|
||||
// the prefix.
|
||||
string term;
|
||||
if (!prefix.empty()) {
|
||||
term = (*it).substr(prefix.length());
|
||||
term = ixterm.substr(prefix.length());
|
||||
} else {
|
||||
if (has_prefix(*it)) {
|
||||
if (has_prefix(ixterm)) {
|
||||
continue;
|
||||
}
|
||||
term = *it;
|
||||
term = ixterm;
|
||||
}
|
||||
|
||||
if (matcher && !matcher->match(term))
|
||||
continue;
|
||||
|
||||
res.entries.push_back(
|
||||
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
||||
TermMatchEntry(ixterm, xdb.get_collection_freq(ixterm),
|
||||
it.get_termfreq()));
|
||||
|
||||
// The problem with truncating here is that this is done
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user