Avoid multiple expansion of Xapian term iterator
This commit is contained in:
parent
17a8d72227
commit
04f3449f99
@ -18,7 +18,10 @@
|
|||||||
|
|
||||||
#include "autoconfig.h"
|
#include "autoconfig.h"
|
||||||
|
|
||||||
|
#include "expansiondbs.h"
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
@ -28,7 +31,6 @@
|
|||||||
#include "xmacros.h"
|
#include "xmacros.h"
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
#include "stemdb.h"
|
#include "stemdb.h"
|
||||||
#include "expansiondbs.h"
|
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -41,7 +43,7 @@ namespace Rcl {
|
|||||||
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
const vector<string>& langs)
|
const vector<string>& langs)
|
||||||
{
|
{
|
||||||
LOGDEB("StemDb::createExpansionDbs: languages: " << (stringsToString(langs)) << "\n" );
|
LOGDEB("StemDb::createExpansionDbs: languages: " <<stringsToString(langs) << "\n");
|
||||||
Chrono cron;
|
Chrono cron;
|
||||||
|
|
||||||
// Erase and recreate all the expansion groups
|
// Erase and recreate all the expansion groups
|
||||||
@ -93,32 +95,33 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||||||
// skip the rest one by one.
|
// skip the rest one by one.
|
||||||
it.skip_to(wrap_prefix("Z"));
|
it.skip_to(wrap_prefix("Z"));
|
||||||
for ( ;it != wdb.allterms_end(); it++) {
|
for ( ;it != wdb.allterms_end(); it++) {
|
||||||
if (has_prefix(*it))
|
const string term{*it};
|
||||||
|
if (has_prefix(term))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// Detect and skip CJK terms.
|
// Detect and skip CJK terms.
|
||||||
Utf8Iter utfit(*it);
|
Utf8Iter utfit(term);
|
||||||
if (utfit.eof()) // Empty term?? Seems to happen.
|
if (utfit.eof()) // Empty term?? Seems to happen.
|
||||||
continue;
|
continue;
|
||||||
if (TextSplit::isCJK(*utfit)) {
|
if (TextSplit::isCJK(*utfit)) {
|
||||||
// LOGDEB("stemskipped: Skipping CJK\n" );
|
// LOGDEB("stemskipped: Skipping CJK\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
string lower = *it;
|
string lower = term;
|
||||||
// If the index is raw, compute the case-folded term which
|
// If the index is raw, compute the case-folded term which
|
||||||
// is the input to the stem db, and add a synonym from the
|
// is the input to the stem db, and add a synonym from the
|
||||||
// stripped term to the cased and accented one, for accent
|
// stripped term to the cased and accented one, for accent
|
||||||
// and case expansion at query time
|
// and case expansion at query time
|
||||||
if (!o_index_stripchars) {
|
if (!o_index_stripchars) {
|
||||||
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
|
||||||
diacasedb.addSynonym(*it);
|
diacasedb.addSynonym(term);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Don't apply stemming to terms which don't look like
|
// Don't apply stemming to terms which don't look like
|
||||||
// natural language words.
|
// natural language words.
|
||||||
if (!Db::isSpellingCandidate(*it)) {
|
if (!Db::isSpellingCandidate(term)) {
|
||||||
LOGDEB1("createExpansionDbs: skipped: [" << ((*it)) << "]\n" );
|
LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -144,11 +147,11 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR("Db::createStemDb: map build failed: " << (ermsg) << "\n" );
|
LOGERR("Db::createStemDb: map build failed: " << ermsg << "\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB("StemDb::createExpansionDbs: done: " << (cron.secs()) << " S\n" );
|
LOGDEB("StemDb::createExpansionDbs: done: " << cron.secs() << " S\n");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -400,9 +400,10 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
|
|||||||
if (!is.empty())
|
if (!is.empty())
|
||||||
it.skip_to(is.c_str());
|
it.skip_to(is.c_str());
|
||||||
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
||||||
|
const string ixterm{*it};
|
||||||
// If we're beyond the terms matching the initial
|
// If we're beyond the terms matching the initial
|
||||||
// section, end
|
// section, end
|
||||||
if (!is.empty() && (*it).find(is) != 0)
|
if (!is.empty() && ixterm.find(is) != 0)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// Else try to match the term. The matcher content
|
// Else try to match the term. The matcher content
|
||||||
@ -411,19 +412,19 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
|
|||||||
// the prefix.
|
// the prefix.
|
||||||
string term;
|
string term;
|
||||||
if (!prefix.empty()) {
|
if (!prefix.empty()) {
|
||||||
term = (*it).substr(prefix.length());
|
term = ixterm.substr(prefix.length());
|
||||||
} else {
|
} else {
|
||||||
if (has_prefix(*it)) {
|
if (has_prefix(ixterm)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
term = *it;
|
term = ixterm;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matcher && !matcher->match(term))
|
if (matcher && !matcher->match(term))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
res.entries.push_back(
|
res.entries.push_back(
|
||||||
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
TermMatchEntry(ixterm, xdb.get_collection_freq(ixterm),
|
||||||
it.get_termfreq()));
|
it.get_termfreq()));
|
||||||
|
|
||||||
// The problem with truncating here is that this is done
|
// The problem with truncating here is that this is done
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user