Raw dbs: do not skip diacritics and case expansion for terms which do not look like natural language words (and for which stemming is skipped)
This commit is contained in:
parent
96d4bc4f41
commit
9949952501
@ -84,21 +84,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
for (Xapian::TermIterator it = wdb.allterms_begin();
|
||||
it != wdb.allterms_end(); it++) {
|
||||
|
||||
// Skip terms which don't look like natural language words.
|
||||
if (!Db::isSpellingCandidate(*it)) {
|
||||
LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Detect and skip CJK terms.
|
||||
// We're still sending all other multibyte utf-8 chars to
|
||||
// the stemmer, which is not too well defined for
|
||||
// xapian<1.0 (very obsolete now), but seems to work
|
||||
// anyway. There shouldn't be too many in any case because
|
||||
// accents are stripped at this point.
|
||||
// The effect of stripping accents on stemming is not good,
|
||||
// (e.g: in french partimes -> partim, parti^mes -> part)
|
||||
// but fixing the issue would be complicated.
|
||||
Utf8Iter utfit(*it);
|
||||
if (TextSplit::isCJK(*utfit)) {
|
||||
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
||||
@ -117,6 +103,13 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
}
|
||||
#endif
|
||||
|
||||
// Don't apply stemming to terms which don't look like
|
||||
// natural language words.
|
||||
if (!Db::isSpellingCandidate(*it)) {
|
||||
LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create stemming synonym for every language. The input is the
|
||||
// lowercase accented term
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user