From 9949952501f1e1c7c0d63d0fc9904f30df33ba37 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 16 Nov 2012 13:15:58 +0100 Subject: [PATCH] Raw dbs: do not skip diacritics and case expansion for terms which do not look like natural language words (and for which stemming is skipped) --- src/rcldb/expansiondbs.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/rcldb/expansiondbs.cpp b/src/rcldb/expansiondbs.cpp index b3ad3e88..35e35773 100644 --- a/src/rcldb/expansiondbs.cpp +++ b/src/rcldb/expansiondbs.cpp @@ -84,21 +84,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb, for (Xapian::TermIterator it = wdb.allterms_begin(); it != wdb.allterms_end(); it++) { - // Skip terms which don't look like natural language words. - if (!Db::isSpellingCandidate(*it)) { - LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str())); - continue; - } - // Detect and skip CJK terms. - // We're still sending all other multibyte utf-8 chars to - // the stemmer, which is not too well defined for - // xapian<1.0 (very obsolete now), but seems to work - // anyway. There shouldn't be too many in any case because - // accents are stripped at this point. - // The effect of stripping accents on stemming is not good, - // (e.g: in french partimes -> partim, parti^mes -> part) - // but fixing the issue would be complicated. Utf8Iter utfit(*it); if (TextSplit::isCJK(*utfit)) { // LOGDEB(("stemskipped: Skipping CJK\n")); @@ -117,6 +103,13 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb, } #endif + // Don't apply stemming to terms which don't look like + // natural language words. + if (!Db::isSpellingCandidate(*it)) { + LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str())); + continue; + } + // Create stemming synonym for every language. The input is the // lowercase accented term for (unsigned int i = 0; i < langs.size(); i++) {