Raw dbs: do not skip diacritics and case expansion for terms which do not look like natural language words (and for which stemming is skipped)

2012-11-16 13:15:58 +01:00 · 2012-11-16 13:15:58 +01:00 · 9949952501
commit 9949952501
parent 96d4bc4f41
1 changed files with 7 additions and 14 deletions
--- a/src/rcldb/expansiondbs.cpp
+++ b/src/rcldb/expansiondbs.cpp
@ -84,21 +84,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
        for (Xapian::TermIterator it = wdb.allterms_begin(); 
 	     it != wdb.allterms_end(); it++) {

-	    // Skip terms which don't look like natural language words.
-            if (!Db::isSpellingCandidate(*it)) {
-                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
-                continue;
-            }
-
 	    // Detect and skip CJK terms.
-	    // We're still sending all other multibyte utf-8 chars to
-            // the stemmer, which is not too well defined for
-            // xapian<1.0 (very obsolete now), but seems to work
-            // anyway. There shouldn't be too many in any case because
-            // accents are stripped at this point. 
-	    // The effect of stripping accents on stemming is not good, 
-            // (e.g: in french partimes -> partim, parti^mes -> part)
-	    // but fixing the issue would be complicated.
 	    Utf8Iter utfit(*it);
 	    if (TextSplit::isCJK(*utfit)) {
 		// LOGDEB(("stemskipped: Skipping CJK\n"));
@ -117,6 +103,13 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
 	    }
 #endif

+	    // Don't apply stemming to terms which don't look like
+	    // natural language words.
+            if (!Db::isSpellingCandidate(*it)) {
+                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
+                continue;
+            }
+
 	    // Create stemming synonym for every language. The input is the 
 	    // lowercase accented term
 	    for (unsigned int i = 0; i < langs.size(); i++) {