From 9949952501f1e1c7c0d63d0fc9904f30df33ba37 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Fri, 16 Nov 2012 13:15:58 +0100
Subject: [PATCH] Raw dbs: do not skip diacritics and case expansion for terms
 which do not look like natural language words (and for which stemming is
 skipped)

---
 src/rcldb/expansiondbs.cpp | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/rcldb/expansiondbs.cpp b/src/rcldb/expansiondbs.cpp
index b3ad3e88..35e35773 100644
--- a/src/rcldb/expansiondbs.cpp
+++ b/src/rcldb/expansiondbs.cpp
@@ -84,21 +84,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
         for (Xapian::TermIterator it = wdb.allterms_begin(); 
 	     it != wdb.allterms_end(); it++) {
 
-	    // Skip terms which don't look like natural language words.
-            if (!Db::isSpellingCandidate(*it)) {
-                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
-                continue;
-            }
-
 	    // Detect and skip CJK terms.
-	    // We're still sending all other multibyte utf-8 chars to
-            // the stemmer, which is not too well defined for
-            // xapian<1.0 (very obsolete now), but seems to work
-            // anyway. There shouldn't be too many in any case because
-            // accents are stripped at this point. 
-	    // The effect of stripping accents on stemming is not good, 
-            // (e.g: in french partimes -> partim, parti^mes -> part)
-	    // but fixing the issue would be complicated.
 	    Utf8Iter utfit(*it);
 	    if (TextSplit::isCJK(*utfit)) {
 		// LOGDEB(("stemskipped: Skipping CJK\n"));
@@ -117,6 +103,13 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
 	    }
 #endif
 
+	    // Don't apply stemming to terms which don't look like
+	    // natural language words.
+            if (!Db::isSpellingCandidate(*it)) {
+                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
+                continue;
+            }
+
 	    // Create stemming synonym for every language. The input is the 
 	    // lowercase accented term
 	    for (unsigned int i = 0; i < langs.size(); i++) {