From f853f39ef3e9caf9857a1693caafb65bf32bfb90 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Tue, 25 Apr 2017 10:20:38 +0200
Subject: [PATCH] Partially revert change treating Katakana as words, going
 back to n-grams. Did not work well because of separator-less compounds mostly

---
 src/Makefile.am          |  1 -
 src/common/textsplit.cpp | 24 ++++++++++++++++++++++--
 src/qtgui/recoll.pro.in  |  2 +-
 src/rcldb/rcldb.h        | 22 +++++++++++++++++++---
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index e63d56cd..56b49b4e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -32,7 +32,6 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
     $(X_CFLAGS) \
     -DRECOLL_DATADIR=\"${pkgdatadir}\" \
     -D_GNU_SOURCE \
-    -DTESTING_XAPIAN_SPELL \
     $(DEFS)
 
 ACLOCAL_AMFLAGS = -I m4
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 52bc26f2..081e05d4 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -33,6 +33,16 @@
 #include "uproplist.h"
 #include "smallut.h"
 
+// Decide if we treat katakana as western scripts, splitting into
+// words instead of n-grams. This is not absurd (katakana is a kind of
+// alphabet, albeit phonetic and syllabic, and is mostly used to
+// transcribe western words), but it does not work well because
+// Japanese uses separator-less compound katakana words, and because
+// the plural endings are irregular and would need a specialized
+// stemmer. So for now we process katakana like the rest of CJK, using
+// n-grams.
+#undef KATAKANA_AS_WORDS
+
 using namespace std;
 
 /**
@@ -209,11 +219,15 @@ static inline int whatcc(unsigned int c)
 // katakana variants' to something else.  Look up "Kuromoji" Lucene
 // filter, KuromojiNormalizeFilter.java
 // 309F is Hiragana.
+#ifdef KATAKANA_AS_WORDS
 #define UNICODE_IS_KATAKANA(p)                                          \
     ((p) != 0x309F &&                                                   \
      (((p) >= 0x3099 && (p) <= 0x30FF) ||                               \
       ((p) >= 0x31F0 && (p) <= 0x31FF)))
-    
+#else
+#define UNICODE_IS_KATAKANA(p) false
+#endif
+
 bool TextSplit::isCJK(int c)
 {
     return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
@@ -520,6 +534,7 @@ bool TextSplit::text_to_words(const string &in)
 	    LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
 	    return false;
 	}
+
         CharSpanClass csc;
         if (UNICODE_IS_KATAKANA(c)) {
             csc = CSC_KATAKANA;
@@ -528,6 +543,7 @@ bool TextSplit::text_to_words(const string &in)
         } else {
             csc = CSC_OTHER;
         }
+
 	if (o_processCJK && csc == CSC_CJK) {
 	    // CJK excluding Katakana character hit. 
 	    // Do like at EOF with the current non-cjk data.
@@ -548,6 +564,9 @@ bool TextSplit::text_to_words(const string &in)
 		break;
 	}
 
+#ifdef KATAKANA_AS_WORDS
+        // Only needed if we have script transitions inside this
+        // routine, else the call to cjk_to_words does the job.
         if (csc != prev_csc && (m_wordLen || m_span.length())) {
             LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
                    m_wordLen << " spl " << m_span.length() << endl);
@@ -555,8 +574,9 @@ bool TextSplit::text_to_words(const string &in)
                 return false;
             }
         }
+#endif
+
         prev_csc = csc;
-        
 	int cc = whatcc(c);
 
 	switch (cc) {
diff --git a/src/qtgui/recoll.pro.in b/src/qtgui/recoll.pro.in
index e8dc3345..c58105f0 100644
--- a/src/qtgui/recoll.pro.in
+++ b/src/qtgui/recoll.pro.in
@@ -2,7 +2,7 @@ TEMPLATE        = app
 LANGUAGE        = C++
 
 VPATH = @srcdir@
-DEFINES += BUILDING_RECOLL  TESTING_XAPIAN_SPELL
+DEFINES += BUILDING_RECOLL
 
 @QMAKE_ENABLE_WEBKIT@ QT += webkit
 @QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 89a5c360..ebfcfc74 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -200,21 +200,37 @@ class Db {
     vector<string> getStemLangs();
 
     /** Test word for spelling correction candidate: not too long, no 
-	special chars... */
-    static bool isSpellingCandidate(const string& term, bool aspell=true)
+     * special chars...
+     * @param with_aspell if true, test for aspell; else for the Xapian speller
+     */
+    static bool isSpellingCandidate(const string& term, bool with_aspell=true)
     {
 	if (term.empty() || term.length() > 50)
 	    return false;
 	if (has_prefix(term))
 	    return false;
 	Utf8Iter u8i(term);
-        if (aspell) {
+        if (with_aspell) {
+            // If spelling with aspell, neither katakana nor other cjk
+            // scripts are candidates
             if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
                 return false;
         } else {
+#ifdef TESTING_XAPIAN_SPELL
+            // The Xapian speller (purely proximity-based) can be used
+            // for Katakana (when split as words, which is not always
+            // completely feasible because of separator-less
+            // compounds). Currently we don't try to use the Xapian
+            // speller with other scripts, for which it would be usable
+            // in the absence of aspell (it would indeed be better
+            // than nothing with e.g. European languages). This would
+            // require a few more config variables, maybe one day.
             if (!TextSplit::isKATAKANA(*u8i)) {
                 return false;
             }
+#else
+            return false;
+#endif
         }
 	if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") 
 	    != string::npos)