From 9565663f09ede1b20c628d3cf245f900326778de Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Tue, 14 Apr 2020 09:27:26 +0200
Subject: [PATCH] textsplit: create isNGRAMMED() method to replace isCJK() and
 let the latter actually return what it says

---
 src/common/textsplit.cpp  | 8 ++++++--
 src/common/textsplit.h    | 6 ++++--
 src/rcldb/rclabstract.cpp | 2 +-
 src/rcldb/rcldb.h         | 5 ++---
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 614a2b5f..1c782fca 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -319,8 +319,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 
 bool TextSplit::isCJK(int c)
 {
-    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
-        !UNICODE_IS_HANGUL(c);
+    return UNICODE_IS_CJK(c);
 }
 bool TextSplit::isKATAKANA(int c)
 {
@@ -330,6 +329,11 @@ bool TextSplit::isHANGUL(int c)
 {
     return UNICODE_IS_HANGUL(c);
 }
+bool TextSplit::isNGRAMMED(int c)
+{
+    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
+        !UNICODE_IS_HANGUL(c);
+}
 
 
 // This is used to detect katakana/other transitions, which must
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 3cf7adf3..573b723c 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -89,11 +89,13 @@ public:
     static bool stringToStrings(const std::string &s,
                                 std::vector<std::string> &tokens);
 
-    /** Is char CJK ? (excluding Katakana) */
+    /** Is char CJK? (Katakana and Hangul are no longer excluded) */
     static bool isCJK(int c);
     static bool isKATAKANA(int c);
     static bool isHANGUL(int c);
-
+    /** Not split into words: CJK except Katakana and Hangul */
+    static bool isNGRAMMED(int c);
+    
     /** Statistics about word length (average and dispersion) can
      * detect bad data like undecoded base64 or other mis-identified
      * pieces of data taken as text. In practise, this keeps some junk out 
diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp
index b6fb729a..6f333eba 100644
--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@@ -478,7 +478,7 @@ void Query::Native::abstractCreateSnippetsVector(
         }
         Utf8Iter uit(ent.second);
         bool newcjk = false;
-        if (TextSplit::isCJK(*uit))
+        if (TextSplit::isNGRAMMED(*uit))
             newcjk = true;
         if (!incjk || (incjk && !newcjk))
             chunk += " ";
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 10d446e0..520f516f 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -202,9 +202,8 @@ public:
 
         Utf8Iter u8i(term);
         if (with_aspell) {
-            // If spelling with aspell, neither katakana nor other cjk
-            // scripts are candidates
-            if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
+            // If spelling with aspell, CJK scripts are not candidates
+            if (TextSplit::isCJK(*u8i))
                 return false;
         } else {
 #ifdef TESTING_XAPIAN_SPELL