textsplit: create isNGRAMMED() method to replace isCJK() and let the latter actually return what it says

2020-04-14 09:27:26 +02:00 · 2020-04-14 09:27:26 +02:00 · 9565663f09
commit 9565663f09
parent 5dd8774b3c
4 changed files with 13 additions and 8 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -319,8 +319,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)

 bool TextSplit::isCJK(int c)
 {
-    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
-        !UNICODE_IS_HANGUL(c);
+    return UNICODE_IS_CJK(c);
 }
 bool TextSplit::isKATAKANA(int c)
 {
@ -330,6 +329,11 @@ bool TextSplit::isHANGUL(int c)
 {
    return UNICODE_IS_HANGUL(c);
 }
+bool TextSplit::isNGRAMMED(int c)
+{
+    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
+        !UNICODE_IS_HANGUL(c);
+}


 // This is used to detect katakana/other transitions, which must
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -89,11 +89,13 @@ public:
    static bool stringToStrings(const std::string &s,
                                std::vector<std::string> &tokens);

-    /** Is char CJK ? (excluding Katakana) */
+    /** Is char CJK ? */
    static bool isCJK(int c);
    static bool isKATAKANA(int c);
    static bool isHANGUL(int c);
-
+    /** Is char n-grammed, i.e. not split into words ? (CJK, excluding Katakana and Hangul) */
+    static bool isNGRAMMED(int c);
+    
    /** Statistics about word length (average and dispersion) can
     * detect bad data like undecoded base64 or other mis-identified
     * pieces of data taken as text. In practise, this keeps some junk out 
--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@ -478,7 +478,7 @@ void Query::Native::abstractCreateSnippetsVector(
        }
        Utf8Iter uit(ent.second);
        bool newcjk = false;
-        if (TextSplit::isCJK(*uit))
+        if (TextSplit::isNGRAMMED(*uit))
            newcjk = true;
        if (!incjk || (incjk && !newcjk))
            chunk += " ";
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -202,9 +202,8 @@ public:

        Utf8Iter u8i(term);
        if (with_aspell) {
-            // If spelling with aspell, neither katakana nor other cjk
-            // scripts are candidates
-            if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
+            // If spelling with aspell, CJK scripts are not candidates
+            if (TextSplit::isCJK(*u8i))
                return false;
        } else {
 #ifdef TESTING_XAPIAN_SPELL