textsplit: use more regular test for ISHANGUL. CJK: do not ignore whitespace, break on alphabetic non cjk character

2020-04-10 14:28:14 +02:00 · 2020-04-10 14:28:14 +02:00 · de246349da
commit de246349da
parent b63cc1b712
1 changed files with 77 additions and 75 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -34,6 +34,17 @@
 #include "smallut.h"
 #include "rclconfig.h"

+using namespace std;
+/**
+ * Splitting a text into words. The code in this file works with utf-8
+ * in a semi-clean way (see uproplist.h). Ascii still gets special
+ * treatment in the sense that many special characters can only be
+ * ascii (e.g. @, _,...). However, this compromise works quite well
+ * while being much more light-weight than a full-blown Unicode
+ * approach (ICU...)
+ */
+
+
 // Decide if we treat katakana as western scripts, splitting into
 // words instead of n-grams. This is not absurd (katakana is a kind of
 // alphabet, albeit phonetic and syllabic and is mostly used to
@ -49,16 +60,6 @@
 // is defined at compile time.
 #define HANGUL_AS_WORDS

-using namespace std;
-
-/**
- * Splitting a text into words. The code in this file works with utf-8
- * in a semi-clean way (see uproplist.h). Ascii still gets special
- * treatment in the sense that many special characters can only be
- * ascii (e.g. @, _,...). However, this compromise works quite well
- * while being much more light-weight than a full-blown Unicode
- * approach (ICU...)
- */

 // Ascii character classes: we have three main groups, and then some chars
 // are their own class because they want special handling.
@ -74,6 +75,58 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
                A_ULETTER=260, A_LLETTER=261, SKIP=262};
 static int charclasses[charclasses_size];

+
+bool          TextSplit::o_processCJK{true};
+unsigned int  TextSplit::o_CJKNgramLen{2};
+bool          TextSplit::o_noNumbers{false};
+bool          TextSplit::o_deHyphenate{false};
+int           TextSplit::o_maxWordLength{40};
+static const int o_CJKMaxNgramLen{5};
+bool o_exthangultagger{false};
+
+void TextSplit::staticConfInit(RclConfig *config)
+{
+    config->getConfParam("maxtermlength", &o_maxWordLength);
+
+    bool bvalue{false};
+    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
+        o_processCJK = false;
+    } else {
+        o_processCJK = true;
+        int ngramlen;
+        if (config->getConfParam("cjkngramlen", &ngramlen)) {
+            o_CJKNgramLen = (unsigned int)(ngramlen <= o_CJKMaxNgramLen ?
+                                           ngramlen : o_CJKMaxNgramLen);
+        }
+    }
+
+    bvalue = false;
+    if (config->getConfParam("nonumbers", &bvalue)) {
+        o_noNumbers = bvalue;
+    }
+
+    bvalue = false;
+    if (config->getConfParam("dehyphenate", &bvalue)) {
+        o_deHyphenate = bvalue;
+    }
+
+    bvalue = false;
+    if (config->getConfParam("backslashasletter", &bvalue)) {
+        if (bvalue) {
+        } else {
+            charclasses[int('\\')] = SPACE;
+        }
+    }
+
+    string kotagger;
+    config->getConfParam("hangultagger", kotagger);
+    if (!kotagger.empty()) {
+        o_exthangultagger = true;
+        koStaticConfInit(config, kotagger);
+    }
+}
+
+
 // Non-ascii UTF-8 characters are handled with sets holding all
 // characters with interesting properties. This is far from full-blown
 // management of Unicode properties, but seems to do the job well
@ -249,13 +302,16 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 #endif

 #ifdef HANGUL_AS_WORDS
+// If no external tagger is configured, we process HANGUL as generic
+// cjk (n-grams)
 #define UNICODE_IS_HANGUL(p) (                 \
-        ((p) >= 0x1100 && (p) <= 0x11FF) ||    \
-        ((p) >= 0x3130 && (p) <= 0x318F) ||    \
-        ((p) >= 0x3200 && (p) <= 0x321e) ||    \
-        ((p) >= 0x3248 && (p) <= 0x327F) ||    \
-        ((p) >= 0x3281 && (p) <= 0x32BF) ||    \
-        ((p) >= 0xAC00 && (p) <= 0xD7AF)       \
+        o_exthangultagger &&                   \
+        (((p) >= 0x1100 && (p) <= 0x11FF) ||   \
+         ((p) >= 0x3130 && (p) <= 0x318F) ||   \
+         ((p) >= 0x3200 && (p) <= 0x321e) ||   \
+         ((p) >= 0x3248 && (p) <= 0x327F) ||   \
+         ((p) >= 0x3281 && (p) <= 0x32BF) ||   \
+         ((p) >= 0xAC00 && (p) <= 0xD7AF))     \
        )
 #else
 #define UNICODE_IS_HANGUL(p) false
@ -285,56 +341,6 @@ std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL),
        CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA),
        CHARFLAGENTRY(CSC_OTHER)};

-bool          TextSplit::o_processCJK{true};
-unsigned int  TextSplit::o_CJKNgramLen{2};
-bool          TextSplit::o_noNumbers{false};
-bool          TextSplit::o_deHyphenate{false};
-int           TextSplit::o_maxWordLength{40};
-static const int o_CJKMaxNgramLen{5};
-bool o_exthangultagger{false};
-
-void TextSplit::staticConfInit(RclConfig *config)
-{
-    config->getConfParam("maxtermlength", &o_maxWordLength);
-
-    bool bvalue{false};
-    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
-        o_processCJK = false;
-    } else {
-        o_processCJK = true;
-        int ngramlen;
-        if (config->getConfParam("cjkngramlen", &ngramlen)) {
-            o_CJKNgramLen = (unsigned int)(ngramlen <= o_CJKMaxNgramLen ?
-                                           ngramlen : o_CJKMaxNgramLen);
-        }
-    }
-
-    bvalue = false;
-    if (config->getConfParam("nonumbers", &bvalue)) {
-        o_noNumbers = bvalue;
-    }
-
-    bvalue = false;
-    if (config->getConfParam("dehyphenate", &bvalue)) {
-        o_deHyphenate = bvalue;
-    }
-
-    bvalue = false;
-    if (config->getConfParam("backslashasletter", &bvalue)) {
-        if (bvalue) {
-        } else {
-            charclasses[int('\\')] = SPACE;
-        }
-    }
-
-    string kotagger;
-    config->getConfParam("hangultagger", kotagger);
-    if (!kotagger.empty()) {
-        o_exthangultagger = true;
-        koStaticConfInit(config, kotagger);
-    }
-}
-
 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
 inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
@ -635,11 +641,7 @@ bool TextSplit::text_to_words(const string &in)
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
        } else if (UNICODE_IS_HANGUL(c)) {
-            if (o_exthangultagger) {
-                csc = CSC_HANGUL;
-            } else {
-                csc = CSC_CJK;
-            }
+            csc = CSC_HANGUL;
        } else if (UNICODE_IS_CJK(c)) {
            csc = CSC_CJK;
        } else {
@ -998,10 +1000,10 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
    unsigned int c = 0;
    for (; !it.eof() && !it.error(); it++) {
        c = *it;
-        if (c == ' ' || c == '\t' || c == '\n') {
-            continue;
-        }
-        if (!UNICODE_IS_CJK(c)) {
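+		// We had a version which ignored whitespace for some time,
+		// but this was a bad idea. Only break on a non-CJK
+		// alphabetic character.
+		// NOTE(review): c is a Unicode code point (unsigned int), but
+		// isalpha() is only defined for unsigned char values and EOF,
+		// so the test below can only be relied on for single-byte
+		// letters -- confirm and guard the call (e.g. c < 0x80) or use
+		// a Unicode-aware alphabetic test.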
+        if (!UNICODE_IS_CJK(c) && isalpha(c)) {
            // Return to normal handler
            break;
        }