Regularise processing of hangul characters (there was a mixup of cjk/regular processing), and add a build-time option to either use cjk/ngram or regular term splitting for them

2019-07-02 18:02:38 +02:00 · 2019-07-02 18:02:38 +02:00 · 6b058e9758
commit 6b058e9758
parent 7457633b79
3 changed files with 47 additions and 12 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -44,6 +44,9 @@
 // ngrams
 #undef KATAKANA_AS_WORDS
 // Same for Korean syllabic
 #define HANGUL_AS_WORDS
 using namespace std;
 /**
@ -190,12 +193,13 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 // CJK Unicode character detection:
 //
 // 1100..11FF; Hangul Jamo (optional: see UNICODE_IS_HANGUL)
 // 2E80..2EFF; CJK Radicals Supplement
 // 3000..303F; CJK Symbols and Punctuation
 // 3040..309F; Hiragana
 // 30A0..30FF; Katakana
 // 3100..312F; Bopomofo
-// 3130..318F; Hangul Compatibility Jamo
+// 3130..318F; Hangul Compatibility Jamo (optional: see UNICODE_IS_HANGUL)
 // 3190..319F; Kanbun
 // 31A0..31BF; Bopomofo Extended
 // 31C0..31EF; CJK Strokes
@ -206,14 +210,15 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 // 4DC0..4DFF; Yijing Hexagram Symbols
 // 4E00..9FFF; CJK Unified Ideographs
 // A700..A71F; Modifier Tone Letters
-// AC00..D7AF; Hangul Syllables
+// AC00..D7AF; Hangul Syllables (optional: see UNICODE_IS_HANGUL)
 // F900..FAFF; CJK Compatibility Ideographs
 // FE30..FE4F; CJK Compatibility Forms
 // FF00..FFEF; Halfwidth and Fullwidth Forms
 // 20000..2A6DF; CJK Unified Ideographs Extension B
 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
 #define UNICODE_IS_CJK(p)                                               \
-    (((p) >= 0x2E80 && (p) <= 0x2EFF) ||                                \
+    (((p) >= 0x1100 && (p) <= 0x11FF) ||                                \
     ((p) >= 0x2E80 && (p) <= 0x2EFF) ||                                \
     ((p) >= 0x3000 && (p) <= 0x9FFF) ||                                \
     ((p) >= 0xA700 && (p) <= 0xA71F) ||                                \
     ((p) >= 0xAC00 && (p) <= 0xD7AF) ||                                \
@ -236,20 +241,42 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 #define UNICODE_IS_KATAKANA(p) false
 #endif
 // UNICODE_IS_HANGUL(p): true if code point p lies in one of the
 // ranges below. This is only the case when HANGUL_AS_WORDS is
 // defined at build time, else the macro is always false and Hangul
 // is not distinguished from the rest of the CJK ranges.
 // 1100..11FF is Hangul Jamo, 3130..318F is Hangul Compatibility
 // Jamo and AC00..D7AF is Hangul Syllables (see the CJK range list
 // above).
 // NOTE(review): the 3200..321E, 3248..327F and 3281..32BF ranges
 // presumably cover enclosed Hangul letters/syllables - confirm the
 // exact bounds against the Unicode charts.
 #ifdef HANGUL_AS_WORDS
 #define UNICODE_IS_HANGUL(p) (                 \
        ((p) >= 0x1100 && (p) <= 0x11FF) ||    \
        ((p) >= 0x3130 && (p) <= 0x318F) ||    \
        ((p) >= 0x3200 && (p) <= 0x321e) ||    \
        ((p) >= 0x3248 && (p) <= 0x327F) ||    \
        ((p) >= 0x3281 && (p) <= 0x32BF) ||    \
        ((p) >= 0xAC00 && (p) <= 0xD7AF)       \
        )
 #else
 #define UNICODE_IS_HANGUL(p) false
 #endif
 // Tell if code point c is to be treated as CJK: it must match
 // UNICODE_IS_CJK and must not be classified as Katakana or Hangul by
 // UNICODE_IS_KATAKANA / UNICODE_IS_HANGUL. UNICODE_IS_HANGUL is
 // always false when HANGUL_AS_WORDS is not defined, so that Hangul
 // then stays in the CJK class.
 bool TextSplit::isCJK(int c)
 {
-    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
+    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
        !UNICODE_IS_HANGUL(c);
 }
 // Tell if code point c is classified as Katakana by the
 // UNICODE_IS_KATAKANA macro. The macro has an always-false fallback
 // definition (presumably used when KATAKANA_AS_WORDS is not defined
 // - confirm against the #if which precedes it).
 bool TextSplit::isKATAKANA(int c)
 {
    return UNICODE_IS_KATAKANA(c);
 }
 // Tell if code point c is classified as Hangul by the
 // UNICODE_IS_HANGUL macro. Always false when HANGUL_AS_WORDS is not
 // defined.
 bool TextSplit::isHANGUL(int c)
 {
    return UNICODE_IS_HANGUL(c);
 }
 // Character span classes. These are used to detect script
 // transitions (katakana/other, hangul/other), which must trigger a
 // word split: there is not always a separator, and katakana or
 // hangul are otherwise treated like other text, in the same routine.
 // CJK is different: it has its own span reader, which causes a word
 // break.
-enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
+enum CharSpanClass {CSC_HANGUL, CSC_CJK, CSC_KATAKANA, CSC_OTHER};
 // Names for the CharSpanClass values, used to print them in debug
 // log messages.
 std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL),
        CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA),
        CHARFLAGENTRY(CSC_OTHER)};
 bool          TextSplit::o_processCJK{true};
 unsigned int  TextSplit::o_CJKNgramLen{2};
@ -577,7 +604,7 @@ bool TextSplit::text_to_words(const string &in)
    int nonalnumcnt = 0;
    Utf8Iter it(in);
-#ifdef KATAKANA_AS_WORDS
+#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
    int prev_csc = -1;
 #endif
    for (; !it.eof(); it++) {
@ -592,6 +619,8 @@ bool TextSplit::text_to_words(const string &in)
        CharSpanClass csc;
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
        } else if (UNICODE_IS_HANGUL(c)) {
            csc = CSC_HANGUL;
        } else if (UNICODE_IS_CJK(c)) {
            csc = CSC_CJK;
        } else {
@ -618,12 +647,17 @@ bool TextSplit::text_to_words(const string &in)
                break;
        }
-#ifdef KATAKANA_AS_WORDS
+#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
        // Only needed if we have script transitions inside this
-        // routine, else the call to cjk_to_words does the job.
+        // routine, else the call to cjk_to_words does the job (so do
-        if (csc != prev_csc && (m_wordLen || m_span.length())) {
+        // nothing right after a CJK section). This is needed because
-            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
+        // katakana-western transitions sometimes have no whitespace
-                   m_wordLen << " spl " << m_span.length() << endl);
+        // (and maybe hangul too, but probably not).
        if (prev_csc != CSC_CJK && csc != prev_csc &&
            (m_wordLen || m_span.length())) {
            LOGDEB2("csc " << valToString(csc_names, csc) << " prev_csc " <<
                    valToString(csc_names, prev_csc) << " wl " <<
                    m_wordLen << " spl " << m_span.length() << endl);
            if (!doemit(true, it.getBpos())) {
                return false;
            }
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -91,6 +91,7 @@ public:
    /** Is char CJK ? (excluding Katakana and Hangul when these are split as regular words) */
    static bool isCJK(int c);
    static bool isKATAKANA(int c);
    static bool isHANGUL(int c);
    /** Statistics about word length (average and dispersion) can
     * detect bad data like undecoded base64 or other mis-identified
--- a/src/testmains/trtextsplit.cpp
+++ b/src/testmains/trtextsplit.cpp
@ -196,7 +196,7 @@ int main(int argc, char **argv)
    RclConfig *config = new RclConfig(&cffn);
    TextSplit::staticConfInit(config);
-
+    Logger::getTheLog("stderr")->setLogLevel(Logger::LLDEB0);
    Rcl::StopList stoplist;
    if (op_flags & OPT_S) {