diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 4a61649d..3ea957ad 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -191,7 +191,12 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr) } #endif -// CJK Unicode character detection: +// CJK Unicode character detection. CJK text is indexed using an n-gram +// method; we do not try to extract words. There have been tentative +// exceptions for katakana and hangul, not successful because, even if +// these are closer to European text, they are still too different for +// the normal word splitter to work well on them. Katakana and hangul +// are processed by the n-gram splitter at the moment. // // 1100..11FF; Hangul Jamo (optional: see UNICODE_IS_HANGUL) // 2E80..2EFF; CJK Radicals Supplement @@ -628,7 +633,7 @@ bool TextSplit::text_to_words(const string &in) } if (o_processCJK && csc == CSC_CJK) { - // CJK excluding Katakana character hit. + // CJK character hit. // Do like at EOF with the current non-cjk data. if (m_wordLen || m_span.length()) { if (!doemit(true, it.getBpos()))