From 97e89c408a08414779433cc1a649ef6c5a2a3e86 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Wed, 25 Mar 2020 16:57:42 +0100
Subject: [PATCH] korean splitter: only break korean stretch on non-korean alphabetic (e.g. not numbers or punctuation)

---
 src/common/textsplitko.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp
index f3257b6d..d0c1bdc0 100644
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@@ -124,9 +124,9 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
     string::size_type orgbytepos = it.getBpos();
     for (; !it.eof(); it++) {
         c = *it;
-        if (!isHANGUL(c) && !(isspace(c) || ispunct(c))) {
+        if (!isHANGUL(c) && isalpha(c)) {
             // Done with Korean stretch, process and go back to main routine
-            //std::cerr << "Broke on char " << int(c) << endl;
+            std::cerr << "Broke on char " << (std::string)it << endl;
             break;
         } else {
             it.appendchartostring(inputdata);