Korean splitter: only break a Korean stretch on non-Korean alphabetic characters (e.g. not on numbers or punctuation)

This commit is contained in:
Jean-Francois Dockes 2020-03-25 16:57:42 +01:00
parent 023bdc055e
commit 97e89c408a

View File

@@ -124,9 +124,9 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
string::size_type orgbytepos = it.getBpos();
for (; !it.eof(); it++) {
c = *it;
if (!isHANGUL(c) && !(isspace(c) || ispunct(c))) {
if (!isHANGUL(c) && isalpha(c)) {
// Done with Korean stretch, process and go back to main routine
//std::cerr << "Broke on char " << int(c) << endl;
std::cerr << "Broke on char " << (std::string)it << endl;
break;
} else {
it.appendchartostring(inputdata);