From 1afc606718e7d69656e210407ba5583823f519de Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Thu, 26 Mar 2020 09:31:19 +0100
Subject: [PATCH] textsplit: break on it.error(), not only on it.eof(). This
 seems to make a difference in rare cases. Add Komoran support, but this
 tagger often fails

---
 src/common/textsplit.cpp   | 10 +++++-----
 src/common/textsplitko.cpp |  6 +++---
 src/filters/kosplitter.py  | 10 +++++++---
 3 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 59bbfbbe..eab58f10 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
 #if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
     int prev_csc = -1;
 #endif
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
         unsigned int c = *it;
         nonalnumcnt++;
 
@@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in)
             }
             // Check for eof, else c contains the first non-cjk
             // character after the cjk sequence, just go on.
-            if (it.eof())
+            if (it.eof() || it.error())
                 break;
         }
 
@@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
     // Current number of valid offsets;
     unsigned int nchars = 0;
     unsigned int c = 0;
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
         c = *it;
         if (c == ' ' || c == '\t' || c == '\n') {
             continue;
@@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
 bool TextSplit::hasVisibleWhite(const string &in)
 {
     Utf8Iter it(in);
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
         unsigned int c = (unsigned char)*it;
         if (c == (unsigned int)-1) {
             LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
@@ -1117,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
     tokens.clear();
     enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
     states state = SPACE;
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
         unsigned int c = *it;
         if (visiblewhite.find(c) != visiblewhite.end()) 
             c = ' ';
diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp
index 678fbd87..65768620 100644
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000;
 void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
 {
     o_cmdpath = config->findFilter("kosplitter.py");
-    if (tagger == "Okt" || tagger == "Mecab") {
+    if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") {
         o_taggername = tagger;
     } else {
         LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
@@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
     // Walk the Korean characters section and send the text to the
     // analyser
     string::size_type orgbytepos = it.getBpos();
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
         c = *it;
         if (!isHANGUL(c) && isalpha(c)) {
             // Done with Korean stretch, process and go back to main routine
@@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
     restartcount += inputdata.size();
     unordered_map<string,string> result;
     if (!o_talker->talk(args, result)) {
-        LOGERR("Python splitter for Korean failed\n");
+        LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
         return false;
     }
 
diff --git a/src/filters/kosplitter.py b/src/filters/kosplitter.py
index c586cfff..c917f04c 100755
--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@@ -28,13 +28,14 @@
 import sys
 import cmdtalk
 
-from konlpy.tag import Okt,Mecab
+from konlpy.tag import Okt,Mecab,Komoran
 
 class Processor(object):
     def __init__(self, proto):
         self.proto = proto
         self.tagsOkt = False
         self.tagsMecab = False
+        self.tagsKomoran = False
 
     def _init_tagger(self, taggername):
         if taggername == "Okt":
@@ -43,13 +44,16 @@ class Processor(object):
         elif taggername == "Mecab":
             self.tagger = Mecab()
             self.tagsMecab = True
+        elif taggername == "Komoran":
+            self.tagger = Komoran()
+            self.tagsKomoran = True
         else:
             raise Exception("Bad tagger name " + taggername)
         
     def process(self, params):
         if 'data' not in params:
             return {'error':'No data field in parameters'}
-        if not (self.tagsOkt or self.tagsMecab):
+        if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
             if 'tagger' not in params:
                 return {'error':'No "tagger" field in parameters'}
             self._init_tagger(params['tagger']);
@@ -65,7 +69,7 @@ class Processor(object):
             tag = e[1]
             if self.tagsOkt:
                 pass
-            elif self.tagsMecab:
+            elif self.tagsMecab or self.tagsKomoran:
                 tb = tag[0:2]
                 if tb[0] == "N":
                     tag = "Noun"