textsplit: break on it.error(), not only on it.eof(). This seems to make a difference in rare cases. Add Komoran support, but this tagger often fails.

2020-03-26 09:31:19 +01:00 · 2020-03-26 09:31:19 +01:00 · 1afc606718
commit 1afc606718
parent b677171fa8
3 changed files with 15 additions and 11 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
 #if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
    int prev_csc = -1;
 #endif
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        unsigned int c = *it;
        nonalnumcnt++;

@@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in)
            }
            // Check for eof, else c contains the first non-cjk
            // character after the cjk sequence, just go on.
-            if (it.eof())
+            if (it.eof() || it.error())
                break;
        }

@@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
    // Current number of valid offsets;
    unsigned int nchars = 0;
    unsigned int c = 0;
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        c = *it;
        if (c == ' ' || c == '\t' || c == '\n') {
            continue;
@@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
 bool TextSplit::hasVisibleWhite(const string &in)
 {
    Utf8Iter it(in);
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        unsigned int c = (unsigned char)*it;
        if (c == (unsigned int)-1) {
            LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
@@ -1117,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
    tokens.clear();
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
    states state = SPACE;
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        unsigned int c = *it;
        if (visiblewhite.find(c) != visiblewhite.end()) 
            c = ' ';
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000;
 void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
 {
    o_cmdpath = config->findFilter("kosplitter.py");
-    if (tagger == "Okt" || tagger == "Mecab") {
+    if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") {
        o_taggername = tagger;
    } else {
        LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
@@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
    // Walk the Korean characters section and send the text to the
    // analyser
    string::size_type orgbytepos = it.getBpos();
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        c = *it;
        if (!isHANGUL(c) && isalpha(c)) {
            // Done with Korean stretch, process and go back to main routine
@@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
    restartcount += inputdata.size();
    unordered_map<string,string> result;
    if (!o_talker->talk(args, result)) {
-        LOGERR("Python splitter for Korean failed\n");
+        LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
        return false;
    }

--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@@ -28,13 +28,14 @@
 import sys
 import cmdtalk

-from konlpy.tag import Okt,Mecab
+from konlpy.tag import Okt,Mecab,Komoran

 class Processor(object):
    def __init__(self, proto):
        self.proto = proto
        self.tagsOkt = False
        self.tagsMecab = False
+        self.tagsKomoran = False

    def _init_tagger(self, taggername):
        if taggername == "Okt":
@@ -43,13 +44,16 @@ class Processor(object):
        elif taggername == "Mecab":
            self.tagger = Mecab()
            self.tagsMecab = True
+        elif taggername == "Komoran":
+            self.tagger = Komoran()
+            self.tagsKomoran = True
        else:
            raise Exception("Bad tagger name " + taggername)
        
    def process(self, params):
        if 'data' not in params:
            return {'error':'No data field in parameters'}
-        if not (self.tagsOkt or self.tagsMecab):
+        if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
            if 'tagger' not in params:
                return {'error':'No "tagger" field in parameters'}
            self._init_tagger(params['tagger']);
@@ -65,7 +69,7 @@ class Processor(object):
            tag = e[1]
            if self.tagsOkt:
                pass
-            elif self.tagsMecab:
+            elif self.tagsMecab or self.tagsKomoran:
                tb = tag[0:2]
                if tb[0] == "N":
                    tag = "Noun"