From 1afc606718e7d69656e210407ba5583823f519de Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 26 Mar 2020 09:31:19 +0100 Subject: [PATCH] textsplit: break on it.error() not only it.eof(). Seems to make a difference in rare cases? Add Komoran support but this one often fails --- src/common/textsplit.cpp | 10 +++++----- src/common/textsplitko.cpp | 6 +++--- src/filters/kosplitter.py | 10 +++++++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 59bbfbbe..eab58f10 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in) #if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS) int prev_csc = -1; #endif - for (; !it.eof(); it++) { + for (; !it.eof() && !it.error(); it++) { unsigned int c = *it; nonalnumcnt++; @@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in) } // Check for eof, else c contains the first non-cjk // character after the cjk sequence, just go on. - if (it.eof()) + if (it.eof() || it.error()) break; } @@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) // Current number of valid offsets; unsigned int nchars = 0; unsigned int c = 0; - for (; !it.eof(); it++) { + for (; !it.eof() && !it.error(); it++) { c = *it; if (c == ' ' || c == '\t' || c == '\n') { continue; @@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs) bool TextSplit::hasVisibleWhite(const string &in) { Utf8Iter it(in); - for (; !it.eof(); it++) { + for (; !it.eof() && !it.error(); it++) { unsigned int c = (unsigned char)*it; if (c == (unsigned int)-1) { LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n"); @@ -1117,7 +1117,7 @@ template bool u8stringToStrings(const string &s, T &tokens) tokens.clear(); enum states {SPACE, TOKEN, INQUOTE, ESCAPE}; states state = SPACE; - for (; !it.eof(); it++) { + for (; !it.eof() && !it.error(); it++) { unsigned int c = *it; if (visiblewhite.find(c) != visiblewhite.end()) c = ' '; diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp index 678fbd87..65768620 100644 --- a/src/common/textsplitko.cpp +++ b/src/common/textsplitko.cpp @@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000; void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger) { o_cmdpath = config->findFilter("kosplitter.py"); - if (tagger == "Okt" || tagger == "Mecab") { + if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") { o_taggername = tagger; } else { LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger << @@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) // Walk the Korean characters section and send the text to the // analyser string::size_type orgbytepos = it.getBpos(); - for (; !it.eof(); it++) { + for (; !it.eof() && !it.error(); it++) { c = *it; if (!isHANGUL(c) && isalpha(c)) { // Done with Korean stretch, process and go back to main routine @@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) restartcount += inputdata.size(); unordered_map result; if (!o_talker->talk(args, result)) { - LOGERR("Python splitter for Korean failed\n"); + LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n"); return false; } diff --git a/src/filters/kosplitter.py b/src/filters/kosplitter.py index c586cfff..c917f04c 100755 --- a/src/filters/kosplitter.py +++ b/src/filters/kosplitter.py @@ -28,13 +28,14 @@ import sys import cmdtalk -from konlpy.tag import Okt,Mecab +from konlpy.tag import Okt,Mecab,Komoran class Processor(object): def __init__(self, proto): self.proto = proto self.tagsOkt = False self.tagsMecab = False + self.tagsKomoran = False def _init_tagger(self, taggername): if taggername == "Okt": @@ -43,13 +44,16 @@ class Processor(object): elif taggername == "Mecab": self.tagger = Mecab() self.tagsMecab = True + elif taggername == "Komoran": + self.tagger = Komoran() + self.tagsKomoran = True else: raise Exception("Bad tagger name " + taggername) def process(self, params): if 'data' not in params: return {'error':'No data field in parameters'} - if not (self.tagsOkt or self.tagsMecab): + if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran): if 'tagger' not in params: return {'error':'No "tagger" field in parameters'} self._init_tagger(params['tagger']); @@ -65,7 +69,7 @@ class Processor(object): tag = e[1] if self.tagsOkt: pass - elif self.tagsMecab: + elif self.tagsMecab or self.tagsKomoran: tb = tag[0:2] if tb[0] == "N": tag = "Noun"