textsplit: stop iterating when it.error() is set, not only on it.eof(); this seems to make a difference in rare cases. Add support for the Komoran tagger, although this one often fails.

This commit is contained in:
Jean-Francois Dockes 2020-03-26 09:31:19 +01:00
parent b677171fa8
commit 1afc606718
3 changed files with 15 additions and 11 deletions

View File

@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS) #if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
int prev_csc = -1; int prev_csc = -1;
#endif #endif
for (; !it.eof(); it++) { for (; !it.eof() && !it.error(); it++) {
unsigned int c = *it; unsigned int c = *it;
nonalnumcnt++; nonalnumcnt++;
@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in)
} }
// Check for eof, else c contains the first non-cjk // Check for eof, else c contains the first non-cjk
// character after the cjk sequence, just go on. // character after the cjk sequence, just go on.
if (it.eof()) if (it.eof() || it.error())
break; break;
} }
@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// Current number of valid offsets; // Current number of valid offsets;
unsigned int nchars = 0; unsigned int nchars = 0;
unsigned int c = 0; unsigned int c = 0;
for (; !it.eof(); it++) { for (; !it.eof() && !it.error(); it++) {
c = *it; c = *it;
if (c == ' ' || c == '\t' || c == '\n') { if (c == ' ' || c == '\t' || c == '\n') {
continue; continue;
@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
bool TextSplit::hasVisibleWhite(const string &in) bool TextSplit::hasVisibleWhite(const string &in)
{ {
Utf8Iter it(in); Utf8Iter it(in);
for (; !it.eof(); it++) { for (; !it.eof() && !it.error(); it++) {
unsigned int c = (unsigned char)*it; unsigned int c = (unsigned char)*it;
if (c == (unsigned int)-1) { if (c == (unsigned int)-1) {
LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n"); LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
@ -1117,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
tokens.clear(); tokens.clear();
enum states {SPACE, TOKEN, INQUOTE, ESCAPE}; enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
states state = SPACE; states state = SPACE;
for (; !it.eof(); it++) { for (; !it.eof() && !it.error(); it++) {
unsigned int c = *it; unsigned int c = *it;
if (visiblewhite.find(c) != visiblewhite.end()) if (visiblewhite.find(c) != visiblewhite.end())
c = ' '; c = ' ';

View File

@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000;
void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger) void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
{ {
o_cmdpath = config->findFilter("kosplitter.py"); o_cmdpath = config->findFilter("kosplitter.py");
if (tagger == "Okt" || tagger == "Mecab") { if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") {
o_taggername = tagger; o_taggername = tagger;
} else { } else {
LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger << LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
// Walk the Korean characters section and send the text to the // Walk the Korean characters section and send the text to the
// analyser // analyser
string::size_type orgbytepos = it.getBpos(); string::size_type orgbytepos = it.getBpos();
for (; !it.eof(); it++) { for (; !it.eof() && !it.error(); it++) {
c = *it; c = *it;
if (!isHANGUL(c) && isalpha(c)) { if (!isHANGUL(c) && isalpha(c)) {
// Done with Korean stretch, process and go back to main routine // Done with Korean stretch, process and go back to main routine
@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
restartcount += inputdata.size(); restartcount += inputdata.size();
unordered_map<string,string> result; unordered_map<string,string> result;
if (!o_talker->talk(args, result)) { if (!o_talker->talk(args, result)) {
LOGERR("Python splitter for Korean failed\n"); LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
return false; return false;
} }

View File

@ -28,13 +28,14 @@
import sys import sys
import cmdtalk import cmdtalk
from konlpy.tag import Okt,Mecab from konlpy.tag import Okt,Mecab,Komoran
class Processor(object): class Processor(object):
def __init__(self, proto): def __init__(self, proto):
self.proto = proto self.proto = proto
self.tagsOkt = False self.tagsOkt = False
self.tagsMecab = False self.tagsMecab = False
self.tagsKomoran = False
def _init_tagger(self, taggername): def _init_tagger(self, taggername):
if taggername == "Okt": if taggername == "Okt":
@ -43,13 +44,16 @@ class Processor(object):
elif taggername == "Mecab": elif taggername == "Mecab":
self.tagger = Mecab() self.tagger = Mecab()
self.tagsMecab = True self.tagsMecab = True
elif taggername == "Komoran":
self.tagger = Komoran()
self.tagsKomoran = True
else: else:
raise Exception("Bad tagger name " + taggername) raise Exception("Bad tagger name " + taggername)
def process(self, params): def process(self, params):
if 'data' not in params: if 'data' not in params:
return {'error':'No data field in parameters'} return {'error':'No data field in parameters'}
if not (self.tagsOkt or self.tagsMecab): if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
if 'tagger' not in params: if 'tagger' not in params:
return {'error':'No "tagger" field in parameters'} return {'error':'No "tagger" field in parameters'}
self._init_tagger(params['tagger']); self._init_tagger(params['tagger']);
@ -65,7 +69,7 @@ class Processor(object):
tag = e[1] tag = e[1]
if self.tagsOkt: if self.tagsOkt:
pass pass
elif self.tagsMecab: elif self.tagsMecab or self.tagsKomoran:
tb = tag[0:2] tb = tag[0:2]
if tb[0] == "N": if tb[0] == "N":
tag = "Noun" tag = "Noun"