textsplit: break out of the iteration loops on it.error(), not only on it.eof(). This seems to make a difference in rare cases. Add support for the Komoran tagger, although this one often fails.

This commit is contained in:
Jean-Francois Dockes 2020-03-26 09:31:19 +01:00
parent b677171fa8
commit 1afc606718
3 changed files with 15 additions and 11 deletions

View File

@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
int prev_csc = -1;
#endif
for (; !it.eof(); it++) {
for (; !it.eof() && !it.error(); it++) {
unsigned int c = *it;
nonalnumcnt++;
@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in)
}
// Check for eof, else c contains the first non-cjk
// character after the cjk sequence, just go on.
if (it.eof())
if (it.eof() || it.error())
break;
}
@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// Current number of valid offsets;
unsigned int nchars = 0;
unsigned int c = 0;
for (; !it.eof(); it++) {
for (; !it.eof() && !it.error(); it++) {
c = *it;
if (c == ' ' || c == '\t' || c == '\n') {
continue;
@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
bool TextSplit::hasVisibleWhite(const string &in)
{
Utf8Iter it(in);
for (; !it.eof(); it++) {
for (; !it.eof() && !it.error(); it++) {
unsigned int c = (unsigned char)*it;
if (c == (unsigned int)-1) {
LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
@ -1117,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
tokens.clear();
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
states state = SPACE;
for (; !it.eof(); it++) {
for (; !it.eof() && !it.error(); it++) {
unsigned int c = *it;
if (visiblewhite.find(c) != visiblewhite.end())
c = ' ';

View File

@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000;
void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
{
o_cmdpath = config->findFilter("kosplitter.py");
if (tagger == "Okt" || tagger == "Mecab") {
if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") {
o_taggername = tagger;
} else {
LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
// Walk the Korean characters section and send the text to the
// analyser
string::size_type orgbytepos = it.getBpos();
for (; !it.eof(); it++) {
for (; !it.eof() && !it.error(); it++) {
c = *it;
if (!isHANGUL(c) && isalpha(c)) {
// Done with Korean stretch, process and go back to main routine
@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
restartcount += inputdata.size();
unordered_map<string,string> result;
if (!o_talker->talk(args, result)) {
LOGERR("Python splitter for Korean failed\n");
LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
return false;
}

View File

@ -28,13 +28,14 @@
import sys
import cmdtalk
from konlpy.tag import Okt,Mecab
from konlpy.tag import Okt,Mecab,Komoran
class Processor(object):
def __init__(self, proto):
self.proto = proto
self.tagsOkt = False
self.tagsMecab = False
self.tagsKomoran = False
def _init_tagger(self, taggername):
if taggername == "Okt":
@ -43,13 +44,16 @@ class Processor(object):
elif taggername == "Mecab":
self.tagger = Mecab()
self.tagsMecab = True
elif taggername == "Komoran":
self.tagger = Komoran()
self.tagsKomoran = True
else:
raise Exception("Bad tagger name " + taggername)
def process(self, params):
if 'data' not in params:
return {'error':'No data field in parameters'}
if not (self.tagsOkt or self.tagsMecab):
if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
if 'tagger' not in params:
return {'error':'No "tagger" field in parameters'}
self._init_tagger(params['tagger']);
@ -65,7 +69,7 @@ class Processor(object):
tag = e[1]
if self.tagsOkt:
pass
elif self.tagsMecab:
elif self.tagsMecab or self.tagsKomoran:
tb = tag[0:2]
if tb[0] == "N":
tag = "Noun"