textsplit: break on it.error(), not only on it.eof(). This seems to make a difference in rare cases. Add Komoran support, although this tagger often fails.
This commit is contained in:
parent
b677171fa8
commit
1afc606718
@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
|
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
|
||||||
int prev_csc = -1;
|
int prev_csc = -1;
|
||||||
#endif
|
#endif
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
unsigned int c = *it;
|
unsigned int c = *it;
|
||||||
nonalnumcnt++;
|
nonalnumcnt++;
|
||||||
|
|
||||||
@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
// Check for eof, else c contains the first non-cjk
|
// Check for eof, else c contains the first non-cjk
|
||||||
// character after the cjk sequence, just go on.
|
// character after the cjk sequence, just go on.
|
||||||
if (it.eof())
|
if (it.eof() || it.error())
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// Current number of valid offsets;
|
// Current number of valid offsets;
|
||||||
unsigned int nchars = 0;
|
unsigned int nchars = 0;
|
||||||
unsigned int c = 0;
|
unsigned int c = 0;
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
c = *it;
|
c = *it;
|
||||||
if (c == ' ' || c == '\t' || c == '\n') {
|
if (c == ' ' || c == '\t' || c == '\n') {
|
||||||
continue;
|
continue;
|
||||||
@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
|||||||
bool TextSplit::hasVisibleWhite(const string &in)
|
bool TextSplit::hasVisibleWhite(const string &in)
|
||||||
{
|
{
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
unsigned int c = (unsigned char)*it;
|
unsigned int c = (unsigned char)*it;
|
||||||
if (c == (unsigned int)-1) {
|
if (c == (unsigned int)-1) {
|
||||||
LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
|
LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
|
||||||
@ -1117,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
|||||||
tokens.clear();
|
tokens.clear();
|
||||||
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
||||||
states state = SPACE;
|
states state = SPACE;
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
unsigned int c = *it;
|
unsigned int c = *it;
|
||||||
if (visiblewhite.find(c) != visiblewhite.end())
|
if (visiblewhite.find(c) != visiblewhite.end())
|
||||||
c = ' ';
|
c = ' ';
|
||||||
|
|||||||
@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000;
|
|||||||
void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
|
void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
|
||||||
{
|
{
|
||||||
o_cmdpath = config->findFilter("kosplitter.py");
|
o_cmdpath = config->findFilter("kosplitter.py");
|
||||||
if (tagger == "Okt" || tagger == "Mecab") {
|
if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") {
|
||||||
o_taggername = tagger;
|
o_taggername = tagger;
|
||||||
} else {
|
} else {
|
||||||
LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
|
LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
|
||||||
@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// Walk the Korean characters section and send the text to the
|
// Walk the Korean characters section and send the text to the
|
||||||
// analyser
|
// analyser
|
||||||
string::size_type orgbytepos = it.getBpos();
|
string::size_type orgbytepos = it.getBpos();
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
c = *it;
|
c = *it;
|
||||||
if (!isHANGUL(c) && isalpha(c)) {
|
if (!isHANGUL(c) && isalpha(c)) {
|
||||||
// Done with Korean stretch, process and go back to main routine
|
// Done with Korean stretch, process and go back to main routine
|
||||||
@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
restartcount += inputdata.size();
|
restartcount += inputdata.size();
|
||||||
unordered_map<string,string> result;
|
unordered_map<string,string> result;
|
||||||
if (!o_talker->talk(args, result)) {
|
if (!o_talker->talk(args, result)) {
|
||||||
LOGERR("Python splitter for Korean failed\n");
|
LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -28,13 +28,14 @@
|
|||||||
import sys
|
import sys
|
||||||
import cmdtalk
|
import cmdtalk
|
||||||
|
|
||||||
from konlpy.tag import Okt,Mecab
|
from konlpy.tag import Okt,Mecab,Komoran
|
||||||
|
|
||||||
class Processor(object):
|
class Processor(object):
|
||||||
def __init__(self, proto):
|
def __init__(self, proto):
|
||||||
self.proto = proto
|
self.proto = proto
|
||||||
self.tagsOkt = False
|
self.tagsOkt = False
|
||||||
self.tagsMecab = False
|
self.tagsMecab = False
|
||||||
|
self.tagsKomoran = False
|
||||||
|
|
||||||
def _init_tagger(self, taggername):
|
def _init_tagger(self, taggername):
|
||||||
if taggername == "Okt":
|
if taggername == "Okt":
|
||||||
@ -43,13 +44,16 @@ class Processor(object):
|
|||||||
elif taggername == "Mecab":
|
elif taggername == "Mecab":
|
||||||
self.tagger = Mecab()
|
self.tagger = Mecab()
|
||||||
self.tagsMecab = True
|
self.tagsMecab = True
|
||||||
|
elif taggername == "Komoran":
|
||||||
|
self.tagger = Komoran()
|
||||||
|
self.tagsKomoran = True
|
||||||
else:
|
else:
|
||||||
raise Exception("Bad tagger name " + taggername)
|
raise Exception("Bad tagger name " + taggername)
|
||||||
|
|
||||||
def process(self, params):
|
def process(self, params):
|
||||||
if 'data' not in params:
|
if 'data' not in params:
|
||||||
return {'error':'No data field in parameters'}
|
return {'error':'No data field in parameters'}
|
||||||
if not (self.tagsOkt or self.tagsMecab):
|
if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
|
||||||
if 'tagger' not in params:
|
if 'tagger' not in params:
|
||||||
return {'error':'No "tagger" field in parameters'}
|
return {'error':'No "tagger" field in parameters'}
|
||||||
self._init_tagger(params['tagger']);
|
self._init_tagger(params['tagger']);
|
||||||
@ -65,7 +69,7 @@ class Processor(object):
|
|||||||
tag = e[1]
|
tag = e[1]
|
||||||
if self.tagsOkt:
|
if self.tagsOkt:
|
||||||
pass
|
pass
|
||||||
elif self.tagsMecab:
|
elif self.tagsMecab or self.tagsKomoran:
|
||||||
tb = tag[0:2]
|
tb = tag[0:2]
|
||||||
if tb[0] == "N":
|
if tb[0] == "N":
|
||||||
tag = "Noun"
|
tag = "Noun"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user