textsplit: break on it.error(), not only on it.eof(). This seems to make a difference in rare cases. Add Komoran support, but this tagger often fails.
This commit is contained in:
parent
b677171fa8
commit
1afc606718
@ -622,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
|
||||
int prev_csc = -1;
|
||||
#endif
|
||||
for (; !it.eof(); it++) {
|
||||
for (; !it.eof() && !it.error(); it++) {
|
||||
unsigned int c = *it;
|
||||
nonalnumcnt++;
|
||||
|
||||
@ -668,7 +668,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
}
|
||||
// Check for eof, else c contains the first non-cjk
|
||||
// character after the cjk sequence, just go on.
|
||||
if (it.eof())
|
||||
if (it.eof() || it.error())
|
||||
break;
|
||||
}
|
||||
|
||||
@ -996,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
// Current number of valid offsets;
|
||||
unsigned int nchars = 0;
|
||||
unsigned int c = 0;
|
||||
for (; !it.eof(); it++) {
|
||||
for (; !it.eof() && !it.error(); it++) {
|
||||
c = *it;
|
||||
if (c == ' ' || c == '\t' || c == '\n') {
|
||||
continue;
|
||||
@ -1097,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
||||
bool TextSplit::hasVisibleWhite(const string &in)
|
||||
{
|
||||
Utf8Iter it(in);
|
||||
for (; !it.eof(); it++) {
|
||||
for (; !it.eof() && !it.error(); it++) {
|
||||
unsigned int c = (unsigned char)*it;
|
||||
if (c == (unsigned int)-1) {
|
||||
LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
|
||||
@ -1117,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
||||
tokens.clear();
|
||||
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
||||
states state = SPACE;
|
||||
for (; !it.eof(); it++) {
|
||||
for (; !it.eof() && !it.error(); it++) {
|
||||
unsigned int c = *it;
|
||||
if (visiblewhite.find(c) != visiblewhite.end())
|
||||
c = ' ';
|
||||
|
||||
@ -56,7 +56,7 @@ static uint64_t restartthreshold = 5 * 1000 * 1000;
|
||||
void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
|
||||
{
|
||||
o_cmdpath = config->findFilter("kosplitter.py");
|
||||
if (tagger == "Okt" || tagger == "Mecab") {
|
||||
if (tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran") {
|
||||
o_taggername = tagger;
|
||||
} else {
|
||||
LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
|
||||
@ -122,7 +122,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
// Walk the Korean characters section and send the text to the
|
||||
// analyser
|
||||
string::size_type orgbytepos = it.getBpos();
|
||||
for (; !it.eof(); it++) {
|
||||
for (; !it.eof() && !it.error(); it++) {
|
||||
c = *it;
|
||||
if (!isHANGUL(c) && isalpha(c)) {
|
||||
// Done with Korean stretch, process and go back to main routine
|
||||
@ -137,7 +137,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
restartcount += inputdata.size();
|
||||
unordered_map<string,string> result;
|
||||
if (!o_talker->talk(args, result)) {
|
||||
LOGERR("Python splitter for Korean failed\n");
|
||||
LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -28,13 +28,14 @@
|
||||
import sys
|
||||
import cmdtalk
|
||||
|
||||
from konlpy.tag import Okt,Mecab
|
||||
from konlpy.tag import Okt,Mecab,Komoran
|
||||
|
||||
class Processor(object):
|
||||
def __init__(self, proto):
|
||||
self.proto = proto
|
||||
self.tagsOkt = False
|
||||
self.tagsMecab = False
|
||||
self.tagsKomoran = False
|
||||
|
||||
def _init_tagger(self, taggername):
|
||||
if taggername == "Okt":
|
||||
@ -43,13 +44,16 @@ class Processor(object):
|
||||
elif taggername == "Mecab":
|
||||
self.tagger = Mecab()
|
||||
self.tagsMecab = True
|
||||
elif taggername == "Komoran":
|
||||
self.tagger = Komoran()
|
||||
self.tagsKomoran = True
|
||||
else:
|
||||
raise Exception("Bad tagger name " + taggername)
|
||||
|
||||
def process(self, params):
|
||||
if 'data' not in params:
|
||||
return {'error':'No data field in parameters'}
|
||||
if not (self.tagsOkt or self.tagsMecab):
|
||||
if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
|
||||
if 'tagger' not in params:
|
||||
return {'error':'No "tagger" field in parameters'}
|
||||
self._init_tagger(params['tagger']);
|
||||
@ -65,7 +69,7 @@ class Processor(object):
|
||||
tag = e[1]
|
||||
if self.tagsOkt:
|
||||
pass
|
||||
elif self.tagsMecab:
|
||||
elif self.tagsMecab or self.tagsKomoran:
|
||||
tb = tag[0:2]
|
||||
if tb[0] == "N":
|
||||
tag = "Noun"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user