diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp index f1929a6f..e4c624b4 100644 --- a/src/common/textsplitko.cpp +++ b/src/common/textsplitko.cpp @@ -68,6 +68,8 @@ static bool initCmd() return true; } +static const string sepchars("\t"); + bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) { std::unique_lock mylock(o_mutex); @@ -82,6 +84,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) unordered_map args; args.insert(pair{"data", string()}); string& inputdata{args.begin()->second}; + string::size_type orgbytepos = it.getBpos(); // Gather all Korean characters and send the text to the analyser for (; !it.eof(); it++) { @@ -94,6 +97,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) it.appendchartostring(inputdata); } } + // Need to convert white text spans to single space otherwise the + // byte offsets will be wrong + + string::size_type textsize = inputdata.size(); LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() << " bytes " << inputdata << endl); unordered_map result; @@ -101,34 +108,69 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) LOGERR("Python splitter for Korean failed\n"); return false; } - auto resit = result.find("data"); + + auto resit = result.find("text"); if (resit == result.end()) { - LOGERR("No data in Python splitter for Korean\n"); + LOGERR("No text in Python splitter for Korean\n"); return false; } - string& outdata = resit->second; - char sepchar = '^'; - //std::cerr << "GOT FROM SPLITTER: " << outdata << endl; - string::size_type wordstart = 0; - string::size_type wordend = outdata.find(sepchar); - for (;;) { - //cerr << "start " << wordstart << " end " << wordend << endl; - if (wordend != wordstart) { - string::size_type len = (wordend == string::npos) ? - wordend : wordend-wordstart; - string word = outdata.substr(wordstart, len); - //cerr << " WORD[" << word << "]\n"; - if (!takeword(word, m_wordpos++, 0, 0)) { + string& outtext = resit->second; + vector words; + stringToTokens(outtext, words, sepchars); + + resit = result.find("tags"); + if (resit == result.end()) { + LOGERR("No tags in Python splitter for Korean\n"); + return false; + } + string& outtags = resit->second; + vector tags; + stringToTokens(outtags, tags, sepchars); + + // This is the position in the whole text, not the local fragment, + // which is bytepos-orgbytepos + string::size_type bytepos(orgbytepos); + for (unsigned int i = 0; i < words.size(); i++) { + // The POS tagger strips characters from the input (e.g. multiple + // spaces, sometimes new lines, possibly other stuff). This + // means that we can't easily reconstruct the byte position + // from the concatenated terms. The output seems to be always + // shorter than the input, so we try to look ahead for the + // term. Can't be too sure that this works though, depending + // on exactly what transformation may have been applied from + // the original input to the term. + string word = words[i]; + trimstring(word); + string::size_type newpos = bytepos - orgbytepos; + newpos = inputdata.find(word, newpos); + if (newpos != string::npos) { + bytepos = orgbytepos + newpos; + } + LOGDEB1("WORD OPOS " << bytepos-orgbytepos << + " FOUND POS " << newpos << endl); + if (tags[i] == "Noun" || tags[i] == "Verb" || + tags[i] == "Adjective" || tags[i] == "Adverb") { + if (!takeword( + word, m_wordpos++, bytepos, bytepos + words[i].size())) { return false; } } - if (wordend == string::npos) - break; - wordstart = wordend + 1; - wordend = outdata.find(sepchar, wordstart); + LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() << + " TAG " << tags[i] << endl); + bytepos += words[i].size(); } - +#if DO_CHECK_THINGS + int sizediff = textsize - (bytepos - orgbytepos); + if (sizediff < 0) + sizediff = -sizediff; + if (sizediff > 1) { + LOGERR("ORIGINAL TEXT SIZE: " << textsize << + " FINAL BYTE POS " << bytepos - orgbytepos << + " TEXT [" << inputdata << "]\n"); + } +#endif + // Reset state, saving term position, and return the found non-cjk // Unicode character value. The current input byte offset is kept // in the utf8Iter diff --git a/src/filters/cmdtalk.py b/src/filters/cmdtalk.py index 8bb49e28..2949e936 100644 --- a/src/filters/cmdtalk.py +++ b/src/filters/cmdtalk.py @@ -201,7 +201,9 @@ class CmdTalk: # Common main routine for testing: either run the normal protocol -# engine or a local loop. +# engine or a local loop. This means that you can call +# cmdtalk.main(proto,processor) instead of proto.mainloop(processor) +# from your module, and get the benefits of command line testing def main(proto, processor): if len(sys.argv) == 1: proto.mainloop(processor) @@ -220,7 +222,7 @@ def main(proto, processor): if len(args) == 0 or len(args) % 2 != 0: usage() params = dict() - for i in range(len(args)/2): + for i in range(int(len(args)/2)): params[args[2*i]] = args[2*i+1] res = processor.process(params) diff --git a/src/filters/kosplitter.py b/src/filters/kosplitter.py index d7d394c6..4037acb5 100755 --- a/src/filters/kosplitter.py +++ b/src/filters/kosplitter.py @@ -28,25 +28,29 @@ import sys import cmdtalk -from konlpy.tag import Okt +from konlpy.tag import Okt,Kkma class Processor(object): def __init__(self, proto): self.proto = proto - self.okt = Okt() + self.tagger = Okt() + #self.tagger = Kkma() def process(self, params): if 'data' not in params: return {'error':'No data field in parameters'} - pos = self.okt.pos(params['data']) + pos = self.tagger.pos(params['data']) #proto.log("%s" % pos) - output = "" + text = "" + tags = "" for e in pos: - if e[1] == 'Noun' or e[1] == 'Verb' or e[1] == 'Adjective' or \ - e[1] == 'Adverb': - output += e[0] + '^' - return {'data': output} + word = e[0] + word = word.replace('\t', ' ') + text += word + "\t" + tags += e[1] + "\t" + return {'text': text, 'tags': tags} proto = cmdtalk.CmdTalk() processor = Processor(proto) -proto.mainloop(processor) +cmdtalk.main(proto, processor) +