Korean text: sort-of-working version, in need of validation

2020-03-22 15:49:24 +01:00 · 2020-03-22 15:49:24 +01:00 · c9667b5ba7
commit c9667b5ba7
parent 384e3a1087
3 changed files with 79 additions and 31 deletions
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@ -68,6 +68,8 @@ static bool initCmd()
    return true;
 }
 static const string sepchars("\t");
 bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
 {
    std::unique_lock<std::mutex> mylock(o_mutex);
@ -82,6 +84,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
    unordered_map<string, string> args;
    args.insert(pair<string,string>{"data", string()});
    string& inputdata{args.begin()->second};
    string::size_type orgbytepos = it.getBpos();
    // Gather all Korean characters and send the text to the analyser
    for (; !it.eof(); it++) {
@ -94,6 +97,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
            it.appendchartostring(inputdata);
        }
    }
    // Need to convert whitespace spans to a single space, otherwise the
    // byte offsets will be wrong
    string::size_type textsize = inputdata.size();
    LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
            " bytes " << inputdata << endl);
    unordered_map<string,string> result;
@ -101,33 +108,68 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
        LOGERR("Python splitter for Korean failed\n");
        return false;
    }
-    auto resit = result.find("data");
+
    auto resit = result.find("text");
    if (resit == result.end()) {
-        LOGERR("No data in Python splitter for Korean\n");
+        LOGERR("No text in Python splitter for Korean\n");
        return false;
    }        
-    string& outdata = resit->second;
+    string& outtext = resit->second;
-    char sepchar = '^';
+    vector<string> words;
-    //std::cerr << "GOT FROM SPLITTER: " << outdata << endl;
+    stringToTokens(outtext, words, sepchars);
-    string::size_type wordstart = 0;
+
-    string::size_type wordend = outdata.find(sepchar);
+    resit = result.find("tags");
-    for (;;) {
+    if (resit == result.end()) {
-        //cerr << "start " << wordstart << " end " << wordend << endl;        
+        LOGERR("No tags in Python splitter for Korean\n");
-        if (wordend != wordstart) {
+        return false;
-            string::size_type len = (wordend == string::npos) ?
+    }        
-                wordend : wordend-wordstart;
+    string& outtags = resit->second;
-            string word = outdata.substr(wordstart, len);
+    vector<string> tags;
-            //cerr << " WORD[" <<  word << "]\n";
+    stringToTokens(outtags, tags, sepchars);
-            if (!takeword(word, m_wordpos++, 0, 0)) {
+
    // This is the position in the whole text, not the local fragment,
    // which is bytepos-orgbytepos
    string::size_type bytepos(orgbytepos);
    for (unsigned int i = 0; i < words.size(); i++) {
        // The POS tagger strips characters from the input (e.g. multiple
        // spaces, sometimes new lines, possibly other stuff). This
        // means that we can't easily reconstruct the byte position
        // from the concatenated terms. The output seems to be always
        // shorter than the input, so we try to look ahead for the
        // term. Can't be too sure that this works though, depending
        // on exactly what transformation may have been applied from
        // the original input to the term.
        string word = words[i];
        trimstring(word);
        string::size_type newpos = bytepos - orgbytepos;
        newpos = inputdata.find(word, newpos);
        if (newpos != string::npos) {
            bytepos = orgbytepos + newpos;
        }
        LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
                " FOUND POS " << newpos << endl);
        if (tags[i] == "Noun" || tags[i] == "Verb" ||
            tags[i] == "Adjective" || tags[i] == "Adverb") {
            if (!takeword(
                    word, m_wordpos++, bytepos, bytepos + words[i].size())) {
                return false;
            }
        }
-        if (wordend == string::npos)
+        LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
-            break;
+               " TAG " << tags[i] << endl);
-        wordstart = wordend + 1;
+        bytepos += words[i].size();
        wordend = outdata.find(sepchar, wordstart);
    }
 #if DO_CHECK_THINGS
    int sizediff = textsize - (bytepos - orgbytepos);
    if (sizediff < 0)
        sizediff = -sizediff;
    if (sizediff > 1) {
        LOGERR("ORIGINAL TEXT SIZE: " << textsize <<
               " FINAL BYTE POS " << bytepos - orgbytepos <<
               " TEXT [" << inputdata << "]\n");
    }
 #endif
    // Reset state, saving term position, and return the found non-cjk
    // Unicode character value. The current input byte offset is kept
--- a/src/filters/cmdtalk.py
+++ b/src/filters/cmdtalk.py
@ -201,7 +201,9 @@ class CmdTalk:
 # Common main routine for testing: either run the normal protocol
-# engine or a local loop.
+# engine or a local loop. This means that you can call
 # cmdtalk.main(proto,processor) instead of proto.mainloop(processor)
 # from your module, and get the benefits of command line testing
 def main(proto, processor):
    if len(sys.argv) == 1:
        proto.mainloop(processor)
@ -220,7 +222,7 @@ def main(proto, processor):
    if len(args) == 0 or len(args) % 2 != 0:
        usage()
    params = dict()
-    for i in range(len(args)/2):
+    for i in range(int(len(args)/2)):
        params[args[2*i]] = args[2*i+1]
    res = processor.process(params)
--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@ -28,25 +28,29 @@
 import sys
 import cmdtalk
-from konlpy.tag import Okt
+from konlpy.tag import Okt,Kkma
 class Processor(object):
    def __init__(self, proto):
        self.proto = proto
-        self.okt = Okt()
+        self.tagger = Okt()
        #self.tagger = Kkma()
    def process(self, params):
        if 'data' not in params:
            return {'error':'No data field in parameters'}
-        pos = self.okt.pos(params['data'])
+        pos = self.tagger.pos(params['data'])
        #proto.log("%s" % pos)
-        output = ""
+        text = ""
        tags = ""
        for e in pos:
-            if e[1] == 'Noun' or e[1] == 'Verb' or e[1] == 'Adjective' or \
+            word = e[0]
-               e[1] == 'Adverb':
+            word = word.replace('\t', ' ')
-                output += e[0] + '^'
+            text += word + "\t"
-        return {'data': output}
+            tags += e[1] + "\t"
        return {'text': text, 'tags': tags}
 proto = cmdtalk.CmdTalk()
 processor = Processor(proto)
-proto.mainloop(processor)
+cmdtalk.main(proto, processor)