Korean text: sort-of-working version, in need of validation

2020-03-22 15:49:24 +01:00 · 2020-03-22 15:49:24 +01:00 · c9667b5ba7
commit c9667b5ba7
parent 384e3a1087
3 changed files with 79 additions and 31 deletions
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@ -68,6 +68,8 @@ static bool initCmd()
    return true;
 }

+static const string sepchars("\t");
+
 bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
 {
    std::unique_lock<std::mutex> mylock(o_mutex);
@ -82,6 +84,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
    unordered_map<string, string> args;
    args.insert(pair<string,string>{"data", string()});
    string& inputdata{args.begin()->second};
+    string::size_type orgbytepos = it.getBpos();
    
    // Gather all Korean characters and send the text to the analyser
    for (; !it.eof(); it++) {
@ -94,6 +97,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
            it.appendchartostring(inputdata);
        }
    }
+    // Need to convert white text spans to single space otherwise the
+    // byte offsets will be wrong
+    
+    string::size_type textsize = inputdata.size();
    LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
            " bytes " << inputdata << endl);
    unordered_map<string,string> result;
@ -101,34 +108,69 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
        LOGERR("Python splitter for Korean failed\n");
        return false;
    }
-    auto resit = result.find("data");
+
+    auto resit = result.find("text");
    if (resit == result.end()) {
-        LOGERR("No data in Python splitter for Korean\n");
+        LOGERR("No text in Python splitter for Korean\n");
        return false;
    }        
-    string& outdata = resit->second;
-    char sepchar = '^';
-    //std::cerr << "GOT FROM SPLITTER: " << outdata << endl;
-    string::size_type wordstart = 0;
-    string::size_type wordend = outdata.find(sepchar);
-    for (;;) {
-        //cerr << "start " << wordstart << " end " << wordend << endl;        
-        if (wordend != wordstart) {
-            string::size_type len = (wordend == string::npos) ?
-                wordend : wordend-wordstart;
-            string word = outdata.substr(wordstart, len);
-            //cerr << " WORD[" <<  word << "]\n";
-            if (!takeword(word, m_wordpos++, 0, 0)) {
+    string& outtext = resit->second;
+    vector<string> words;
+    stringToTokens(outtext, words, sepchars);
+
+    resit = result.find("tags");
+    if (resit == result.end()) {
+        LOGERR("No tags in Python splitter for Korean\n");
+        return false;
+    }        
+    string& outtags = resit->second;
+    vector<string> tags;
+    stringToTokens(outtags, tags, sepchars);
+
+    // bytepos is the position in the whole text. The position in the
+    // local fragment is bytepos-orgbytepos
+    string::size_type bytepos(orgbytepos);
+    for (unsigned int i = 0; i < words.size(); i++) {
+        // The POS tagger strips characters from the input (e.g. multiple
+        // spaces, sometimes new lines, possibly other stuff). This
+        // means that we can't easily reconstruct the byte position
+        // from the concatenated terms. The output seems to be always
+        // shorter than the input, so we try to look ahead for the
+        // term. Can't be too sure that this works though, depending
+        // on exactly what transformation may have been applied from
+        // the original input to the term.
+        string word = words[i];
+        trimstring(word);
+        string::size_type newpos = bytepos - orgbytepos;
+        newpos = inputdata.find(word, newpos);
+        if (newpos != string::npos) {
+            bytepos = orgbytepos + newpos;
+        }
+        LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
+                " FOUND POS " << newpos << endl);
+        if (tags[i] == "Noun" || tags[i] == "Verb" ||
+            tags[i] == "Adjective" || tags[i] == "Adverb") {
+            if (!takeword(
+                    word, m_wordpos++, bytepos, bytepos + words[i].size())) {
                return false;
            }
        }
-        if (wordend == string::npos)
-            break;
-        wordstart = wordend + 1;
-        wordend = outdata.find(sepchar, wordstart);
+        LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
+               " TAG " << tags[i] << endl);
+        bytepos += words[i].size();
    }
-    

+#if DO_CHECK_THINGS
+    int sizediff = textsize - (bytepos - orgbytepos);
+    if (sizediff < 0)
+        sizediff = -sizediff;
+    if (sizediff > 1) {
+        LOGERR("ORIGINAL TEXT SIZE: " << textsize <<
+               " FINAL BYTE POS " << bytepos - orgbytepos <<
+               " TEXT [" << inputdata << "]\n");
+    }
+#endif
+    
    // Reset state, saving term position, and return the found non-cjk
    // Unicode character value. The current input byte offset is kept
    // in the utf8Iter
--- a/src/filters/cmdtalk.py
+++ b/src/filters/cmdtalk.py
@ -201,7 +201,9 @@ class CmdTalk:


 # Common main routine for testing: either run the normal protocol
-# engine or a local loop.
+# engine or a local loop. This means that you can call
+# cmdtalk.main(proto,processor) instead of proto.mainloop(processor)
+# from your module, and get the benefits of command line testing
 def main(proto, processor):
    if len(sys.argv) == 1:
        proto.mainloop(processor)
@ -220,7 +222,7 @@ def main(proto, processor):
    if len(args) == 0 or len(args) % 2 != 0:
        usage()
    params = dict()
-    for i in range(len(args)/2):
+    for i in range(int(len(args)/2)):
        params[args[2*i]] = args[2*i+1]
    res = processor.process(params)

--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@ -28,25 +28,29 @@
 import sys
 import cmdtalk

-from konlpy.tag import Okt
+from konlpy.tag import Okt,Kkma

 class Processor(object):
    def __init__(self, proto):
        self.proto = proto
-        self.okt = Okt()
+        self.tagger = Okt()
+        #self.tagger = Kkma()

    def process(self, params):
        if 'data' not in params:
            return {'error':'No data field in parameters'}
-        pos = self.okt.pos(params['data'])
+        pos = self.tagger.pos(params['data'])
        #proto.log("%s" % pos)
-        output = ""
+        text = ""
+        tags = ""
        for e in pos:
-            if e[1] == 'Noun' or e[1] == 'Verb' or e[1] == 'Adjective' or \
-               e[1] == 'Adverb':
-                output += e[0] + '^'
-        return {'data': output}
+            word = e[0]
+            word = word.replace('\t', ' ')
+            text += word + "\t"
+            tags += e[1] + "\t"
+        return {'text': text, 'tags': tags}

 proto = cmdtalk.CmdTalk()
 processor = Processor(proto)
-proto.mainloop(processor)
+cmdtalk.main(proto, processor)
+