Merge branch 'kopostag'

2020-03-26 14:03:17 +01:00 · 2020-03-26 14:03:17 +01:00 · 9b3a5fac12
commit 9b3a5fac12
parent f755505e98 1afc606718
18 changed files with 5527 additions and 1275 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -82,6 +82,7 @@ common/rclinit.h \
 common/syngroups.cpp \
 common/syngroups.h \
 common/textsplit.cpp \
+common/textsplitko.cpp \
 common/textsplit.h \
 common/unacpp.cpp \
 common/unacpp.h \
@ -210,6 +211,8 @@ utils/circache.cpp \
 utils/circache.h \
 utils/closefrom.cpp \
 utils/closefrom.h \
+utils/cmdtalk.cpp \
+utils/cmdtalk.h \
 utils/conftree.cpp \
 utils/conftree.h \
 utils/copyfile.cpp \
@ -645,8 +648,10 @@ filterdir = $(pkgdatadir)/filters
 dist_filter_DATA = \
 desktop/hotrecoll.py \
 filters/abiword.xsl \
+filters/cmdtalk.py \
 filters/fb2.xsl \
 filters/gnumeric.xsl \
+filters/kosplitter.py \
 filters/msodump.zip \
 filters/okular-note.xsl \
 filters/opendoc-body.xsl \
@ -724,7 +729,7 @@ python/recoll/recoll/rclconfig.py
 install-data-hook: 
 	(cd $(DESTDIR)/$(filterdir); \
 	chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
-	chmod a+x recoll-we-move-files.py ../examples/rclmon.sh; \
+	chmod a+x recoll-we-move-files.py ../examples/rclmon.sh kosplitter.py; \
 	chmod 0644 msodump.zip recollepub.zip rclexecm.py rcllatinstops.zip \
        rclconfig.py conftree.py rclmidi.py rclexec1.py rcluncomp.py rclxslt.py)

--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -44,8 +44,10 @@
 // ngrams
 #undef KATAKANA_AS_WORDS

-// Same for Korean syllabic, and same problem, not used.
-#undef HANGUL_AS_WORDS
+// Same for Korean syllabic, and same problem. However we have a
+// runtime option to use an external text analyser for hangul, so this
+// is defined at compile time.
+#define HANGUL_AS_WORDS

 using namespace std;

@ -289,6 +291,7 @@ bool          TextSplit::o_noNumbers{false};
 bool          TextSplit::o_deHyphenate{false};
 int           TextSplit::o_maxWordLength{40};
 static const int o_CJKMaxNgramLen{5};
+bool o_exthangultagger{false};

 void TextSplit::staticConfInit(RclConfig *config)
 {
@ -323,7 +326,14 @@ void TextSplit::staticConfInit(RclConfig *config)
            charclasses[int('\\')] = SPACE;
        }
    }
-}    
+
+    string kotagger;
+    config->getConfParam("hangultagger", kotagger);
+    if (!kotagger.empty()) {
+        o_exthangultagger = true;
+        koStaticConfInit(config, kotagger);
+    }
+}

 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
@ -612,7 +622,7 @@ bool TextSplit::text_to_words(const string &in)
 #if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
    int prev_csc = -1;
 #endif
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        unsigned int c = *it;
        nonalnumcnt++;

@ -625,30 +635,40 @@ bool TextSplit::text_to_words(const string &in)
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
        } else if (UNICODE_IS_HANGUL(c)) {
-            csc = CSC_HANGUL;
+            if (o_exthangultagger) {
+                csc = CSC_HANGUL;
+            } else {
+                csc = CSC_CJK;
+            }
        } else if (UNICODE_IS_CJK(c)) {
            csc = CSC_CJK;
        } else {
            csc = CSC_OTHER;
        }

-        if (o_processCJK && csc == CSC_CJK) {
-            // CJK character hit. 
+        if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) {
+            // CJK character hit. Hangul processing may be special.
+
            // Do like at EOF with the current non-cjk data.
            if (m_wordLen || m_span.length()) {
                if (!doemit(true, it.getBpos()))
                    return false;
            }
-
-            // Hand off situation to the cjk routine.
-            if (!cjk_to_words(&it, &c)) {
-                LOGERR("Textsplit: scan error in cjk handler\n");
-                return false;
+            // Hand off situation to the appropriate routine.
+            if (csc == CSC_HANGUL) {
+                if (!ko_to_words(&it, &c)) {
+                    LOGERR("Textsplit: scan error in korean handler\n");
+                    return false;
+                }
+            } else {
+                if (!cjk_to_words(&it, &c)) {
+                    LOGERR("Textsplit: scan error in cjk handler\n");
+                    return false;
+                }
            }
-
            // Check for eof, else c contains the first non-cjk
            // character after the cjk sequence, just go on.
-            if (it.eof())
+            if (it.eof() || it.error())
                break;
        }

@ -976,7 +996,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
    // Current number of valid offsets;
    unsigned int nchars = 0;
    unsigned int c = 0;
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        c = *it;
        if (c == ' ' || c == '\t' || c == '\n') {
            continue;
@ -1077,7 +1097,7 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
 bool TextSplit::hasVisibleWhite(const string &in)
 {
    Utf8Iter it(in);
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        unsigned int c = (unsigned char)*it;
        if (c == (unsigned int)-1) {
            LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
@ -1097,7 +1117,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
    tokens.clear();
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
    states state = SPACE;
-    for (; !it.eof(); it++) {
+    for (; !it.eof() && !it.error(); it++) {
        unsigned int c = *it;
        if (visiblewhite.find(c) != visiblewhite.end()) 
            c = ' ';
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -54,6 +54,7 @@ public:
    /** Call at program initialization to read non default values from the 
        configuration */
    static void staticConfInit(RclConfig *config);
+    static void koStaticConfInit(RclConfig *config, const std::string& tagger);
    
    /** Split text, emit words and positions. */
    virtual bool text_to_words(const std::string &in);
@ -199,6 +200,9 @@ private:
    // This processes cjk text:
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);

+    // Experimental Korean splitter. This uses an external Python tokenizer
+    bool ko_to_words(Utf8Iter *it, unsigned int *cp);
+    
    bool emitterm(bool isspan, std::string &term, int pos, size_t bs,size_t be);
    bool doemit(bool spanerase, size_t bp);
    void discardspan();
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@ -0,0 +1,214 @@
+/* Copyright (C) 2020 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+// Specialized Korean text splitter using konlpy running in a Python
+// subprocess. konlpy can use several different backends. We support
+// Okt (Twitter) and Mecab at this point. Unfortunately the different
+// backends have different POS TAG names, so that things are not
+// completely transparent when using another (need to translate the tag
+// names in the Python program).
+
+#include "autoconfig.h"
+
+#include <iostream>
+#include <string>
+#include <cstring>
+#include <unordered_set>
+#include <mutex>
+
+#include "textsplit.h"
+#include "log.h"
+//#define UTF8ITER_CHECK
+#include "utf8iter.h"
+#include "smallut.h"
+#include "rclconfig.h"
+#include "cmdtalk.h"
+
+using namespace std;
+
+// Separator char used in words and tags lists.
+static const string sepchars("\t");
+
+// Handle on the external splitter command. Null until initCmd() has
+// started it, and reset to null when the command is stopped.
+static CmdTalk *o_talker;
+// Set once starting the command has failed: initCmd() then gives up
+// immediately instead of retrying on every call.
+static bool o_starterror{false};
+// Path of the kosplitter.py script, as located by koStaticConfInit().
+static string o_cmdpath;
+// Locked for the whole duration of ko_to_words().
+// NOTE(review): unlike its neighbours this one is not declared static,
+// so it has external linkage -- confirm whether that is intended.
+std::mutex o_mutex;
+// Name of the konlpy backend, sent to the script with every request.
+static string o_taggername{"Okt"};
+
+// The Python/Java splitter is leaking memory. We restart it from time to time
+// restartcount accumulates the number of bytes sent to the splitter
+// since it was last started; it is compared to restartthreshold.
+static uint64_t restartcount;
+static uint64_t restartthreshold = 5 * 1000 * 1000;
+
+// Record the location of the Korean splitter script and the name of
+// the konlpy backend to use. The tagger name is only stored when it
+// is one of the supported values; otherwise an error is logged and the
+// currently stored name (initially "Okt") is left untouched.
+void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
+{
+    o_cmdpath = config->findFilter("kosplitter.py");
+
+    const bool supported =
+        tagger == "Okt" || tagger == "Mecab" || tagger == "Komoran";
+    if (!supported) {
+        LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
+               "], using Okt\n");
+        return;
+    }
+    o_taggername = tagger;
+}
+
+// Make sure that the Python subprocess is available, starting it if
+// needed. Returns true if o_talker is usable on return.
+//
+// - After a failed start, o_starterror stays set and we never retry.
+// - A running command is stopped and restarted once more than
+//   restartthreshold bytes have been sent to it (see restartcount).
+// - Nothing can be started if the script path is empty.
+static bool initCmd()
+{
+    if (o_starterror) {
+        // No use retrying
+        return false;
+    }
+    if (o_talker) {
+        if (restartcount <= restartthreshold) {
+            return true;
+        }
+        // Too much data went through this instance: drop it and fall
+        // through to start a fresh one.
+        delete o_talker;
+        o_talker = nullptr;
+        restartcount = 0;
+    }
+    if (o_cmdpath.empty()) {
+        return false;
+    }
+    // A plain new expression reports failure by throwing, it never
+    // yields a null pointer, so there is nothing to test here.
+    // NOTE(review): 300 is presumably a timeout value -- confirm
+    // against the CmdTalk constructor in cmdtalk.h.
+    o_talker = new CmdTalk(300);
+    if (!o_talker->startCmd(o_cmdpath)) {
+        delete o_talker;
+        o_talker = nullptr;
+        o_starterror = true;
+        return false;
+    }
+    return true;
+}
+
+// Process a stretch of Korean text through the external tagger.
+//
+// Characters are accumulated from the current iterator position until
+// a non-Hangul alphabetic character (or the end of input, or an
+// iterator error) is found. The accumulated text is sent to the
+// splitter command, which answers with tab-separated lists of words
+// ("text") and part-of-speech tags ("tags"). Words tagged as Noun,
+// Verb, Adjective or Adverb are emitted through takeword().
+//
+// @param itp iterator on the input text, positioned on the first
+//    character of the stretch. On return it is positioned on the
+//    character which ended the stretch (or at end of input).
+// @param cp on successful return, receives the last character value
+//    read from the iterator.
+// @return false if the command could not be started, if the exchange
+//    failed, if the answer had no "text" or "tags" field, or if
+//    takeword() returned false. True otherwise.
+bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
+{
+    std::unique_lock<std::mutex> mylock(o_mutex);
+    initCmd();
+    if (nullptr == o_talker) {
+        return false;
+    }
+
+    LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n");
+    Utf8Iter &it = *itp;
+    unsigned int c = 0;
+
+    unordered_map<string, string> args;
+
+    // "data" is the only element at this point, so begin() designates
+    // it. References to unordered_map elements stay valid across the
+    // later insert().
+    args.insert(pair<string,string>{"data", string()});
+    string& inputdata{args.begin()->second};
+
+    // We send the tagger name every time but it's only used the first
+    // one: can't change it after init. We could avoid sending it
+    // every time, but I don't think that the performance hit is
+    // significant
+    args.insert(pair<string,string>{"tagger", o_taggername});
+    
+    // Walk the Korean characters section and send the text to the
+    // analyser
+    string::size_type orgbytepos = it.getBpos();
+    for (; !it.eof() && !it.error(); it++) {
+        c = *it;
+        // isalpha() is only defined for values representable as
+        // unsigned char (or EOF): never hand it a bigger code point.
+        if (!isHANGUL(c) && c <= 0xFF && isalpha(c)) {
+            // Done with Korean stretch, process and go back to main routine
+            //std::cerr << "Broke on char " << (std::string)it << endl;
+            break;
+        } else {
+            it.appendchartostring(inputdata);
+        }
+    }
+    LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
+            " bytes " << inputdata << endl);
+    restartcount += inputdata.size();
+    unordered_map<string,string> result;
+    if (!o_talker->talk(args, result)) {
+        LOGERR("Python splitter for Korean failed for [" << inputdata << "]\n");
+        return false;
+    }
+
+    auto resit = result.find("text");
+    if (resit == result.end()) {
+        LOGERR("No text in Python splitter for Korean\n");
+        return false;
+    }        
+    string& outtext = resit->second;
+    vector<string> words;
+    stringToTokens(outtext, words, sepchars);
+
+    resit = result.find("tags");
+    if (resit == result.end()) {
+        LOGERR("No tags in Python splitter for Korean\n");
+        return false;
+    }        
+    string& outtags = resit->second;
+    vector<string> tags;
+    stringToTokens(outtags, tags, sepchars);
+
+    // The two lists come from an external program: do not assume that
+    // they have the same length. A word without a matching tag is
+    // handled as having an empty tag (so it is not emitted).
+    if (tags.size() != words.size()) {
+        LOGERR("TextSplit::ko_to_words: got " << words.size() <<
+               " words but " << tags.size() << " tags\n");
+    }
+    const string notag;
+
+    // This is the position in the whole text, not the local fragment,
+    // which is bytepos-orgbytepos
+    string::size_type bytepos(orgbytepos);
+    for (unsigned int i = 0; i < words.size(); i++) {
+        // The POS tagger strips characters from the input (e.g. multiple
+        // spaces, sometimes new lines, possibly other stuff). This
+        // means that we can't easily reconstruct the byte position
+        // from the concatenated terms. The output seems to be always
+        // shorter than the input, so we try to look ahead for the
+        // term. Can't be too sure that this works though, depending
+        // on exactly what transformation may have been applied from
+        // the original input to the term.
+        string word = words[i];
+        trimstring(word);
+        const string& tag = (i < tags.size()) ? tags[i] : notag;
+        string::size_type newpos = bytepos - orgbytepos;
+        newpos = inputdata.find(word, newpos);
+        if (newpos != string::npos) {
+            bytepos = orgbytepos + newpos;
+        }
+        LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
+                " FOUND POS " << newpos << endl);
+        if (tag == "Noun" || tag == "Verb" ||
+            tag == "Adjective" || tag == "Adverb") {
+            if (!takeword(
+                    word, m_wordpos++, bytepos, bytepos + words[i].size())) {
+                return false;
+            }
+        }
+        LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
+               " TAG " << tag << endl);
+        bytepos += words[i].size();
+    }
+
+#if DO_CHECK_THINGS
+    int sizediff = inputdata.size() - (bytepos - orgbytepos);
+    if (sizediff < 0)
+        sizediff = -sizediff;
+    if (sizediff > 1) {
+        LOGERR("ORIGINAL TEXT SIZE: " << inputdata.size() <<
+               " FINAL BYTE POS " << bytepos - orgbytepos <<
+               " TEXT [" << inputdata << "]\n");
+    }
+#endif
+    
+    // Reset state, saving term position, and return the found non-cjk
+    // Unicode character value. The current input byte offset is kept
+    // in the utf8Iter
+    int pos = m_wordpos;
+    clearsplitstate();
+    m_spanpos = m_wordpos = pos;
+    *cp = c;
+    return true;
+}
--- a/src/filters/cmdtalk.py
+++ b/src/filters/cmdtalk.py
@ -0,0 +1,236 @@
+#################################
+# Copyright (C) 2016 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+# Command communication module and utilities. See commands in cmdtalk.h
+#
+# All data is binary. This is important for Python3
+# All parameter names are converted to and processed as str/unicode
+
+from __future__ import print_function
+
+import sys
+import os
+import tempfile
+import shutil
+import getopt
+import traceback
+
+# True when running under Python 3 or later. The numeric major version
+# is tested: comparing the sys.version string would be a lexicographic
+# comparison, which gives the wrong answer for a two-digit major.
+PY3 = sys.version_info[0] >= 3
+
+# makebytes(data): return data as a byte string for writing to a
+# binary stream. Text (str in Python 3, unicode in Python 2) is
+# encoded as UTF-8, byte strings are returned unchanged.
+if PY3:
+    def makebytes(data):
+        if isinstance(data, bytes):
+            return data
+        else:
+            return data.encode("UTF-8")
+else:
+    def makebytes(data):
+        if isinstance(data, unicode):
+            return data.encode("UTF-8")
+        else:
+            return data
+
+
+############################################
+# CmdTalk implements the
+# communication protocol with the master process. It calls an external
+# method to use the args and produce return data.
+#
+# As implemented below, a message in either direction is a sequence of
+# items, each made of a "name: size" line followed by size bytes of
+# data, and the message is ended by an empty line.
+class CmdTalk:
+
+    # outfile/infile: streams used to talk to the master process.
+    # exitfunc: optional callable, called with the exit status just
+    # before the process exits.
+    def __init__(self, outfile=sys.stdout, infile=sys.stdin, exitfunc=None):
+        try:
+            self.myname = os.path.basename(sys.argv[0])
+        except:
+            self.myname = "???"
+
+        self.outfile = outfile
+        self.infile = infile
+        self.exitfunc = exitfunc
+        self.fields = {}
+        
+        # On Windows, switch both streams to binary mode
+        if sys.platform == "win32":
+            import msvcrt
+            msvcrt.setmode(self.outfile.fileno(), os.O_BINARY)
+            msvcrt.setmode(self.infile.fileno(), os.O_BINARY)
+        # Set debugfile to a file path here to have the log messages
+        # appended to that file instead of going to stderr.
+        self.debugfile = None
+        if self.debugfile:
+            self.errfout = open(self.debugfile, "a")
+        else:
+            self.errfout = sys.stderr
+        
+    # Print message to the log output, prefixed with our program name.
+    # If doexit is set, call the exit function (if any) then exit with
+    # exitvalue.
+    def log(self, s, doexit = 0, exitvalue = 1):
+        print("CMDTALK: %s: %s" % (self.myname, s), file=self.errfout)
+        if doexit:
+            if self.exitfunc:
+                self.exitfunc(exitvalue)
+            sys.exit(exitvalue)
+
+    # Write data to outfile: in a single call except on Windows, where
+    # it is written in 4 KiB pieces (see below).
+    def breakwrite(self, outfile, data):
+        if sys.platform != "win32":
+            outfile.write(data)
+        else:
+            # On windows, writing big chunks can fail with a "not enough space"
+            # error. Seems a combined windows/python bug, depending on versions.
+            # See https://bugs.python.org/issue11395
+            # In any case, just break it up
+            total = len(data)
+            bs = 4*1024
+            offset = 0
+            while total > 0:
+                if total < bs:
+                    tow = total
+                else:
+                    tow = bs
+                #self.log("Total %d Writing %d to stdout: %s" % (total,tow,data[offset:offset+tow]))
+                outfile.write(data[offset:offset+tow])
+                offset += tow
+                total -= tow
+                
+    # Read single parameter from process input: line with param name and size
+    # followed by data. The param name is returned as str/unicode. The data
+    # is returned as bytes with Python 2, and as str (decoded from UTF-8)
+    # with Python 3.
+    # Returns ('', b'') for an empty line (end of message). Exits the
+    # process with status 0 on end of input, and with status 1 on a
+    # malformed line or a short data read.
+    def readparam(self):
+        # With Python 3, read from the underlying binary stream
+        if PY3:
+            inf = self.infile.buffer
+        else:
+            inf = self.infile
+        s = inf.readline()
+        # readline() returning nothing at all means end of input
+        if s == b'':
+            if self.exitfunc:
+                self.exitfunc(0)
+            sys.exit(0)
+
+        s = s.rstrip(b'\n')
+
+        # Empty line: end of message
+        if s == b'':
+            return ('', b'')
+        l = s.split()
+        if len(l) != 2:
+            self.log(b'bad line: [' + s + b']', 1, 1)
+
+        paramname = l[0].decode('ASCII').rstrip(':')
+        paramsize = int(l[1])
+        if paramsize > 0:
+            paramdata = inf.read(paramsize)
+            if len(paramdata) != paramsize:
+                self.log("Bad read: wanted %d, got %d" %
+                      (paramsize, len(paramdata)), 1, 1)
+        else:
+            paramdata = b''
+        if PY3:
+            paramdata = paramdata.decode('utf-8')
+    
+        #self.log("paramname [%s] paramsize %d value [%s]" %
+        #          (paramname, paramsize, paramdata))
+        return (paramname, paramdata)
+
+    # Send one item: "name: size" line, then the data. The size is the
+    # byte length of the data after conversion by makebytes(). With
+    # Python 3 we write to the underlying binary stream.
+    if PY3:
+        def senditem(self, nm, data):
+            data = makebytes(data)
+            l = len(data)
+            self.outfile.buffer.write(makebytes("%s: %d\n" % (nm, l)))
+            self.breakwrite(self.outfile.buffer, data)
+    else:
+        def senditem(self, nm, data):
+            data = makebytes(data)
+            l = len(data)
+            self.outfile.write(makebytes("%s: %d\n" % (nm, l)))
+            self.breakwrite(self.outfile, data)
+        
+    # Send answer: one item for each entry in the outfields dict, then
+    # the empty line which ends the message, then flush the output.
+    def answer(self, outfields):
+        for nm,value in outfields.items():
+            #self.log("Senditem: [%s] -> [%s]" % (nm, value))
+            self.senditem(nm, value)
+            
+        # End of message
+        print(file=self.outfile)
+        self.outfile.flush()
+        #self.log("done writing data")
+
+    # Call processor with input params, send result
+    # If the processor raises, the answer is made of a "cmdtalkstatus"
+    # field set to "1" and a "cmdtalkerrstr" field holding the
+    # exception text.
+    def processmessage(self, processor, params):
+        # In normal usage we try to recover from processor errors, but
+        # we sometimes want to see the real stack trace when testing
+        safeexec = True
+        if safeexec:
+            try:
+                outfields = processor.process(params)
+            except Exception as err:
+                self.log("processmessage: processor raised: [%s]" % err)
+                traceback.print_exc()
+                outfields = {}
+                outfields["cmdtalkstatus"] = "1"
+                outfields["cmdtalkerrstr"] = str(err)
+        else:
+            outfields = processor.process(params)
+
+        self.answer(outfields)
+
+    # Loop on messages from our master
+    # This only ends through the process exit performed by readparam()
+    # or log().
+    def mainloop(self, processor):
+        while 1:
+            #self.log("waiting for command")
+
+            params = dict()
+
+            # Read at most 10 parameters (normally 1 or 2), stop at empty line
+            # End of message is signalled by empty paramname
+            for i in range(10):
+                paramname, paramdata = self.readparam()
+                if paramname == "":
+                    break
+                params[paramname] = paramdata
+
+            # Got message, act on it
+            self.processmessage(processor, params)
+
+
+# Shared entry point for scripts built on CmdTalk, usable for testing.
+# Without command line arguments, this runs the normal protocol loop
+# (proto.mainloop(processor)). With arguments, these are taken as
+# "name value" pairs making up a single message: the processor is
+# called once on them and the result fields are printed to stdout.
+# This means that a module can call cmdtalk.main(proto, processor)
+# instead of proto.mainloop(processor) and get command line testing
+# for free.
+def main(proto, processor):
+    if len(sys.argv) == 1:
+        proto.mainloop(processor)
+        # mainloop does not return. Just in case
+        sys.exit(1)
+
+    # One-shot debugging mode from here on.
+    cmdargs = sys.argv[1:]
+    if len(cmdargs) == 0 or len(cmdargs) % 2 != 0:
+        print("Usage: cmdtalk.py pname pvalue [pname pvalue...]",
+              file=sys.stderr)
+        sys.exit(1)
+
+    # Even positions are the parameter names, odd ones their values
+    params = dict(zip(cmdargs[0::2], cmdargs[1::2]))
+    res = processor.process(params)
+
+    # Results are written as bytes: use the binary stream with Python 3
+    if PY3:
+        ioout = sys.stdout.buffer
+    else:
+        ioout = sys.stdout
+
+    for nm, value in res.items():
+        bdata = makebytes(value)
+        # Field name marker line, then the value, then a newline
+        proto.breakwrite(ioout, makebytes("%s->" % nm + '\n'))
+        proto.breakwrite(ioout, bdata)
+        ioout.write(b'\n')
--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@ -0,0 +1,90 @@
+#!/usr/bin/python3
+#################################
+# Copyright (C) 2020 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+
+#
+# Interface to the konlpy Korean text analyser: we receive text from
+# our parent process and have it segmented by the analyser, then
+# return the results. The analyser startup is very expensive (several
+# seconds), which is why we can't just execute it from the main
+# process.
+#
+
+import sys
+import cmdtalk
+
+from konlpy.tag import Okt,Mecab,Komoran
+
+# Message processor for the Korean splitter: runs the konlpy tagger on
+# the received text and returns the words and their tags.
+class Processor(object):
+    def __init__(self, proto):
+        self.proto = proto
+        # At most one of these gets set, when the tagger is created.
+        # They tell which tag name set the tagger output uses.
+        self.tagsOkt = False
+        self.tagsMecab = False
+        self.tagsKomoran = False
+
+    # Create the tagger object for the given backend name. Raises an
+    # Exception if the name is not one we know about.
+    def _init_tagger(self, taggername):
+        if taggername == "Okt":
+            self.tagger = Okt()
+            self.tagsOkt = True
+        elif taggername == "Mecab":
+            self.tagger = Mecab()
+            self.tagsMecab = True
+        elif taggername == "Komoran":
+            self.tagger = Komoran()
+            self.tagsKomoran = True
+        else:
+            raise Exception("Bad tagger name " + taggername)
+        
+    # Process one message.
+    # params: dict with the text to analyse in 'data'. 'tagger' (the
+    #   backend name) is also needed as long as no tagger was created.
+    # Returns a dict with 'text' and 'tags' entries: the words and the
+    # corresponding tags, each value followed by a tab character. A
+    # dict with a single 'error' entry is returned if a needed
+    # parameter is missing.
+    def process(self, params):
+        if 'data' not in params:
+            return {'error':'No data field in parameters'}
+        if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
+            if 'tagger' not in params:
+                return {'error':'No "tagger" field in parameters'}
+            self._init_tagger(params['tagger'])
+                              
+        pos = self.tagger.pos(params['data'])
+        #proto.log("%s" % pos)
+        text = ""
+        tags = ""
+        for e in pos:
+            # The tab is our list separator: it can't stay inside a word
+            word = e[0].replace('\t', ' ')
+            text += word + "\t"
+            tag = e[1]
+            # The Okt tag names are used as is. The Mecab and Komoran
+            # ones are translated to the Okt names for the categories
+            # of interest. startswith() copes with an empty tag, which
+            # indexing the first character would not.
+            if self.tagsMecab or self.tagsKomoran:
+                if tag.startswith("N"):
+                    tag = "Noun"
+                elif tag.startswith("VV"):
+                    tag = "Verb"
+                elif tag.startswith("VA"):
+                    tag = "Adjective"
+                elif tag == "MAG":
+                    tag = "Adverb"
+            tags += tag + "\t"
+        return {'text': text, 'tags': tags}
+
+
+# Script body: create the protocol handler (on stdin/stdout by
+# default) and the processor, then hand control to cmdtalk.main(),
+# which runs the protocol loop when there are no command line
+# arguments, or else a single processor call on the arguments.
+proto = cmdtalk.CmdTalk()
+processor = Processor(proto)
+cmdtalk.main(proto, processor)
--- a/src/filters/rclhwp.py
+++ b/src/filters/rclhwp.py
@ -36,20 +36,6 @@ from hwp5.xmlmodel import Hwp5File as xml_Hwp5File
 from hwp5.utils import cached_property


-# This was duplicated from hwp5 hwp5text.py and I don't really
-# understand what it does...
-RESOURCE_PATH_XSL_TEXT = 'xsl/plaintext.xsl'
-class TextTransform(BaseTransform):
-    @property
-    def transform_hwp5_to_text(self):
-        transform_xhwp5 = self.transform_xhwp5_to_text
-        return self.make_transform_hwp5(transform_xhwp5)
-    @cached_property
-    def transform_xhwp5_to_text(self):
-        resource_path = RESOURCE_PATH_XSL_TEXT
-        return self.make_xsl_transform(resource_path)
-
-
 # Associate HTML meta names and hwp summaryinfo values
 def metafields(summaryinfo):
    yield(('Description', summaryinfo.subject + " " +
--- a/src/qtgui/guiutils.cpp
+++ b/src/qtgui/guiutils.cpp
@ -158,7 +158,15 @@ void rwSettings(bool writing)
               "/Recoll/prefs/reslist/collapseDuplicates", Bool, false);
    SETTING_RW(prefs.showResultsAsTable, 
               "/Recoll/prefs/showResultsAsTable", Bool, false);
-    SETTING_RW(prefs.maxhltextmbs, "/Recoll/prefs/preview/maxhltextmbs", Int, 3);
+
+    SETTING_RW(prefs.maxhltextkbs, "/Recoll/prefs/preview/maxhltextkbs", Int,
+               3000);
+    // Compat: if maxhltextkbs is not set but old maxhltextmbs is set use it
+    if (!writing && !settings.contains("/Recoll/prefs/preview/maxhltextkbs") &&
+        settings.contains("/Recoll/prefs/preview/maxhltextmbs")) {
+        prefs.maxhltextkbs = settings.value(
+            "/Recoll/prefs/preview/maxhltextmbs").toInt() * 1024;
+    }

    SETTING_RW(prefs.previewPlainPre, 
               "/Recoll/prefs/preview/plainPre", Int, PrefsPack::PP_PREWRAP);
--- a/src/qtgui/guiutils.h
+++ b/src/qtgui/guiutils.h
@ -20,6 +20,7 @@
 #include <string>
 #include <list>
 #include <vector>
+#include <set>

 #include <qstring.h>
 #include <qstringlist.h>
@ -46,7 +47,7 @@ class PrefsPack {
    int filterCtlStyle;
    int respagesize{8};
    int historysize{0};
-    int maxhltextmbs;
+    int maxhltextkbs;
    QString reslistfontfamily;
    // Not saved in prefs for now. Computed from qt defaults and used to
    // set main character color for webkit/textbrowser reslist and
@ -154,6 +155,11 @@ class PrefsPack {

    std::string stemlang();

+    // MIME types for which we prefer to use stored text from preview
+    // rather than extracting the possibly nicer HTML because the
+    // extractor is very slow. This is compiled in and there is no UI
+    // for now.
+    std::set<std::string> preferStoredTextMimes{"application/x-hwp"};
 };

 /** Global preferences record */
--- a/src/qtgui/i18n/recoll_ko.qm
+++ b/src/qtgui/i18n/recoll_ko.qm
--- a/src/qtgui/i18n/recoll_ko.ts
+++ b/src/qtgui/i18n/recoll_ko.ts
--- a/src/qtgui/preview_w.cpp
+++ b/src/qtgui/preview_w.cpp
@ -574,6 +574,90 @@ void Preview::emitWordSelect(QString word)
    emit(wordSelect(word));
 }

+// Display message dialog after load failed
+//
+// @param explain the cause of the failure, which selects the message.
+//    No message text is set for values not listed in the switch.
+// @param canGetRawText if set, a line announcing the attempt to
+//    display from the stored text is appended to the message.
+void Preview::displayLoadError(
+    FileInterner::ErrorPossibleCause explain, bool canGetRawText)
+{
+    // Note that we can't easily check for a readable file
+    // because it's possible that only a region is locked
+    // (e.g. on Windows for an ost file the first block is
+    // readable even if Outlook is running).
+    QString msg;
+    switch (explain) {
+    case FileInterner::FetchMissing:
+        msg = tr("Error loading the document: file missing.");
+        break;
+    case FileInterner::FetchPerm:
+        msg = tr("Error loading the document: no permission.");
+        break;
+    case FileInterner::FetchNoBackend:
+        msg =
+            tr("Error loading: backend not configured.");
+        break;
+    case FileInterner::InternfileOther:
+        // The Windows message adds a hint about file locking
+#ifdef _WIN32
+        msg = tr("Error loading the document: "
+                 "other handler error<br>"
+                 "Maybe the application is locking the file ?");
+#else
+        msg = tr("Error loading the document: other handler error.");
+#endif
+        break;
+    }
+    if (canGetRawText) {
+        msg += tr("<br>Attempting to display from stored text.");
+    }
+    QMessageBox::warning(0, "Recoll", msg);
+}
+
+// Start the load thread and wait for it to finish while running the
+// event loop, then report on failure.
+//
+// @param lthr the load thread, not yet started.
+// @param tT timer restarted with a 1000 mS delay before each run of
+//    the event loop.
+// @param loop the event loop to run while waiting.
+// @param progress progress dialog: shown at the end of the second
+//    wait round, and polled for a cancel request.
+// @param canGetRawText passed on to displayLoadError().
+// @return true if the load succeeded (thread status 0). False in all
+//    other cases. No dialog is shown if the global cancel flag is set.
+bool Preview::runLoadThread(LoadThread& lthr, QTimer& tT, QEventLoop& loop,
+                            QProgressDialog& progress, bool canGetRawText)
+{
+    lthr.start();
+    int rounds = 0;
+    while (true) {
+        tT.start(1000); 
+        loop.exec();
+        if (lthr.isFinished()) {
+            break;
+        }
+        if (progress.wasCanceled()) {
+            CancelCheck::instance().setCancel();
+        }
+        if (rounds == 1) {
+            progress.show();
+        }
+        ++rounds;
+    }
+
+    LOGDEB("loadDocInCurrentTab: after file load: cancel " <<
+           CancelCheck::instance().cancelState() << " status " << lthr.status <<
+           " text length " << lthr.fdoc.text.length() << "\n");
+
+    if (lthr.status == 0) {
+        return true;
+    }
+
+    // Failure cases from here on.
+    if (CancelCheck::instance().cancelState()) {
+        return false;
+    }
+
+    if (!lthr.missing.empty()) {
+        // The thread reported missing helper program(s)
+        QString helpers = QString::fromUtf8("<br>") +
+            tr("Missing helper program: ") +
+            QString::fromLocal8Bit(lthr.missing.c_str());
+        QMessageBox::warning(0, "Recoll",
+                             tr("Can't turn doc into internal "
+                                "representation for ") +
+                             lthr.fdoc.mimetype.c_str() + helpers);
+        return false;
+    }
+
+    if (progress.wasCanceled()) {
+        QMessageBox::warning(0, "Recoll", tr("Canceled"));
+        return false;
+    }
+
+    progress.reset();
+    displayLoadError(lthr.explain, canGetRawText);
+    return false;
+}
+
 /*
  Code for loading a file into an editor window. The operations that
  we call have no provision to indicate progression, and it would be
@ -627,93 +711,42 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
    connect(&tT, SIGNAL(timeout()), &loop, SLOT(quit()));

    ////////////////////////////////////////////////////////////////////////
-    // Load and convert document
-    // idoc came out of the index data (main text and some fields missing). 
-    // fdoc is the complete one what we are going to extract from storage.
+    // Load and convert document 
+    //  - idoc came out of the index data (main text and some fields missing).
+    //  - fdoc is the complete one that we are going to extract from storage.
+    // 
+    // If the preference to use the stored text is set, we still
+    // create the LoadThread object for convenience (using its fdoc
+    // field), but don't start it.
+
    LoadThread lthr(theconfig, idoc, prefs.previewHtml, this);
    connect(&lthr, SIGNAL(finished()), &loop, SLOT(quit()));

-    lthr.start();
-    for (int i = 0;;i++) {
-        tT.start(1000); 
-        loop.exec();
-        if (lthr.isFinished())
-            break;
-        if (progress.wasCanceled()) {
-            CancelCheck::instance().setCancel();
+    bool canGetRawText = rcldb && rcldb->storesDocText();
+    auto it = prefs.preferStoredTextMimes.find(idoc.mimetype);
+    bool preferStoredText = (it != prefs.preferStoredTextMimes.end());
+    bool loadok{false};
+
+    if (!preferStoredText || !canGetRawText) {
+        // Try load from actual document
+        loadok = runLoadThread(lthr, tT, loop, progress, canGetRawText);
+    }
+    
+    if (!loadok && canGetRawText) {
+        // Preferring/able to use stored text or extern load failed
+        lthr.fdoc = idoc;
+        loadok = rcldb->getDocRawText(lthr.fdoc);
+        if (!loadok) {
+            QMessageBox::warning(0,"Recoll",tr("Could not fetch stored text"));
        }
-        if (i == 1)
-            progress.show();
    }

-    LOGDEB("loadDocInCurrentTab: after file load: cancel " <<
-           CancelCheck::instance().cancelState() << " status " << lthr.status <<
-           " text length " << lthr.fdoc.text.length() << "\n");
-
-    if (CancelCheck::instance().cancelState())
+    if (!loadok) {
+        // Everything failed.
+        progress.close();
        return false;
-    if (lthr.status != 0) {
-        bool canGetRawText = rcldb && rcldb->storesDocText();
-        QString explain;
-        if (!lthr.missing.empty()) {
-            explain = QString::fromUtf8("<br>") +
-                tr("Missing helper program: ") +
-                QString::fromLocal8Bit(lthr.missing.c_str());
-            QMessageBox::warning(0, "Recoll",
-                                 tr("Can't turn doc into internal "
-                                    "representation for ") +
-                                 lthr.fdoc.mimetype.c_str() + explain);
-        } else {
-            if (progress.wasCanceled()) {
-                QMessageBox::warning(0, "Recoll", tr("Canceled"));
-            } else {
-                progress.reset();
-                // Note that we can't easily check for a readable file
-                // because it's possible that only a region is locked
-                // (e.g. on Windows for an ost file the first block is
-                // readable even if Outlook is running).
-                QString msg;
-                switch (lthr.explain) {
-                case FileInterner::FetchMissing:
-                    msg = tr("Error loading the document: file missing.");
-                    break;
-                case FileInterner::FetchPerm:
-                    msg = tr("Error loading the document: no permission.");
-                    break;
-                case FileInterner::FetchNoBackend:
-                    msg =
-                        tr("Error loading: backend not configured.");
-                    break;
-                case FileInterner::InternfileOther:
-#ifdef _WIN32
-                    msg = tr("Error loading the document: "
-                             "other handler error<br>"
-                             "Maybe the application is locking the file ?");
-#else
-                    msg = tr("Error loading the document: other handler error.");
-#endif
-                    break;
-                }
-                if (canGetRawText) {
-                    msg += tr("<br>Attempting to display from stored text.");
-                }
-                QMessageBox::warning(0, "Recoll", msg);
-            }
-        }
-
-
-        if (canGetRawText) {
-            lthr.fdoc = idoc;
-            if (!rcldb->getDocRawText(lthr.fdoc)) {
-                QMessageBox::warning(0, "Recoll",
-                                     tr("Could not fetch stored text"));
-                progress.close();
-                return false;
-            }
-        } else {
-            progress.close();
-        }
    }
+    
    // Reset config just in case.
    theconfig->setKeyDir("");

@ -722,8 +755,8 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
    // We don't do the highlighting for very big texts: too long. We
    // should at least do special char escaping, in case a '&' or '<'
    // somehow slipped through previous processing.
-    bool highlightTerms = lthr.fdoc.text.length() < 
-        (unsigned long)prefs.maxhltextmbs * 1024 * 1024;
+    bool highlightTerms = int(lthr.fdoc.text.length()) < 
+        prefs.maxhltextkbs * 1024;

    // Final text is produced in chunks so that we can display the top
    // while still inserting at bottom
@ -752,7 +785,6 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
    QStringList qrichlst;
    editor->m_plaintorich->set_activatelinks(prefs.previewActiveLinks);
    
-#if 1
    if (highlightTerms) {
        progress.setLabelText(tr("Creating preview text"));
        qApp->processEvents();
@ -815,17 +847,6 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
            }
        }
    }
-#else // For testing qtextedit bugs...
-    highlightTerms = true;
-    const char *textlist[] =
-        {
-            "Du plain text avec un\n <termtag>termtag</termtag> fin de ligne:",
-            "texte apres le tag\n",
-        };
-    const int listl = sizeof(textlist) / sizeof(char*);
-    for (int i = 0 ; i < listl ; i++)
-        qrichlst.push_back(QString::fromUtf8(textlist[i]));
-#endif


    ///////////////////////////////////////////////////////////
--- a/src/qtgui/preview_w.h
+++ b/src/qtgui/preview_w.h
@ -44,9 +44,11 @@
 #include "rcldb.h"
 #include "plaintorich.h"
 #include "rclmain_w.h"
+#include "internfile.h"

 #include "ui_preview.h"

+
 class QTabWidget;
 class QLabel;
 class QPushButton;
@ -55,6 +57,10 @@ class Preview;
 class PlainToRichQtPreview;
 class QUrl;
 class RclMain;
+class LoadThread;
+class QTimer;
+class QEventLoop;
+class QProgressDialog;

 class PreviewTextEdit : public PREVIEW_PARENTCLASS {
    Q_OBJECT;
@ -185,6 +191,10 @@ private:
    virtual PreviewTextEdit *currentEditor();
    virtual PreviewTextEdit *addEditorTab();
    virtual bool loadDocInCurrentTab(const Rcl::Doc& idoc, int dnm);
+    void displayLoadError(
+        FileInterner::ErrorPossibleCause explain, bool canGetRawText);
+    bool runLoadThread(LoadThread& lthr, QTimer& tT, QEventLoop& loop,
+                       QProgressDialog& progress, bool canGetRawText);
 };

 #endif /* _PREVIEW_W_H_INCLUDED_ */
--- a/src/qtgui/recoll.pro.in
+++ b/src/qtgui/recoll.pro.in
@ -168,7 +168,7 @@ i18n/recoll_zh_CN.ts \
 i18n/recoll_fr.ts \
 i18n/recoll_xx.ts \
 i18n/recoll_cs.ts \
-i18n/recoll_kr.ts \
+i18n/recoll_ko.ts \
 i18n/recoll_el.ts \
 i18n/recoll_tr.ts

--- a/src/qtgui/uiprefs.ui
+++ b/src/qtgui/uiprefs.ui
--- a/src/qtgui/uiprefs_w.cpp
+++ b/src/qtgui/uiprefs_w.cpp
@ -112,7 +112,7 @@ void UIPrefsDialog::setFromPrefs()
    pageLenSB->setValue(prefs.respagesize);
    maxHistSizeSB->setValue(prefs.historysize);
    collapseDupsCB->setChecked(prefs.collapseDuplicates);
-    maxHLTSB->setValue(prefs.maxhltextmbs);
+    maxHLTSB->setValue(prefs.maxhltextkbs);

    if (prefs.ssearchTypSav) {
        ssearchTypCMB->setCurrentIndex(4);
@ -304,7 +304,7 @@ void UIPrefsDialog::accept()
    prefs.respagesize = pageLenSB->value();
    prefs.historysize = maxHistSizeSB->value();
    prefs.collapseDuplicates = collapseDupsCB->isChecked();
-    prefs.maxhltextmbs = maxHLTSB->value();
+    prefs.maxhltextkbs = maxHLTSB->value();

    prefs.qtermstyle = qtermStyleLE->text();
    prefs.abssep = abssepLE->text();
--- a/src/utils/cmdtalk.cpp
+++ b/src/utils/cmdtalk.cpp
@ -0,0 +1,243 @@
+/* Copyright (C) 2016 J.F.Dockes 
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published by
+ *   the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "cmdtalk.h"
+
+#include <stdio.h>
+
+#include <iostream>
+#include <sstream>
+#include <mutex>
+
+#include "smallut.h"
+#include "execmd.h"
+#ifdef MDU_INCLUDE_LOG
+#include MDU_INCLUDE_LOG
+#else
+#include "log.h"
+#endif
+
+using namespace std;
+
+// Thrown by Canceler::newData() once the allotted time has elapsed.
+class TimeoutExcept {};
+
+// Timeout helper attached to the ExecCmd object. Once armed by reset(),
+// newData() throws TimeoutExcept if it is called more than m_timeosecs
+// seconds later.
+class Canceler : public ExecCmdAdvise {
+public:
+    // @param tmsecs timeout value, stored and compared as seconds.
+    Canceler(int tmsecs)
+        : m_timeosecs(tmsecs) {}
+
+    // Notification callback (presumably the ExecCmdAdvise hook called
+    // by ExecCmd while it waits for data -- confirm in execmd.h). The
+    // count argument is not used. Does nothing until reset() has been
+    // called.
+    virtual void newData(int) {
+        if (m_starttime == 0) {
+            return;
+        }
+        const time_t elapsed = time(0) - m_starttime;
+        if (elapsed > m_timeosecs) {
+            throw TimeoutExcept();
+        }
+    }
+
+    // (Re)start the clock: the timeout is counted from now.
+    void reset() {
+        m_starttime = time(0);
+    }
+
+    // Maximum delay, in seconds.
+    int m_timeosecs;
+    // Time of the last reset(), 0 while not armed.
+    time_t m_starttime{0};
+};
+
+// Private implementation of CmdTalk (pimpl): holds the ExecCmd object
+// for the subprocess, the timeout helper and the exchange lock.
+class CmdTalk::Internal {
+public:
+    // @param timeosecs timeout in seconds, handed to the Canceler.
+    Internal(int timeosecs)
+        : m_cancel(timeosecs) {
+    }
+
+    ~Internal() {
+        delete cmd;
+    }
+
+    // Send one request message and read the complete answer into rep.
+    bool talk(const pair<string, string>& arg0,
+              const unordered_map<string, string>& args,
+              unordered_map<string, string>& rep);
+
+    // Read one "name: len\ndata" element, or the empty end-of-message
+    // line, from the command output.
+    bool readDataElement(string& name, string &data);
+
+    // Subprocess object, owned (deleted in the destructor). Null until
+    // CmdTalk::startCmd() creates it.
+    ExecCmd *cmd{nullptr};
+    // Timeout helper, registered with cmd by CmdTalk::startCmd().
+    Canceler m_cancel;
+    // Held by talk() for the whole duration of an exchange.
+    std::mutex mmutex;
+};
+
+// Create the implementation object. No command object exists until
+// startCmd() is called.
+// @param timeosecs timeout in seconds, passed on to the implementation.
+CmdTalk::CmdTalk(int timeosecs)
+    : m(new Internal(timeosecs))
+{
+}
+
+// Release the implementation object (its destructor deletes the
+// command object, if any).
+CmdTalk::~CmdTalk()
+{
+    delete m;
+}
+
+// Start (or restart) the command. Any previous command object is
+// deleted and replaced with a fresh one.
+//
+// @param cmdname command name or path.
+// @param args command line arguments for the command.
+// @param env entries passed one by one to ExecCmd::putenv().
+// @param path if not empty, the directories are joined with ':' and
+//   given to ExecCmd::which() to look the command up. The result of
+//   the lookup call is not checked here.
+// @return true if ExecCmd::startExec() did not return a negative value.
+bool CmdTalk::startCmd(const string& cmdname,
+                       const vector<string>& args,
+                       const vector<string>& env,
+                       const vector<string>& path)
+{
+    LOGDEB("CmdTalk::startCmd\n");
+
+    // Begin with a new command object, wired to our timeout helper.
+    delete m->cmd;
+    m->cmd = new ExecCmd;
+    m->cmd->setAdvise(&m->m_cancel);
+
+    for (const auto& envvar : env) {
+        m->cmd->putenv(envvar);
+    }
+
+    string acmdname(cmdname);
+    if (!path.empty()) {
+        // Colon-separated search path, no trailing separator.
+        string colonpath;
+        for (auto dirp = path.begin(); dirp != path.end(); ++dirp) {
+            if (dirp != path.begin()) {
+                colonpath += ":";
+            }
+            colonpath += *dirp;
+        }
+        LOGDEB("CmdTalk::startCmd: PATH: [" << colonpath << "]\n");
+        ExecCmd::which(cmdname, acmdname, colonpath.c_str());
+    }
+
+    return m->cmd->startExec(acmdname, args, 1, 1) >= 0;
+}
+
+// Messages are made of data elements. Each element is like:
+// name: len\ndata
+// An empty line signals the end of the message, so the whole thing
+// would look like:
+// Name1: Len1\nData1Name2: Len2\nData2\n
+//
+// Read one element, or the end-of-message line, from the command.
+//
+// @param[out] name first token of the header line, as read (talk()
+//   trims the ':' afterwards). Empty if the end-of-message line was
+//   read, or on error.
+// @param[out] data element value. Empty for a zero length, for the
+//   end-of-message line, or on a header error.
+// @return false on read error, timeout while waiting for the header
+//   line, malformed header, negative length, or short data read.
+//
+// NOTE(review): only the header read is inside the try block. Confirm
+// that ExecCmd::receive() can never trigger the Canceler.
+bool CmdTalk::Internal::readDataElement(string& name, string &data)
+{
+    // Always hand back clean outputs: the caller detects the end of the
+    // message by testing for an empty name, so a stale value must never
+    // survive this call.
+    name.erase();
+    data.erase();
+
+    string ibuf;
+    m_cancel.reset();
+    try {
+        // Read name and length
+        if (cmd->getline(ibuf) <= 0) {
+            LOGERR("CmdTalk: getline error\n");
+            return false;
+        }
+    } catch (const TimeoutExcept&) {
+        LOGINF("CmdTalk:readDataElement: fatal timeout (" <<
+               m_cancel.m_timeosecs << " S)\n");
+        return false;
+    }
+
+    LOGDEB1("CmdTalk:rde: line [" << ibuf << "]\n");
+
+    // Empty line (end of message) ?
+    if (ibuf == "\n") {
+        LOGDEB1("CmdTalk: Got empty line\n");
+        return true;
+    }
+
+    // We're expecting something like Name: len\n
+    vector<string> tokens;
+    stringToTokens(ibuf, tokens);
+    if (tokens.size() != 2) {
+        LOGERR("CmdTalk: bad line in filter output: [" << ibuf << "]\n");
+        return false;
+    }
+    // A negative count makes no sense: treat it as a protocol error
+    // instead of silently returning an empty value.
+    int len = 0;
+    if (sscanf(tokens[1].c_str(), "%d", &len) != 1 || len < 0) {
+        LOGERR("CmdTalk: bad line in filter output: [" << ibuf << "]\n");
+        return false;
+    }
+    name = tokens[0];
+
+    // Read element data
+    if (len > 0 && cmd->receive(data, len) != len) {
+        LOGERR("CmdTalk: expected " << len << " bytes of data, got " <<
+               data.length() << "\n");
+        return false;
+    }
+    LOGDEB1("CmdTalk:rde: got: name [" << name << "] len " << len <<
+            " value [" <<
+            (data.size() > 100 ? (data.substr(0, 100) + " ...") : data) <<
+            "]\n");
+    return true;
+}
+
+// Perform one exchange: send a request message, then read the answer.
+// The whole exchange runs with mmutex held.
+//
+// @param arg0 element written first in the request. Skipped if its
+//   name (first) is empty.
+// @param args other request elements, written in the map's iteration
+//   order.
+// @param rep receives the answer elements, keyed by name with the ':'
+//   trimmed. It is not cleared first, so existing entries are kept
+//   unless overwritten.
+// @return false if there is no running command, on send or read
+//   failure (zapChild() is then called on the command), or if rep
+//   holds a "cmdtalkstatus" entry after the answer was read.
+bool CmdTalk::Internal::talk(const pair<string, string>& arg0,
+                             const unordered_map<string, string>& args,
+                             unordered_map<string, string>& rep)
+{
+    std::unique_lock<std::mutex> lock(mmutex);
+    // cmd stays null until startCmd() has created it: test the pointer
+    // before asking for the child pid.
+    if (!cmd || cmd->getChildPid() <= 0) {
+        LOGERR("CmdTalk::talk: no process\n");
+        return false;
+    }
+
+    // Build the request: counted elements, then the empty line which
+    // ends the message.
+    ostringstream obuf;
+    if (!arg0.first.empty()) {
+        obuf << arg0.first << ": " << arg0.second.size() << "\n" << arg0.second;
+    }
+    for (const auto& it : args) {
+        obuf << it.first << ": " << it.second.size() << "\n" << it.second;
+    }
+    obuf << "\n";
+
+    if (cmd->send(obuf.str()) < 0) {
+        cmd->zapChild();
+        LOGERR("CmdTalk: send error\n");
+        return false;
+    }
+
+    // Read answer (multiple elements)
+    LOGDEB1("CmdTalk: reading answer\n");
+    for (;;) {
+        string name, data;
+        if (!readDataElement(name, data)) {
+            cmd->zapChild();
+            return false;
+        }
+        // An empty name means that the end-of-message line was read.
+        if (name.empty()) {
+            break;
+        }
+        trimstring(name, ":");
+        LOGDEB1("CmdTalk: got [" << name << "] -> [" << data << "]\n");
+        rep[name] = data;
+    }
+
+    // A status element in the answer reports a failure.
+    return rep.find("cmdtalkstatus") == rep.end();
+}
+
+// @return true if a command object exists and reports a positive
+//   child process id.
+bool CmdTalk::running()
+{
+    if (!m || !m->cmd) {
+        return false;
+    }
+    return m->cmd->getChildPid() > 0;
+}
+
+// Single exchange: send args as a request message and read the answer
+// into rep. See Internal::talk() for the return value.
+bool CmdTalk::talk(const unordered_map<string, string>& args,
+                   unordered_map<string, string>& rep)
+{
+    // Empty name: no leading element is written in the request.
+    const pair<string, string> noarg0;
+    return m->talk(noarg0, args, rep);
+}
+
+// Same as talk(), with a leading "cmdtalk:proc" element whose value is
+// the name of the procedure to call.
+bool CmdTalk::callproc(
+    const string& proc,
+    const unordered_map<std::string, std::string>& args,
+    unordered_map<std::string, std::string>& rep)
+{
+    const pair<string, string> procsel("cmdtalk:proc", proc);
+    return m->talk(procsel, args, rep);
+}
+
+    
--- a/src/utils/cmdtalk.h
+++ b/src/utils/cmdtalk.h
@ -0,0 +1,109 @@
+/* Copyright (C) 2016 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published by
+ *   the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _CMDTALK_H_INCLUDED_
+#define _CMDTALK_H_INCLUDED_
+
+/** 
+ * Execute a command and exchange messages with it.
+ *
+ * A simple stream protocol is used for the dialog. HTTP or some kind
+ * of full-blown RPC could have been used, but there was also good
+ * reason to keep it simple (yet powerful), given the limited context
+ * of dialog through a pipe.
+ *
+ * The data is exchanged in TLV fashion, in a way that should be
+ * usable in most script languages. The basic unit of data has one line 
+ * with a data type and a count (both ASCII), followed by the data. A
+ * 'message' is made of one or several units or tags and ends with one empty
+ * line. 
+ * 
+ * Example (the message begins before 'Filename' and has 'Filename' and
+ * 'Ipath' tags):
+ * 
+Filename: 24
+/my/home/mail/somefolderIpath: 2
+22
+
+<Message ends here: because of the empty line after '22'
+
+ * 
+ * Example answer, with 'Mimetype' and 'Data' tags
+ * 
+Mimetype: 10
+text/plainData: 10
+0123456789
+
+<Message ends here because of empty line
+
+ *        
+ * This format is both extensible and reasonably easy to parse. 
+ * While it's more fitted for python or perl on the script side, it
+ * should even be sort of usable from the shell (e.g.: use dd to read
+ * the counted data). Most alternatives would need data encoding in
+ * some cases.
+ *
+ * Higher level dialog:
+ * The C++ program is the master and sends request messages to the script. 
+ * Both sides of the communication should be prepared to receive and discard 
+ * unknown tags.
+ */
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+class CmdTalk {
+public:
+    // @param timeosecs timeout, in seconds.
+    CmdTalk(int timeosecs);
+    virtual ~CmdTalk();
+
+    // Not copyable.
+    CmdTalk(const CmdTalk&) = delete;
+    CmdTalk& operator=(const CmdTalk&) = delete;
+
+    // Start the command.
+    // @param cmdname command name or path.
+    // @param args command line arguments.
+    // @param env each entry should be of the form name=value. They
+    //   augment the subprocess environment.
+    // @param path replaces the PATH variable when looking for the command.
+    //
+    // Note that cmdtalk.py:main() method is a test routine which
+    // expects data pairs on the command line. If actual parameters
+    // need to be passed, it can't be used by the processor.
+    virtual bool startCmd(
+        const std::string& cmdname,
+        const std::vector<std::string>& args = {},
+        const std::vector<std::string>& env = {},
+        const std::vector<std::string>& path = {});
+
+    // @return true if a command process currently exists.
+    virtual bool running();
+
+    // Single exchange: send and receive data.
+    virtual bool talk(
+        const std::unordered_map<std::string, std::string>& args,
+        std::unordered_map<std::string, std::string>& rep);
+
+    // Specialized version with special argument used by dispatcher to call
+    // designated method
+    virtual bool callproc(
+        const std::string& proc,
+        const std::unordered_map<std::string, std::string>& args,
+        std::unordered_map<std::string, std::string>& rep);
+
+private:
+    class Internal;
+    // Implementation object, created by the constructor.
+    Internal *m{nullptr};
+};
+
+#endif /* _CMDTALK_H_INCLUDED_ */