korean textsplit with extern help from konlpy, first step

2020-03-22 10:09:50 +01:00 · 2020-03-22 10:09:50 +01:00 · 384e3a1087
commit 384e3a1087
parent d83bb8cf69
8 changed files with 802 additions and 9 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -82,6 +82,7 @@ common/rclinit.h \
 common/syngroups.cpp \
 common/syngroups.h \
 common/textsplit.cpp \
+common/textsplitko.cpp \
 common/textsplit.h \
 common/unacpp.cpp \
 common/unacpp.h \
@ -210,6 +211,8 @@ utils/circache.cpp \
 utils/circache.h \
 utils/closefrom.cpp \
 utils/closefrom.h \
+utils/cmdtalk.cpp \
+utils/cmdtalk.h \
 utils/conftree.cpp \
 utils/conftree.h \
 utils/copyfile.cpp \
@ -645,8 +648,10 @@ filterdir = $(pkgdatadir)/filters
 dist_filter_DATA = \
 desktop/hotrecoll.py \
 filters/abiword.xsl \
+filters/cmdtalk.py \
 filters/fb2.xsl \
 filters/gnumeric.xsl \
+filters/kosplitter.py \
 filters/msodump.zip \
 filters/okular-note.xsl \
 filters/opendoc-body.xsl \
@ -724,7 +729,7 @@ python/recoll/recoll/rclconfig.py
 install-data-hook: 
 	(cd $(DESTDIR)/$(filterdir); \
 	chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
-	chmod a+x recoll-we-move-files.py ../examples/rclmon.sh; \
+	chmod a+x recoll-we-move-files.py ../examples/rclmon.sh kosplitter.py; \
 	chmod 0644 msodump.zip recollepub.zip rclexecm.py rcllatinstops.zip \
        rclconfig.py conftree.py rclmidi.py rclexec1.py rcluncomp.py rclxslt.py)

--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -246,6 +246,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 #define UNICODE_IS_KATAKANA(p) false
 #endif

+#define HANGUL_AS_WORDS
 #ifdef HANGUL_AS_WORDS
 #define UNICODE_IS_HANGUL(p) (                 \
        ((p) >= 0x1100 && (p) <= 0x11FF) ||    \
@ -323,7 +324,8 @@ void TextSplit::staticConfInit(RclConfig *config)
            charclasses[int('\\')] = SPACE;
        }
    }
-}    
+    koStaticConfInit(config);
+}

 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
@ -632,20 +634,28 @@ bool TextSplit::text_to_words(const string &in)
            csc = CSC_OTHER;
        }

-        if (o_processCJK && csc == CSC_CJK) {
-            // CJK character hit. 
+        if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) {
+            // CJK character hit. Hangul processing may be special or
+            // not depending on how we were built.
+
            // Do like at EOF with the current non-cjk data.
            if (m_wordLen || m_span.length()) {
                if (!doemit(true, it.getBpos()))
                    return false;
            }

-            // Hand off situation to the cjk routine.
-            if (!cjk_to_words(&it, &c)) {
-                LOGERR("Textsplit: scan error in cjk handler\n");
-                return false;
+            // Hand off situation to the appropriate routine.
+            if (csc == CSC_HANGUL) {
+                if (!ko_to_words(&it, &c)) {
+                    LOGERR("Textsplit: scan error in korean handler\n");
+                    return false;
+                }
+            } else {
+                if (!cjk_to_words(&it, &c)) {
+                    LOGERR("Textsplit: scan error in cjk handler\n");
+                    return false;
+                }
            }
-
            // Check for eof, else c contains the first non-cjk
            // character after the cjk sequence, just go on.
            if (it.eof())
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -54,6 +54,7 @@ public:
    /** Call at program initialization to read non default values from the 
        configuration */
    static void staticConfInit(RclConfig *config);
+    static void koStaticConfInit(RclConfig *config);
    
    /** Split text, emit words and positions. */
    virtual bool text_to_words(const std::string &in);
@ -199,6 +200,9 @@ private:
    // This processes cjk text:
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);

+    // Experimental Korean splitter. This uses an external Python tokenizer
+    bool ko_to_words(Utf8Iter *it, unsigned int *cp);
+    
    bool emitterm(bool isspan, std::string &term, int pos, size_t bs,size_t be);
    bool doemit(bool spanerase, size_t bp);
    void discardspan();
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@ -0,0 +1,140 @@
+/* Copyright (C) 2020 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "autoconfig.h"
+
+#include <iostream>
+#include <string>
+#include <cstring>
+#include <unordered_set>
+#include <mutex>
+
+#include "textsplit.h"
+#include "log.h"
+//#define UTF8ITER_CHECK
+#include "utf8iter.h"
+#include "smallut.h"
+#include "rclconfig.h"
+#include "cmdtalk.h"
+
+using namespace std;
+
+// State for the external Korean splitter, shared by everything in this file.
+// Handle on the helper process: stays null until initCmd() succeeds.
+static CmdTalk *o_talker;
+// Set when starting the helper failed, so that initCmd() does not retry.
+static bool o_starterror{false};
+// Path to the helper script, set by TextSplit::koStaticConfInit().
+static string o_cmdpath;
+// Held by TextSplit::ko_to_words() for its whole duration.
+// NOTE(review): unlike its neighbours this has external linkage and a very
+// generic name -- confirm nothing else references it, then consider 'static'.
+std::mutex o_mutex;
+
+// Record the location of the Korean splitter script. Only the path lookup
+// is performed here, the helper process is started by initCmd() when
+// ko_to_words() first needs it.
+void TextSplit::koStaticConfInit(RclConfig *config)
+{
+    const string splitterpath = config->findFilter("kosplitter.py");
+    o_cmdpath = splitterpath;
+}
+
+// Start the Korean splitter helper process if this was not done yet.
+//
+// Returns true if o_talker is set. A failed start is remembered in
+// o_starterror so that it is not retried for each Korean text stretch.
+// Called by ko_to_words() with o_mutex held.
+static bool initCmd()
+{
+    if (o_starterror) {
+        // No use retrying
+        return false;
+    }
+    if (o_talker) {
+        return true;
+    }
+    if (o_cmdpath.empty()) {
+        // No script path was recorded: nothing to start.
+        return false;
+    }
+    // Plain new throws on failure and never returns null, so the pointer
+    // needs no test. The global is only set once the command is started,
+    // so that it never designates an unusable object.
+    CmdTalk *talker = new CmdTalk(300);
+    if (!talker->startCmd(o_cmdpath)) {
+        LOGERR("initCmd: could not start Korean splitter [" << o_cmdpath <<
+               "]\n");
+        delete talker;
+        o_starterror = true;
+        return false;
+    }
+    o_talker = talker;
+    return true;
+}
+
+// Process a stretch of Korean text with the external splitter.
+//
+// Starting at the current iterator position, gather all Hangul characters
+// and the ASCII white space and punctuation found among them, send the
+// text to the helper process, and hand each returned word to takeword().
+//
+// @param itp input iterator. On return it is positioned on the first
+//    character which is not part of the Korean stretch, or at eof.
+// @param cp on success, set to the last character value read from the
+//    iterator: the one which ended the stretch, except at eof.
+// @return false if the helper could not be started or queried, if its
+//    answer has no 'data' element, or if takeword() returned false.
+bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
+{
+    // The helper process is shared and the dialog is a strict
+    // request/answer exchange: serialize the whole operation.
+    std::unique_lock<std::mutex> mylock(o_mutex);
+    if (nullptr == o_talker) {
+        if (!initCmd()) {
+            return false;
+        }
+    }
+    LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n");
+    Utf8Iter &it = *itp;
+    unsigned int c = 0;
+    unordered_map<string, string> args;
+    args.insert(pair<string,string>{"data", string()});
+    // Build the text directly inside the request map to avoid a copy
+    string& inputdata{args.begin()->second};
+
+    // Gather all Korean characters and send the text to the analyser
+    for (; !it.eof(); it++) {
+        c = *it;
+        if (!isHANGUL(c) && !(isascii(c) && (isspace(c) || ispunct(c)))) {
+            // Done with Korean stretch, process and go back to main routine
+            break;
+        }
+        it.appendchartostring(inputdata);
+    }
+    LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
+            " bytes " << inputdata << endl);
+    unordered_map<string,string> result;
+    if (!o_talker->talk(args, result)) {
+        LOGERR("Python splitter for Korean failed\n");
+        return false;
+    }
+    auto resit = result.find("data");
+    if (resit == result.end()) {
+        LOGERR("No data in Python splitter for Korean\n");
+        return false;
+    }
+
+    // The answer is a list of words, each followed by the separator
+    // character. Empty fields are skipped: this includes the one which
+    // follows the final separator, and the case of an empty answer. They
+    // must neither be emitted nor use up a term position.
+    const string& outdata = resit->second;
+    const char sepchar = '^';
+    string::size_type wordstart = 0;
+    while (wordstart < outdata.size()) {
+        string::size_type wordend = outdata.find(sepchar, wordstart);
+        if (wordend == string::npos) {
+            // No final separator: the last word extends to the end
+            wordend = outdata.size();
+        }
+        if (wordend > wordstart) {
+            string word = outdata.substr(wordstart, wordend - wordstart);
+            // NOTE(review): the last two arguments are presumably the byte
+            // start/end offsets, which are not known here -- confirm that
+            // 0 is acceptable to the clients.
+            if (!takeword(word, m_wordpos++, 0, 0)) {
+                return false;
+            }
+        }
+        wordstart = wordend + 1;
+    }
+
+    // Reset state, saving term position, and return the found non-Korean
+    // Unicode character value. The current input byte offset is kept
+    // in the utf8Iter
+    int pos = m_wordpos;
+    clearsplitstate();
+    m_spanpos = m_wordpos = pos;
+    *cp = c;
+    return true;
+}
--- a/src/filters/cmdtalk.py
+++ b/src/filters/cmdtalk.py
@ -0,0 +1,234 @@
+#################################
+# Copyright (C) 2016 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+# Command communication module and utilities. See commands in cmdtalk.h
+#
+# All data is binary. This is important for Python3
+# All parameter names are converted to and processed as str/unicode
+
+from __future__ import print_function
+
+import sys
+import os
+import tempfile
+import shutil
+import getopt
+import traceback
+
+# True when running under Python 3: selects the variant definitions below
+# and elsewhere in this module.
+PY3 = sys.version > '3'
+
+# makebytes(data): return data as a byte string. Text (str on Python 3,
+# unicode on Python 2) is encoded as UTF-8, byte strings are returned as is.
+if PY3:
+    def makebytes(data):
+        return data if isinstance(data, bytes) else data.encode("UTF-8")
+else:
+    def makebytes(data):
+        return data.encode("UTF-8") if isinstance(data, unicode) else data
+
+
+############################################
+# CmdTalk implements the
+# communication protocol with the master process. It calls an external
+# method to use the args and produce return data.
+class CmdTalk:
+
+    # outfile, infile: streams used for the dialog with the master process.
+    # exitfunc: if set, called with the exit value before we call
+    #   sys.exit() (on end of input, or from log() with doexit set).
+    def __init__(self, outfile=sys.stdout, infile=sys.stdin, exitfunc=None):
+        try:
+            self.myname = os.path.basename(sys.argv[0])
+        except:
+            self.myname = "???"
+
+        self.outfile = outfile
+        self.infile = infile
+        self.exitfunc = exitfunc
+        self.fields = {}
+        
+        # The protocol data is binary: set binary mode on both streams on
+        # Windows.
+        if sys.platform == "win32":
+            import msvcrt
+            msvcrt.setmode(self.outfile.fileno(), os.O_BINARY)
+            msvcrt.setmode(self.infile.fileno(), os.O_BINARY)
+        # debugfile is always None here, so messages go to stderr. Set it to
+        # a file name in the source to append the messages to a file instead.
+        self.debugfile = None
+        if self.debugfile:
+            self.errfout = open(self.debugfile, "a")
+        else:
+            self.errfout = sys.stderr
+        
+    # Print a message, prefixed with the program name, to the error output.
+    # If doexit is set, call exitfunc (if any) then exit with exitvalue.
+    def log(self, s, doexit = 0, exitvalue = 1):
+        print("CMDTALK: %s: %s" % (self.myname, s), file=self.errfout)
+        if doexit:
+            if self.exitfunc:
+                self.exitfunc(exitvalue)
+            sys.exit(exitvalue)
+
+    # Write data to outfile: in one call, except on Windows where it is
+    # written in chunks of at most 4 KB.
+    def breakwrite(self, outfile, data):
+        if sys.platform != "win32":
+            outfile.write(data)
+        else:
+            # On windows, writing big chunks can fail with a "not enough space"
+            # error. Seems a combined windows/python bug, depending on versions.
+            # See https://bugs.python.org/issue11395
+            # In any case, just break it up
+            total = len(data)
+            bs = 4*1024
+            offset = 0
+            while total > 0:
+                if total < bs:
+                    tow = total
+                else:
+                    tow = bs
+                #self.log("Total %d Writing %d to stdout: %s" % (total,tow,data[offset:offset+tow]))
+                outfile.write(data[offset:offset+tow])
+                offset += tow
+                total -= tow
+                
+    # Read single parameter from process input: line with param name and size
+    # followed by data. The param name is returned as str/unicode. The data
+    # is returned as bytes on Python 2. On Python 3 non-empty data is decoded
+    # from UTF-8 and returned as str, and only the empty line which ends a
+    # message yields an empty bytes value.
+    # Does not return on end of input (exits with status 0) or on a
+    # malformed line or short read (exits with status 1 through log()).
+    def readparam(self):
+        if PY3:
+            inf = self.infile.buffer
+        else:
+            inf = self.infile
+        s = inf.readline()
+        # Empty read: end of file, the master is gone.
+        if s == b'':
+            if self.exitfunc:
+                self.exitfunc(0)
+            sys.exit(0)
+
+        s = s.rstrip(b'\n')
+
+        # Empty line: end of message, signalled by an empty parameter name
+        if s == b'':
+            return ('', b'')
+        l = s.split()
+        if len(l) != 2:
+            self.log(b'bad line: [' + s + b']', 1, 1)
+
+        # The line is like "name: size"
+        paramname = l[0].decode('ASCII').rstrip(':')
+        paramsize = int(l[1])
+        if paramsize > 0:
+            paramdata = inf.read(paramsize)
+            if len(paramdata) != paramsize:
+                self.log("Bad read: wanted %d, got %d" %
+                      (paramsize, len(paramdata)), 1, 1)
+        else:
+            paramdata = b''
+        if PY3:
+            paramdata = paramdata.decode('utf-8')
+    
+        #self.log("paramname [%s] paramsize %d value [%s]" %
+        #          (paramname, paramsize, paramdata))
+        return (paramname, paramdata)
+
+    # senditem(nm, data): write one element to the output: a "name: size"
+    # line followed by the data as bytes. The size is the byte count after
+    # conversion by makebytes(). On Python 3 the writes go to the binary
+    # buffer underlying outfile.
+    if PY3:
+        def senditem(self, nm, data):
+            data = makebytes(data)
+            l = len(data)
+            self.outfile.buffer.write(makebytes("%s: %d\n" % (nm, l)))
+            self.breakwrite(self.outfile.buffer, data)
+    else:
+        def senditem(self, nm, data):
+            data = makebytes(data)
+            l = len(data)
+            self.outfile.write(makebytes("%s: %d\n" % (nm, l)))
+            self.breakwrite(self.outfile, data)
+        
+    # Send answer: one element for each entry in the outfields dict,
+    # followed by the empty line which ends the message, then flush.
+    def answer(self, outfields):
+        for nm,value in outfields.items():
+            #self.log("Senditem: [%s] -> [%s]" % (nm, value))
+            self.senditem(nm, value)
+            
+        # End of message
+        print(file=self.outfile)
+        self.outfile.flush()
+        #self.log("done writing data")
+
+    # Call processor with input params, send result. If the processor
+    # raises, the answer holds the 'cmdtalkstatus' and 'cmdtalkerrstr'
+    # fields instead of the processor output.
+    def processmessage(self, processor, params):
+        # In normal usage we try to recover from processor errors, but
+        # we sometimes want to see the real stack trace when testing
+        safeexec = True
+        if safeexec:
+            try:
+                outfields = processor.process(params)
+            except Exception as err:
+                self.log("processmessage: processor raised: [%s]" % err)
+                traceback.print_exc()
+                outfields = {}
+                outfields["cmdtalkstatus"] = "1"
+                outfields["cmdtalkerrstr"] = str(err)
+        else:
+            outfields = processor.process(params)
+
+        self.answer(outfields)
+
+    # Loop on messages from our master. This only ends through the
+    # sys.exit() calls in readparam() or log().
+    def mainloop(self, processor):
+        while 1:
+            #self.log("waiting for command")
+
+            params = dict()
+
+            # Read at most 10 parameters (normally 1 or 2), stop at empty line
+            # End of message is signalled by empty paramname
+            # NOTE(review): if a message had more than 10 parameters, the
+            # remaining ones would be read as the start of the next message.
+            for i in range(10):
+                paramname, paramdata = self.readparam()
+                if paramname == "":
+                    break
+                params[paramname] = paramdata
+
+            # Got message, act on it
+            self.processmessage(processor, params)
+
+
+# Common main routine for testing. With no command line arguments, run the
+# normal protocol engine. Else take the arguments as name/value pairs, run a
+# single processor call with them, and print each result on stdout as a
+# "name->" line followed by the value.
+#  proto: the CmdTalk object.
+#  processor: object with a process(params) method returning a dict.
+def main(proto, processor):
+    if len(sys.argv) == 1:
+        proto.mainloop(processor)
+        # mainloop does not return. Just in case
+        sys.exit(1)
+
+    # Not running the main loop: run one processor call for debugging
+    def usage():
+        print("Usage: cmdtalk.py pname pvalue [pname pvalue...]",
+              file=sys.stderr)
+        sys.exit(1)
+    def debprint(out, s):
+        proto.breakwrite(out, makebytes(s+'\n'))
+
+    args = sys.argv[1:]
+    if len(args) == 0 or len(args) % 2 != 0:
+        usage()
+    # Pair up the names (even positions) with the values (odd positions).
+    # This used to index through range(len(args)/2), which fails on
+    # Python 3, where '/' yields a float.
+    params = dict(zip(args[0::2], args[1::2]))
+    res = processor.process(params)
+
+    # Byte strings are written: use the binary layer on Python 3
+    ioout = sys.stdout.buffer if PY3 else sys.stdout
+
+    for nm,value in res.items():
+        bdata = makebytes(value)
+        debprint(ioout, "%s->" % nm)
+        proto.breakwrite(ioout, bdata)
+        ioout.write(b'\n')
--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@ -0,0 +1,52 @@
+#!/usr/bin/python3
+#################################
+# Copyright (C) 2020 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+
+#
+# Interface to the konlpy Korean text analyser: we receive text from
+# our parent process and have it segmented by the analyser, then
+# return the results. The analyser startup is very expensive (several
+# seconds), which is why we can't just execute it from the main
+# process.
+#
+
+import sys
+import cmdtalk
+
+from konlpy.tag import Okt
+
+# Request handler for the cmdtalk main loop: segments the 'data' parameter
+# with the konlpy Okt analyser.
+class Processor(object):
+    def __init__(self, proto):
+        self.proto = proto
+        self.okt = Okt()
+
+    # Return a dict with a 'data' entry holding the nouns, verbs, adjectives
+    # and adverbs found in the input, each followed by a '^' separator, or
+    # with an 'error' entry if the request has no 'data' parameter.
+    def process(self, params):
+        if 'data' not in params:
+            return {'error':'No data field in parameters'}
+        kept = ('Noun', 'Verb', 'Adjective', 'Adverb')
+        tagged = self.okt.pos(params['data'])
+        #proto.log("%s" % tagged)
+        pieces = [e[0] + '^' for e in tagged if e[1] in kept]
+        return {'data': "".join(pieces)}
+
+# Script body: create the protocol engine (using the default stdin and
+# stdout streams) and the processor, which instantiates the Okt analyser,
+# then serve requests from the parent process.
+proto = cmdtalk.CmdTalk()
+processor = Processor(proto)
+proto.mainloop(processor)
--- a/src/utils/cmdtalk.cpp
+++ b/src/utils/cmdtalk.cpp
@ -0,0 +1,243 @@
+/* Copyright (C) 2016 J.F.Dockes 
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published by
+ *   the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "cmdtalk.h"
+
+#include <stdio.h>
+
+#include <iostream>
+#include <sstream>
+#include <mutex>
+
+#include "smallut.h"
+#include "execmd.h"
+#ifdef MDU_INCLUDE_LOG
+#include MDU_INCLUDE_LOG
+#else
+#include "log.h"
+#endif
+
+using namespace std;
+
+// Thrown by Canceler::newData() when the timeout period is exceeded.
+class TimeoutExcept {};
+
+// ExecCmd advise object used to enforce a timeout: reset() records a start
+// time, and any later newData() call throws TimeoutExcept once more than
+// the configured number of seconds have elapsed since then.
+class Canceler : public ExecCmdAdvise {
+public:
+    Canceler(int tmsecs)
+        : m_timeosecs(tmsecs) {}
+
+    // Does nothing as long as the start time is 0.
+    virtual void newData(int cnt) {
+        if (!m_starttime) {
+            return;
+        }
+        if ((time(0) - m_starttime) > m_timeosecs) {
+            throw TimeoutExcept();
+        }
+    }
+
+    // Start a new timeout period.
+    void reset() {
+        m_starttime = time(0);
+    }
+
+    // Timeout value, seconds
+    int m_timeosecs;
+    // Start of the current timeout period. 0 until reset() is called.
+    time_t m_starttime{0};
+};
+
+// Private data and methods for CmdTalk.
+class CmdTalk::Internal {
+public:
+    Internal(int timeosecs)
+        : m_cancel(timeosecs) {}
+
+    ~Internal() {
+        delete cmd;
+    }
+
+    // Read one element of an answer message from the command.
+    bool readDataElement(string& name, string &data);
+
+    // Perform a full exchange: send arg0 (if its name is not empty) and
+    // args, then read the answer elements into rep.
+    bool talk(const pair<string, string>& arg0,
+              const unordered_map<string, string>& args,
+              unordered_map<string, string>& rep);
+
+    // Command object. Null until CmdTalk::startCmd() is called.
+    ExecCmd *cmd{nullptr};
+    // Timeout watcher, set as advise object on cmd by CmdTalk::startCmd().
+    Canceler m_cancel;
+    // Held during each talk() call.
+    std::mutex mmutex;
+};
+
+// Allocate the private part. No command is started here: see startCmd().
+CmdTalk::CmdTalk(int timeosecs)
+    : m(new Internal(timeosecs))
+{
+}
+
+// Delete the private part, which deletes the command object if any.
+CmdTalk::~CmdTalk()
+{
+    delete m;
+}
+
+// Create a new command object (deleting a previous one if any) and start
+// the command.
+// @param cmdname command name or path.
+// @param args command arguments.
+// @param env values passed to the command object putenv() method.
+// @param path if not empty, the directories are joined with ':' and the
+//   result is used to look up cmdname with ExecCmd::which().
+// @return false if startExec() returned a negative value.
+bool CmdTalk::startCmd(const string& cmdname,
+                       const vector<string>& args,
+                       const vector<string>& env,
+                       const vector<string>& path)
+{
+    LOGDEB("CmdTalk::startCmd\n" );
+
+    delete m->cmd;
+    m->cmd = new ExecCmd;
+    m->cmd->setAdvise(&m->m_cancel);
+
+    for (const auto& envvar : env) {
+        m->cmd->putenv(envvar);
+    }
+
+    string acmdname(cmdname);
+    if (!path.empty()) {
+        string colonpath;
+        for (auto dirit = path.begin(); dirit != path.end(); ++dirit) {
+            if (dirit != path.begin()) {
+                colonpath += ":";
+            }
+            colonpath += *dirit;
+        }
+        LOGDEB("CmdTalk::startCmd: PATH: [" << colonpath << "]\n");
+        // The result is not tested: acmdname was preset with cmdname.
+        ExecCmd::which(cmdname, acmdname, colonpath.c_str());
+    }
+
+    return m->cmd->startExec(acmdname, args, 1, 1) >= 0;
+}
+
+// Messages are made of data elements. Each element is like:
+// name: len\ndata
+// An empty line signals the end of the message, so the whole thing
+// would look like:
+// Name1: Len1\nData1Name2: Len2\nData2\n
+//
+// Read one data element from the command output.
+// @param name receives the element name as found on the line (the caller
+//   strips the ':'). Left empty if the end of message line was read.
+// @param data receives the element data. Left empty if there is none.
+// @return false on read error, timeout, or malformed input.
+bool CmdTalk::Internal::readDataElement(string& name, string &data)
+{
+    string ibuf;
+
+    // An empty name is the end of message marker for the caller: do not
+    // depend on what the strings contained on entry.
+    name.erase();
+    data.erase();
+
+    m_cancel.reset();
+    // The timeout exception is thrown by the advise object set on the
+    // command. It is caught for both read calls below, so that it cannot
+    // propagate to our callers.
+    try {
+        // Read name and length
+        if (cmd->getline(ibuf) <= 0) {
+            LOGERR("CmdTalk: getline error\n" );
+            return false;
+        }
+
+        LOGDEB1("CmdTalk:rde: line ["  << (ibuf) << "]\n" );
+
+        // Empty line (end of message) ?
+        if (!ibuf.compare("\n")) {
+            LOGDEB("CmdTalk: Got empty line\n" );
+            return true;
+        }
+
+        // We're expecting something like Name: len\n
+        vector<string> tokens;
+        stringToTokens(ibuf, tokens);
+        if (tokens.size() != 2) {
+            LOGERR("CmdTalk: bad line in filter output: ["  << (ibuf) << "]\n" );
+            return false;
+        }
+        name = tokens[0];
+        const string& slen = tokens[1];
+        int len;
+        if (sscanf(slen.c_str(), "%d", &len) != 1) {
+            LOGERR("CmdTalk: bad line in filter output: ["  << (ibuf) << "]\n" );
+            return false;
+        }
+
+        // Read element data. Nothing to read for a zero (or negative) length
+        if (len > 0 && cmd->receive(data, len) != len) {
+            LOGERR("CmdTalk: expected " << len << " bytes of data, got " <<
+                   data.length() << "\n");
+            return false;
+        }
+        LOGDEB1("CmdTalk:rde: got: name [" << name << "] len " << len <<"value ["<<
+                (data.size() > 100 ? (data.substr(0, 100) + " ...") : data)<< endl);
+    } catch (const TimeoutExcept&) {
+        LOGINF("CmdTalk:readDataElement: fatal timeout (" <<
+               m_cancel.m_timeosecs << " S)\n");
+        return false;
+    }
+    return true;
+}
+
+// Perform one request/answer exchange with the command.
+// @param arg0 element sent first, if its name is not empty.
+// @param args other elements to send.
+// @param rep receives the answer elements, indexed by name (with the ':'
+//   trimmed). Not cleared first: existing entries with other names are kept.
+// @return false if there is no running command, on send or read error
+//   (the child process is then zapped), or if the answer contains a
+//   "cmdtalkstatus" element.
+bool CmdTalk::Internal::talk(const pair<string, string>& arg0,
+                             const unordered_map<string, string>& args,
+                             unordered_map<string, string>& rep)
+{
+    std::unique_lock<std::mutex> lock(mmutex);
+    // cmd stays null if startCmd() was never called.
+    if (nullptr == cmd || cmd->getChildPid() <= 0) {
+        LOGERR("CmdTalk::talk: no process\n");
+        return false;
+    }
+
+    // Build the whole message: elements, then the empty line
+    ostringstream obuf;
+    if (!arg0.first.empty()) {
+        obuf << arg0.first << ": " << arg0.second.size() << "\n" << arg0.second;
+    }
+    for (const auto& it : args) {
+        obuf << it.first << ": " << it.second.size() << "\n" << it.second;
+    }
+    obuf << "\n";
+
+    if (cmd->send(obuf.str()) < 0) {
+        cmd->zapChild();
+        LOGERR("CmdTalk: send error\n" );
+        return false;
+    }
+
+    // Read answer (multiple elements)
+    LOGDEB1("CmdTalk: reading answer\n" );
+    for (;;) {
+        string name, data;
+        if (!readDataElement(name, data)) {
+            cmd->zapChild();
+            return false;
+        }
+        // Empty name: end of message
+        if (name.empty()) {
+            break;
+        }
+        trimstring(name, ":");
+        LOGDEB1("CmdTalk: got [" << name << "] -> [" << data << "]\n");
+        rep[name] = data;
+    }
+
+    // The presence of a status element means that the request failed.
+    return rep.find("cmdtalkstatus") == rep.end();
+}
+
+// True if we have a command object and it reports a positive child pid.
+bool CmdTalk::running()
+{
+    if (!m || !m->cmd) {
+        return false;
+    }
+    return m->cmd->getChildPid() > 0;
+}
+
+// Single exchange, with no special first element.
+bool CmdTalk::talk(const unordered_map<string, string>& args,
+                   unordered_map<string, string>& rep)
+{
+    const pair<string, string> noarg0;
+    return m->talk(noarg0, args, rep);
+}
+
+// Single exchange, with the procedure name sent as a first "cmdtalk:proc"
+// element.
+bool CmdTalk::callproc(
+    const string& proc,
+    const unordered_map<std::string, std::string>& args,
+    unordered_map<std::string, std::string>& rep)
+{
+    const pair<string, string> procarg("cmdtalk:proc", proc);
+    return m->talk(procarg, args, rep);
+}
+
+    
--- a/src/utils/cmdtalk.h
+++ b/src/utils/cmdtalk.h
@ -0,0 +1,105 @@
+/* Copyright (C) 2016 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published by
+ *   the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _CMDTALK_H_INCLUDED_
+#define _CMDTALK_H_INCLUDED_
+
+/** 
+ * Execute a command and exchange messages with it.
+ *
+ * A simple stream protocol is used for the dialog. HTTP or some kind
+ * of full-blown RPC could have been used, but there was also good
+ * reason to keep it simple (yet powerful), given the limited context
+ * of dialog through a pipe.
+ *
+ * The data is exchanged in TLV fashion, in a way that should be
+ * usable in most script languages. The basic unit of data has one line 
+ * with a data type and a count (both ASCII), followed by the data. A
+ * 'message' is made of one or several units or tags and ends with one empty
+ * line. 
+ * 
+ * Example:(the message begins before 'Filename' and has 'Filename' and 
+ * 'Ipath' tags):
+ * 
+Filename: 24
+/my/home/mail/somefolderIpath: 2
+22
+
+<Message ends here: because of the empty line after '22'
+
+ * 
+ * Example answer, with 'Mimetype' and 'Data' tags
+ * 
+Mimetype: 10
+text/plainData: 10
+0123456789
+
+<Message ends here because of empty line
+
+ *        
+ * This format is both extensible and reasonably easy to parse. 
+ * While it's more fitted for python or perl on the script side, it
+ * should even be sort of usable from the shell (e.g.: use dd to read
+ * the counted data). Most alternatives would need data encoding in
+ * some cases.
+ *
+ * Higher level dialog:
+ * The C++ program is the master and sends request messages to the script. 
+ * Both sides of the communication should be prepared to receive and discard 
+ * unknown tags.
+ */
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+/** Start a command and perform request/answer exchanges with it. */
+class CmdTalk {
+ public:
+    // @param timeosecs timeout, in seconds, used when reading the answers.
+    CmdTalk(int timeosecs);
+    virtual ~CmdTalk();
+
+    // Start the command.
+    // @param cmdname command name or path.
+    // @param args command arguments.
+    // @param env each entry should be of the form name=value. They
+    //   augment the subprocess environment.
+    // @param path replaces the PATH variable when looking for the command.
+    // @return false if the command could not be started.
+    virtual bool startCmd(const std::string& cmdname,
+			  const std::vector<std::string>& args =
+			  std::vector<std::string>(),
+			  const std::vector<std::string>& env =
+			  std::vector<std::string>(),
+			  const std::vector<std::string>& path =
+			  std::vector<std::string>()
+	);
+    // @return true if a command was started and has a child process id.
+    virtual bool running();
+    
+    // Single exchange: send and receive data.
+    // @param args the elements to send, indexed by name.
+    // @param rep receives the answer elements, indexed by name.
+    // @return false on communication error, or if the answer contains a
+    //   "cmdtalkstatus" element.
+    virtual bool talk(const std::unordered_map<std::string, std::string>& args,
+		      std::unordered_map<std::string, std::string>& rep);
+
+    // Specialized version with special argument used by dispatcher to call
+    // designated method. The name is sent as a "cmdtalk:proc" element.
+    virtual bool callproc(
+	const std::string& proc,
+	const std::unordered_map<std::string, std::string>& args,
+	std::unordered_map<std::string, std::string>& rep);
+
+    // Not copyable: the object owns its private part.
+    CmdTalk(const CmdTalk&) = delete;
+    CmdTalk &operator=(const CmdTalk &) = delete;
+private:
+    class Internal;
+    Internal *m{0};
+};
+
+#endif /* _CMDTALK_H_INCLUDED_ */