From 384e3a1087eaf8c65d8149d8a307a2e624ec5551 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 22 Mar 2020 10:09:50 +0100 Subject: [PATCH] korean textsplit with extern help from konlpy, first step --- src/Makefile.am | 7 +- src/common/textsplit.cpp | 26 ++-- src/common/textsplit.h | 4 + src/common/textsplitko.cpp | 140 +++++++++++++++++++++ src/filters/cmdtalk.py | 234 +++++++++++++++++++++++++++++++++++ src/filters/kosplitter.py | 52 ++++++++ src/utils/cmdtalk.cpp | 243 +++++++++++++++++++++++++++++++++++++ src/utils/cmdtalk.h | 105 ++++++++++++++++ 8 files changed, 802 insertions(+), 9 deletions(-) create mode 100644 src/common/textsplitko.cpp create mode 100644 src/filters/cmdtalk.py create mode 100755 src/filters/kosplitter.py create mode 100644 src/utils/cmdtalk.cpp create mode 100644 src/utils/cmdtalk.h diff --git a/src/Makefile.am b/src/Makefile.am index 73f2f415..d30735ce 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -82,6 +82,7 @@ common/rclinit.h \ common/syngroups.cpp \ common/syngroups.h \ common/textsplit.cpp \ +common/textsplitko.cpp \ common/textsplit.h \ common/unacpp.cpp \ common/unacpp.h \ @@ -210,6 +211,8 @@ utils/circache.cpp \ utils/circache.h \ utils/closefrom.cpp \ utils/closefrom.h \ +utils/cmdtalk.cpp \ +utils/cmdtalk.h \ utils/conftree.cpp \ utils/conftree.h \ utils/copyfile.cpp \ @@ -645,8 +648,10 @@ filterdir = $(pkgdatadir)/filters dist_filter_DATA = \ desktop/hotrecoll.py \ filters/abiword.xsl \ +filters/cmdtalk.py \ filters/fb2.xsl \ filters/gnumeric.xsl \ +filters/kosplitter.py \ filters/msodump.zip \ filters/okular-note.xsl \ filters/opendoc-body.xsl \ @@ -724,7 +729,7 @@ python/recoll/recoll/rclconfig.py install-data-hook: (cd $(DESTDIR)/$(filterdir); \ chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \ - chmod a+x recoll-we-move-files.py ../examples/rclmon.sh; \ + chmod a+x recoll-we-move-files.py ../examples/rclmon.sh kosplitter.py; \ chmod 0644 msodump.zip recollepub.zip rclexecm.py 
rcllatinstops.zip \ rclconfig.py conftree.py rclmidi.py rclexec1.py rcluncomp.py rclxslt.py) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 3ea957ad..bb61696e 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -246,6 +246,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr) #define UNICODE_IS_KATAKANA(p) false #endif +#define HANGUL_AS_WORDS #ifdef HANGUL_AS_WORDS #define UNICODE_IS_HANGUL(p) ( \ ((p) >= 0x1100 && (p) <= 0x11FF) || \ @@ -323,7 +324,8 @@ void TextSplit::staticConfInit(RclConfig *config) charclasses[int('\\')] = SPACE; } } -} + koStaticConfInit(config); +} // Final term checkpoint: do some checking (the kind which is simpler // to do here than in the main loop), then send term to our client. @@ -632,20 +634,28 @@ bool TextSplit::text_to_words(const string &in) csc = CSC_OTHER; } - if (o_processCJK && csc == CSC_CJK) { - // CJK character hit. + if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) { + // CJK character hit. Hangul processing may be special or + // not depending on how we were built. + // Do like at EOF with the current non-cjk data. if (m_wordLen || m_span.length()) { if (!doemit(true, it.getBpos())) return false; } - // Hand off situation to the cjk routine. - if (!cjk_to_words(&it, &c)) { - LOGERR("Textsplit: scan error in cjk handler\n"); - return false; + // Hand off situation to the appropriate routine. + if (csc == CSC_HANGUL) { + if (!ko_to_words(&it, &c)) { + LOGERR("Textsplit: scan error in korean handler\n"); + return false; + } + } else { + if (!cjk_to_words(&it, &c)) { + LOGERR("Textsplit: scan error in cjk handler\n"); + return false; + } } - // Check for eof, else c contains the first non-cjk // character after the cjk sequence, just go on. 
if (it.eof()) diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 7e4f8222..8f8f19d3 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -54,6 +54,7 @@ public: /** Call at program initialization to read non default values from the configuration */ static void staticConfInit(RclConfig *config); + static void koStaticConfInit(RclConfig *config); /** Split text, emit words and positions. */ virtual bool text_to_words(const std::string &in); @@ -199,6 +200,9 @@ private: // This processes cjk text: bool cjk_to_words(Utf8Iter *it, unsigned int *cp); + // Experimental Korean splitter. This uses an external Python tokenizer + bool ko_to_words(Utf8Iter *it, unsigned int *cp); + bool emitterm(bool isspan, std::string &term, int pos, size_t bs,size_t be); bool doemit(bool spanerase, size_t bp); void discardspan(); diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp new file mode 100644 index 00000000..f1929a6f --- /dev/null +++ b/src/common/textsplitko.cpp @@ -0,0 +1,140 @@ +/* Copyright (C) 2020 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "autoconfig.h" + +#include +#include +#include +#include +#include + +#include "textsplit.h" +#include "log.h" +//#define UTF8ITER_CHECK +#include "utf8iter.h" +#include "smallut.h" +#include "rclconfig.h" +#include "cmdtalk.h" + +using namespace std; + +static CmdTalk *o_talker; +static bool o_starterror{false}; +static string o_cmdpath; +std::mutex o_mutex; + +void TextSplit::koStaticConfInit(RclConfig *config) +{ + o_cmdpath = config->findFilter("kosplitter.py"); +} + +static bool initCmd() +{ + if (o_starterror) { + // No use retrying + return false; + } + if (o_talker) { + return true; + } + if (o_cmdpath.empty()) { + return false; + } + if (nullptr == (o_talker = new CmdTalk(300))) { + o_starterror = true; + return false; + } + if (!o_talker->startCmd(o_cmdpath)) { + delete o_talker; + o_talker = nullptr; + o_starterror = true; + return false; + } + return true; +} + +bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) +{ + std::unique_lock mylock(o_mutex); + if (nullptr == o_talker) { + if (!initCmd()) { + return false; + } + } + LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n"); + Utf8Iter &it = *itp; + unsigned int c = 0; + unordered_map args; + args.insert(pair{"data", string()}); + string& inputdata{args.begin()->second}; + + // Gather all Korean characters and send the text to the analyser + for (; !it.eof(); it++) { + c = *it; + if (!isHANGUL(c) && !(isascii(c) && (isspace(c) || ispunct(c)))) { + // Done with Korean stretch, process and go back to main routine + //std::cerr << "Broke on char " << int(c) << endl; + break; + } else { + it.appendchartostring(inputdata); + } + } + LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() << + " bytes " << inputdata << endl); + unordered_map result; + if (!o_talker->talk(args, result)) { + LOGERR("Python splitter for Korean failed\n"); + return false; + } + auto resit = result.find("data"); + if (resit == result.end()) { + LOGERR("No data in Python splitter for
Korean\n"); + return false; + } + string& outdata = resit->second; + char sepchar = '^'; + //std::cerr << "GOT FROM SPLITTER: " << outdata << endl; + string::size_type wordstart = 0; + string::size_type wordend = outdata.find(sepchar); + for (;;) { + //cerr << "start " << wordstart << " end " << wordend << endl; + if (wordend != wordstart && wordstart < outdata.size()) { /* skip empty pieces, including the one after the trailing separator the splitter appends (and an empty reply): no empty term, no wasted position */ + string::size_type len = (wordend == string::npos) ? + wordend : wordend-wordstart; + string word = outdata.substr(wordstart, len); + //cerr << " WORD[" << word << "]\n"; + if (!takeword(word, m_wordpos++, 0, 0)) { + return false; + } + } + if (wordend == string::npos) + break; + wordstart = wordend + 1; + wordend = outdata.find(sepchar, wordstart); + } + + + // Reset state, saving term position, and return the found non-cjk + // Unicode character value. The current input byte offset is kept + // in the utf8Iter + int pos = m_wordpos; + clearsplitstate(); + m_spanpos = m_wordpos = pos; + *cp = c; + return true; +} diff --git a/src/filters/cmdtalk.py b/src/filters/cmdtalk.py new file mode 100644 index 00000000..8bb49e28 --- /dev/null +++ b/src/filters/cmdtalk.py @@ -0,0 +1,234 @@ +################################# +# Copyright (C) 2016 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+######################################################## +# Command communication module and utilities. See commands in cmdtalk.h +# +# All data is binary. This is important for Python3 +# All parameter names are converted to and processed as str/unicode + +from __future__ import print_function + +import sys +import os +import tempfile +import shutil +import getopt +import traceback + +PY3 = sys.version > '3' + +if PY3: + def makebytes(data): + if isinstance(data, bytes): + return data + else: + return data.encode("UTF-8") +else: + def makebytes(data): + if isinstance(data, unicode): + return data.encode("UTF-8") + else: + return data + + +############################################ +# CmdTalk implements the +# communication protocol with the master process. It calls an external +# method to use the args and produce return data. +class CmdTalk: + + def __init__(self, outfile=sys.stdout, infile=sys.stdin, exitfunc=None): + try: + self.myname = os.path.basename(sys.argv[0]) + except: + self.myname = "???" + + self.outfile = outfile + self.infile = infile + self.exitfunc = exitfunc + self.fields = {} + + if sys.platform == "win32": + import msvcrt + msvcrt.setmode(self.outfile.fileno(), os.O_BINARY) + msvcrt.setmode(self.infile.fileno(), os.O_BINARY) + self.debugfile = None + if self.debugfile: + self.errfout = open(self.debugfile, "a") + else: + self.errfout = sys.stderr + + def log(self, s, doexit = 0, exitvalue = 1): + print("CMDTALK: %s: %s" % (self.myname, s), file=self.errfout) + if doexit: + if self.exitfunc: + self.exitfunc(exitvalue) + sys.exit(exitvalue) + + def breakwrite(self, outfile, data): + if sys.platform != "win32": + outfile.write(data) + else: + # On windows, writing big chunks can fail with a "not enough space" + # error. Seems a combined windows/python bug, depending on versions. 
+ # See https://bugs.python.org/issue11395 + # In any case, just break it up + total = len(data) + bs = 4*1024 + offset = 0 + while total > 0: + if total < bs: + tow = total + else: + tow = bs + #self.log("Total %d Writing %d to stdout: %s" % (total,tow,data[offset:offset+tow])) + outfile.write(data[offset:offset+tow]) + offset += tow + total -= tow + + # Read single parameter from process input: line with param name and size + # followed by data. The param name is returned as str/unicode, the data + # as bytes + def readparam(self): + if PY3: + inf = self.infile.buffer + else: + inf = self.infile + s = inf.readline() + if s == b'': + if self.exitfunc: + self.exitfunc(0) + sys.exit(0) + + s = s.rstrip(b'\n') + + if s == b'': + return ('', b'') + l = s.split() + if len(l) != 2: + self.log(b'bad line: [' + s + b']', 1, 1) + + paramname = l[0].decode('ASCII').rstrip(':') + paramsize = int(l[1]) + if paramsize > 0: + paramdata = inf.read(paramsize) + if len(paramdata) != paramsize: + self.log("Bad read: wanted %d, got %d" % + (paramsize, len(paramdata)), 1, 1) + else: + paramdata = b'' + if PY3: + paramdata = paramdata.decode('utf-8') + + #self.log("paramname [%s] paramsize %d value [%s]" % + # (paramname, paramsize, paramdata)) + return (paramname, paramdata) + + if PY3: + def senditem(self, nm, data): + data = makebytes(data) + l = len(data) + self.outfile.buffer.write(makebytes("%s: %d\n" % (nm, l))) + self.breakwrite(self.outfile.buffer, data) + else: + def senditem(self, nm, data): + data = makebytes(data) + l = len(data) + self.outfile.write(makebytes("%s: %d\n" % (nm, l))) + self.breakwrite(self.outfile, data) + + # Send answer: document, ipath, possible eof. 
+ def answer(self, outfields): + for nm,value in outfields.items(): + #self.log("Senditem: [%s] -> [%s]" % (nm, value)) + self.senditem(nm, value) + + # End of message + print(file=self.outfile) + self.outfile.flush() + #self.log("done writing data") + + # Call processor with input params, send result + def processmessage(self, processor, params): + # In normal usage we try to recover from processor errors, but + # we sometimes want to see the real stack trace when testing + safeexec = True + if safeexec: + try: + outfields = processor.process(params) + except Exception as err: + self.log("processmessage: processor raised: [%s]" % err) + traceback.print_exc() + outfields = {} + outfields["cmdtalkstatus"] = "1" + outfields["cmdtalkerrstr"] = str(err) + else: + outfields = processor.process(params) + + self.answer(outfields) + + # Loop on messages from our master + def mainloop(self, processor): + while 1: + #self.log("waiting for command") + + params = dict() + + # Read at most 10 parameters (normally 1 or 2), stop at empty line + # End of message is signalled by empty paramname + for i in range(10): + paramname, paramdata = self.readparam() + if paramname == "": + break + params[paramname] = paramdata + + # Got message, act on it + self.processmessage(processor, params) + + +# Common main routine for testing: either run the normal protocol +# engine or a local loop. +def main(proto, processor): + if len(sys.argv) == 1: + proto.mainloop(processor) + # mainloop does not return. 
Just in case + sys.exit(1) + + # Not running the main loop: run one processor call for debugging + def usage(): + print("Usage: cmdtalk.py pname pvalue [pname pvalue...]", + file=sys.stderr) + sys.exit(1) + def debprint(out, s): + proto.breakwrite(out, makebytes(s+'\n')) + + args = sys.argv[1:] + if len(args) == 0 or len(args) % 2 != 0: + usage() + params = dict() + for i in range(len(args)/2): + params[args[2*i]] = args[2*i+1] + res = processor.process(params) + + ioout = sys.stdout.buffer if PY3 else sys.stdout + + for nm,value in res.items(): + #self.log("Senditem: [%s] -> [%s]" % (nm, value)) + bdata = makebytes(value) + debprint(ioout, "%s->" % nm) + proto.breakwrite(ioout, bdata) + ioout.write(b'\n') diff --git a/src/filters/kosplitter.py b/src/filters/kosplitter.py new file mode 100755 index 00000000..d7d394c6 --- /dev/null +++ b/src/filters/kosplitter.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 +################################# +# Copyright (C) 2020 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +######################################################## + +# +# Interface to the konlpy Korean text analyser: we receive text from +# our parent process and have it segmented by the analyser, then +# return the results. 
The analyser startup is very expensive (several +# seconds), which is why we can't just execute it from the main +# process. +# + +import sys +import cmdtalk + +from konlpy.tag import Okt + +class Processor(object): + def __init__(self, proto): + self.proto = proto + self.okt = Okt() + + def process(self, params): + if 'data' not in params: + return {'error':'No data field in parameters'} + pos = self.okt.pos(params['data']) + #proto.log("%s" % pos) + output = "" + for e in pos: + if e[1] == 'Noun' or e[1] == 'Verb' or e[1] == 'Adjective' or \ + e[1] == 'Adverb': + output += e[0] + '^' + return {'data': output} + +proto = cmdtalk.CmdTalk() +processor = Processor(proto) +proto.mainloop(processor) diff --git a/src/utils/cmdtalk.cpp b/src/utils/cmdtalk.cpp new file mode 100644 index 00000000..3c5b461e --- /dev/null +++ b/src/utils/cmdtalk.cpp @@ -0,0 +1,243 @@ +/* Copyright (C) 2016 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#include "cmdtalk.h" + +#include + +#include +#include +#include + +#include "smallut.h" +#include "execmd.h" +#ifdef MDU_INCLUDE_LOG +#include MDU_INCLUDE_LOG +#else +#include "log.h" +#endif + +using namespace std; + +class TimeoutExcept {}; + +class Canceler : public ExecCmdAdvise { +public: + Canceler(int tmsecs) + : m_timeosecs(tmsecs) {} + + virtual void newData(int cnt) { + if (m_starttime && (time(0) - m_starttime) > m_timeosecs) { + throw TimeoutExcept(); + } + } + + void reset() { + m_starttime = time(0); + } + int m_timeosecs; + time_t m_starttime{0}; +}; + +class CmdTalk::Internal { +public: + Internal(int timeosecs) + : m_cancel(timeosecs) {} + + ~Internal() { + delete cmd; + } + + bool readDataElement(string& name, string &data); + + bool talk(const pair& arg0, + const unordered_map& args, + unordered_map& rep); + + ExecCmd *cmd{0}; + Canceler m_cancel; + std::mutex mmutex; +}; + +CmdTalk::CmdTalk(int timeosecs) +{ + m = new Internal(timeosecs); +} +CmdTalk::~CmdTalk() +{ + delete m; +} + +bool CmdTalk::startCmd(const string& cmdname, + const vector& args, + const vector& env, + const vector& path) +{ + LOGDEB("CmdTalk::startCmd\n" ); + + delete m->cmd; + m->cmd = new ExecCmd; + m->cmd->setAdvise(&m->m_cancel); + + for (const auto& it : env) { + m->cmd->putenv(it); + } + + string acmdname(cmdname); + if (!path.empty()) { + string colonpath; + for (const auto& it: path) { + colonpath += it + ":"; + } + if (!colonpath.empty()) { + colonpath.erase(colonpath.size()-1); + } + LOGDEB("CmdTalk::startCmd: PATH: [" << colonpath << "]\n"); + ExecCmd::which(cmdname, acmdname, colonpath.c_str()); + } + + if (m->cmd->startExec(acmdname, args, 1, 1) < 0) { + return false; + } + return true; +} + +// Messages are made of data elements. 
Each element is like: +// name: len\ndata +// An empty line signals the end of the message, so the whole thing +// would look like: +// Name1: Len1\nData1Name2: Len2\nData2\n +bool CmdTalk::Internal::readDataElement(string& name, string &data) +{ + string ibuf; + + m_cancel.reset(); + try { + // Read name and length + if (cmd->getline(ibuf) <= 0) { + LOGERR("CmdTalk: getline error\n" ); + return false; + } + } catch (TimeoutExcept) { + LOGINF("CmdTalk:readDataElement: fatal timeout (" << + m_cancel.m_timeosecs << " S)\n"); + return false; + } + + LOGDEB1("CmdTalk:rde: line [" << (ibuf) << "]\n" ); + + // Empty line (end of message) ? + if (!ibuf.compare("\n")) { + LOGDEB("CmdTalk: Got empty line\n" ); + return true; + } + + // We're expecting something like Name: len\n + vector tokens; + stringToTokens(ibuf, tokens); + if (tokens.size() != 2) { + LOGERR("CmdTalk: bad line in filter output: [" << (ibuf) << "]\n" ); + return false; + } + vector::iterator it = tokens.begin(); + name = *it++; + string& slen = *it; + int len; + if (sscanf(slen.c_str(), "%d", &len) != 1) { + LOGERR("CmdTalk: bad line in filter output: [" << (ibuf) << "]\n" ); + return false; + } + + // Read element data + data.erase(); + if (len > 0 && cmd->receive(data, len) != len) { + LOGERR("CmdTalk: expected " << len << " bytes of data, got " << + data.length() << "\n"); + return false; + } + LOGDEB1("CmdTalk:rde: got: name [" << name << "] len " << len <<"value ["<< + (data.size() > 100 ? 
(data.substr(0, 100) + " ...") : data)<< endl); + return true; +} + +bool CmdTalk::Internal::talk(const pair& arg0, + const unordered_map& args, + unordered_map& rep) +{ + std::unique_lock lock(mmutex); + if (cmd->getChildPid() <= 0) { + LOGERR("CmdTalk::talk: no process\n"); + return false; + } + + ostringstream obuf; + if (!arg0.first.empty()) { + obuf << arg0.first << ": " << arg0.second.size() << "\n" << arg0.second; + } + for (const auto& it : args) { + obuf << it.first << ": " << it.second.size() << "\n" << it.second; + } + obuf << "\n"; + + if (cmd->send(obuf.str()) < 0) { + cmd->zapChild(); + LOGERR("CmdTalk: send error\n" ); + return false; + } + + // Read answer (multiple elements) + LOGDEB1("CmdTalk: reading answer\n" ); + for (;;) { + string name, data; + if (!readDataElement(name, data)) { + cmd->zapChild(); + return false; + } + if (name.empty()) { + break; + } + trimstring(name, ":"); + LOGDEB1("CmdTalk: got [" << name << "] -> [" << data << "]\n"); + rep[name] = data; + } + + if (rep.find("cmdtalkstatus") != rep.end()) { + return false; + } else { + return true; + } +} + +bool CmdTalk::running() +{ + return m && m->cmd && m->cmd->getChildPid() > 0; +} + +bool CmdTalk::talk(const unordered_map& args, + unordered_map& rep) +{ + return m->talk({"",""}, args, rep); +} + +bool CmdTalk::callproc( + const string& proc, + const unordered_map& args, + unordered_map& rep) +{ + return m->talk({"cmdtalk:proc", proc}, args, rep); +} + + diff --git a/src/utils/cmdtalk.h b/src/utils/cmdtalk.h new file mode 100644 index 00000000..b7a55cb1 --- /dev/null +++ b/src/utils/cmdtalk.h @@ -0,0 +1,105 @@ +/* Copyright (C) 2016 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#ifndef _CMDTALK_H_INCLUDED_ +#define _CMDTALK_H_INCLUDED_ + +/** + * Execute commands and exchange messages with it. + * + * A simple stream protocol is used for the dialog. HTTP or some kind + * of full-blown RPC could have been used, but there was also good + * reason to keep it simple (yet powerful), given the limited context + * of dialog through a pipe. + * + * The data is exchanged in TLV fashion, in a way that should be + * usable in most script languages. The basic unit of data has one line + * with a data type and a count (both ASCII), followed by the data. A + * 'message' is made of one or several units or tags and ends with one empty + * line. + * + * Example:(the message begins before 'Filename' and has 'Filename' and + * 'Ipath' tags): + * +Filename: 24 +/my/home/mail/somefolderIpath: 2 +22 + + +#include +#include + +class CmdTalk { + public: + CmdTalk(int timeosecs); + virtual ~CmdTalk(); + + // @param env each entry should be of the form name=value. They + // augment the subprocess environnement. + // @param path replaces the PATH variable when looking for the command. + virtual bool startCmd(const std::string& cmdname, + const std::vector& args = + std::vector(), + const std::vector& env = + std::vector(), + const std::vector& path = + std::vector() + ); + virtual bool running(); + + // Single exchange: send and receive data. 
+ virtual bool talk(const std::unordered_map& args, + std::unordered_map& rep); + + // Specialized version with special argument used by dispatcher to call + // designated method + virtual bool callproc( + const std::string& proc, + const std::unordered_map& args, + std::unordered_map& rep); + + CmdTalk(const CmdTalk&) = delete; + CmdTalk &operator=(const CmdTalk &) = delete; +private: + class Internal; + Internal *m{0}; +}; + +#endif /* _CMDTALK_H_INCLUDED_ */