korean textsplit with extern help from konlpy, first step
This commit is contained in:
parent
d83bb8cf69
commit
384e3a1087
@ -82,6 +82,7 @@ common/rclinit.h \
|
||||
common/syngroups.cpp \
|
||||
common/syngroups.h \
|
||||
common/textsplit.cpp \
|
||||
common/textsplitko.cpp \
|
||||
common/textsplit.h \
|
||||
common/unacpp.cpp \
|
||||
common/unacpp.h \
|
||||
@ -210,6 +211,8 @@ utils/circache.cpp \
|
||||
utils/circache.h \
|
||||
utils/closefrom.cpp \
|
||||
utils/closefrom.h \
|
||||
utils/cmdtalk.cpp \
|
||||
utils/cmdtalk.h \
|
||||
utils/conftree.cpp \
|
||||
utils/conftree.h \
|
||||
utils/copyfile.cpp \
|
||||
@ -645,8 +648,10 @@ filterdir = $(pkgdatadir)/filters
|
||||
dist_filter_DATA = \
|
||||
desktop/hotrecoll.py \
|
||||
filters/abiword.xsl \
|
||||
filters/cmdtalk.py \
|
||||
filters/fb2.xsl \
|
||||
filters/gnumeric.xsl \
|
||||
filters/kosplitter.py \
|
||||
filters/msodump.zip \
|
||||
filters/okular-note.xsl \
|
||||
filters/opendoc-body.xsl \
|
||||
@ -724,7 +729,7 @@ python/recoll/recoll/rclconfig.py
|
||||
install-data-hook:
|
||||
(cd $(DESTDIR)/$(filterdir); \
|
||||
chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
|
||||
chmod a+x recoll-we-move-files.py ../examples/rclmon.sh; \
|
||||
chmod a+x recoll-we-move-files.py ../examples/rclmon.sh kosplitter.py; \
|
||||
chmod 0644 msodump.zip recollepub.zip rclexecm.py rcllatinstops.zip \
|
||||
rclconfig.py conftree.py rclmidi.py rclexec1.py rcluncomp.py rclxslt.py)
|
||||
|
||||
|
||||
@ -246,6 +246,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
||||
#define UNICODE_IS_KATAKANA(p) false
|
||||
#endif
|
||||
|
||||
#define HANGUL_AS_WORDS
|
||||
#ifdef HANGUL_AS_WORDS
|
||||
#define UNICODE_IS_HANGUL(p) ( \
|
||||
((p) >= 0x1100 && (p) <= 0x11FF) || \
|
||||
@ -323,7 +324,8 @@ void TextSplit::staticConfInit(RclConfig *config)
|
||||
charclasses[int('\\')] = SPACE;
|
||||
}
|
||||
}
|
||||
}
|
||||
koStaticConfInit(config);
|
||||
}
|
||||
|
||||
// Final term checkpoint: do some checking (the kind which is simpler
|
||||
// to do here than in the main loop), then send term to our client.
|
||||
@ -632,20 +634,28 @@ bool TextSplit::text_to_words(const string &in)
|
||||
csc = CSC_OTHER;
|
||||
}
|
||||
|
||||
if (o_processCJK && csc == CSC_CJK) {
|
||||
// CJK character hit.
|
||||
if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) {
|
||||
// CJK character hit. Hangul processing may be special or
|
||||
// not depending on how we were built.
|
||||
|
||||
// Do like at EOF with the current non-cjk data.
|
||||
if (m_wordLen || m_span.length()) {
|
||||
if (!doemit(true, it.getBpos()))
|
||||
return false;
|
||||
}
|
||||
|
||||
// Hand off situation to the cjk routine.
|
||||
if (!cjk_to_words(&it, &c)) {
|
||||
LOGERR("Textsplit: scan error in cjk handler\n");
|
||||
return false;
|
||||
// Hand off situation to the appropriate routine.
|
||||
if (csc == CSC_HANGUL) {
|
||||
if (!ko_to_words(&it, &c)) {
|
||||
LOGERR("Textsplit: scan error in korean handler\n");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!cjk_to_words(&it, &c)) {
|
||||
LOGERR("Textsplit: scan error in cjk handler\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for eof, else c contains the first non-cjk
|
||||
// character after the cjk sequence, just go on.
|
||||
if (it.eof())
|
||||
|
||||
@ -54,6 +54,7 @@ public:
|
||||
/** Call at program initialization to read non default values from the
|
||||
configuration */
|
||||
static void staticConfInit(RclConfig *config);
|
||||
static void koStaticConfInit(RclConfig *config);
|
||||
|
||||
/** Split text, emit words and positions. */
|
||||
virtual bool text_to_words(const std::string &in);
|
||||
@ -199,6 +200,9 @@ private:
|
||||
// This processes cjk text:
|
||||
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
||||
|
||||
// Experimental Korean splitter. This uses an external Python tokenizer
|
||||
bool ko_to_words(Utf8Iter *it, unsigned int *cp);
|
||||
|
||||
bool emitterm(bool isspan, std::string &term, int pos, size_t bs,size_t be);
|
||||
bool doemit(bool spanerase, size_t bp);
|
||||
void discardspan();
|
||||
|
||||
140
src/common/textsplitko.cpp
Normal file
140
src/common/textsplitko.cpp
Normal file
@ -0,0 +1,140 @@
|
||||
/* Copyright (C) 2020 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <unordered_set>
|
||||
#include <mutex>
|
||||
|
||||
#include "textsplit.h"
|
||||
#include "log.h"
|
||||
//#define UTF8ITER_CHECK
|
||||
#include "utf8iter.h"
|
||||
#include "smallut.h"
|
||||
#include "rclconfig.h"
|
||||
#include "cmdtalk.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static CmdTalk *o_talker;
|
||||
static bool o_starterror{false};
|
||||
static string o_cmdpath;
|
||||
std::mutex o_mutex;
|
||||
|
||||
// Record the location of the Korean splitter script, as returned by the
// configuration's findFilter() for "kosplitter.py". The value is used
// later by initCmd() when the command is started.
void TextSplit::koStaticConfInit(RclConfig *config)
{
    const string splitterpath = config->findFilter("kosplitter.py");
    o_cmdpath = splitterpath;
}
|
||||
|
||||
static bool initCmd()
|
||||
{
|
||||
if (o_starterror) {
|
||||
// No use retrying
|
||||
return false;
|
||||
}
|
||||
if (o_talker) {
|
||||
return true;
|
||||
}
|
||||
if (o_cmdpath.empty()) {
|
||||
return false;
|
||||
}
|
||||
if (nullptr == (o_talker = new CmdTalk(300))) {
|
||||
o_starterror = true;
|
||||
return false;
|
||||
}
|
||||
if (!o_talker->startCmd(o_cmdpath)) {
|
||||
delete o_talker;
|
||||
o_talker = nullptr;
|
||||
o_starterror = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Process a stretch of Korean text with the help of the external splitter.
//
// @param itp iterator on the input text, positioned on the first character
//     of the stretch. On return it is positioned on the first character
//     which does not belong to the stretch, or at eof.
// @param cp receives the value of the last character examined.
// @return false if the splitter could not be started or used, or if
//     takeword() returned false for one of the words.
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
{
    // The talker and its state are shared, process-wide.
    std::unique_lock<std::mutex> mylock(o_mutex);
    if (nullptr == o_talker) {
        if (!initCmd()) {
            return false;
        }
    }
    LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n");
    Utf8Iter &it = *itp;
    unsigned int c = 0;
    unordered_map<string, string> args;
    args.insert(pair<string,string>{"data", string()});
    // Build the text in place inside the request to avoid a copy
    string& inputdata{args.begin()->second};

    // Gather all Korean characters and send the text to the analyser.
    // ASCII white space and punctuation do not end the stretch.
    for (; !it.eof(); it++) {
        c = *it;
        if (!isHANGUL(c) && !(isascii(c) && (isspace(c) || ispunct(c)))) {
            // Done with Korean stretch, process and go back to main routine
            break;
        } else {
            it.appendchartostring(inputdata);
        }
    }
    LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
            " bytes " << inputdata << endl);
    unordered_map<string,string> result;
    if (!o_talker->talk(args, result)) {
        LOGERR("Python splitter for Korean failed\n");
        return false;
    }
    auto resit = result.find("data");
    if (resit == result.end()) {
        LOGERR("No data in Python splitter for Korean\n");
        return false;
    }
    string& outdata = resit->second;
    const char sepchar = '^';

    // The answer is a list of words, each one followed by the separator.
    // Walk the separators and emit what lies between them. Empty fields
    // (answer ending with a separator, doubled separators, empty answer)
    // are not words: skip them so that no empty term is sent out and no
    // term position is consumed for nothing.
    string::size_type wordstart = 0;
    while (wordstart < outdata.size()) {
        string::size_type wordend = outdata.find(sepchar, wordstart);
        if (wordend == string::npos) {
            wordend = outdata.size();
        }
        if (wordend > wordstart) {
            string word = outdata.substr(wordstart, wordend - wordstart);
            if (!takeword(word, m_wordpos++, 0, 0)) {
                return false;
            }
        }
        wordstart = wordend + 1;
    }

    // Reset state, saving term position, and return the found non-cjk
    // Unicode character value. The current input byte offset is kept
    // in the utf8Iter
    int pos = m_wordpos;
    clearsplitstate();
    m_spanpos = m_wordpos = pos;
    *cp = c;
    return true;
}
|
||||
234
src/filters/cmdtalk.py
Normal file
234
src/filters/cmdtalk.py
Normal file
@ -0,0 +1,234 @@
|
||||
#################################
|
||||
# Copyright (C) 2016 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
########################################################
|
||||
# Command communication module and utilities. See commands in cmdtalk.h
|
||||
#
|
||||
# All data is binary. This is important for Python3
|
||||
# All parameter names are converted to and processed as str/unicode
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
import getopt
|
||||
import traceback
|
||||
|
||||
# True when running under Python 3 or later. The numeric major version is
# compared: comparing the sys.version string is lexicographic and would give
# the wrong answer for a two-digit major version ('10...' sorts before '3').
PY3 = sys.version_info[0] >= 3

if PY3:
    def makebytes(data):
        """Return data as bytes.

        bytes values are returned unchanged, anything else is expected to
        be a str and is encoded as UTF-8.
        """
        if isinstance(data, bytes):
            return data
        return data.encode("UTF-8")
else:
    def makebytes(data):
        """Return data as a byte string.

        unicode values are encoded as UTF-8, anything else is returned
        unchanged.
        """
        if isinstance(data, unicode):
            return data.encode("UTF-8")
        return data
|
||||
|
||||
|
||||
############################################
|
||||
# CmdTalk implements the
|
||||
# communication protocol with the master process. It calls an external
|
||||
# method to use the args and produce return data.
|
||||
class CmdTalk:
    """Script side of the command communication protocol.

    Reads request messages from the master process on infile, hands the
    decoded parameters to a processor object, and writes what the
    processor returns to outfile.
    """

    def __init__(self, outfile=sys.stdout, infile=sys.stdin, exitfunc=None):
        # Program name, used as a prefix in log messages
        try:
            self.myname = os.path.basename(sys.argv[0])
        except:
            self.myname = "???"

        self.outfile = outfile
        self.infile = infile
        # Optional callable, run with the exit value just before exiting
        self.exitfunc = exitfunc
        self.fields = {}

        if sys.platform == "win32":
            # Switch both streams to binary mode
            import msvcrt
            for stream in (self.outfile, self.infile):
                msvcrt.setmode(stream.fileno(), os.O_BINARY)

        # Set debugfile to a file name to have log() append to that file
        # instead of writing to stderr.
        self.debugfile = None
        self.errfout = open(self.debugfile, "a") if self.debugfile \
            else sys.stderr

    def log(self, s, doexit = 0, exitvalue = 1):
        """Write a message to the error output, then exit with exitvalue
        if doexit is set."""
        print("CMDTALK: %s: %s" % (self.myname, s), file=self.errfout)
        if not doexit:
            return
        if self.exitfunc:
            self.exitfunc(exitvalue)
        sys.exit(exitvalue)

    def breakwrite(self, outfile, data):
        """Write data to outfile, in pieces of at most 4 KiB on Windows."""
        if sys.platform != "win32":
            outfile.write(data)
            return
        # On windows, writing big chunks can fail with a "not enough space"
        # error. Seems a combined windows/python bug, depending on versions.
        # See https://bugs.python.org/issue11395
        # In any case, just break it up
        bs = 4*1024
        for offset in range(0, len(data), bs):
            outfile.write(data[offset:offset+bs])

    def readparam(self):
        """Read a single parameter from the input.

        A parameter is a line holding the parameter name and the data
        size, followed by the data. Returns a (name, data) pair. The name
        is a str. The data is a byte string under Python 2 and a str
        decoded from UTF-8 under Python 3. An empty line yields ('', b'').
        The process exits at end of input or on a malformed line.
        """
        inf = self.infile.buffer if PY3 else self.infile
        s = inf.readline()
        if s == b'':
            # End of input: our master is gone
            if self.exitfunc:
                self.exitfunc(0)
            sys.exit(0)

        s = s.rstrip(b'\n')
        if s == b'':
            return ('', b'')

        l = s.split()
        if len(l) != 2:
            self.log(b'bad line: [' + s + b']', 1, 1)

        paramname = l[0].decode('ASCII').rstrip(':')
        paramsize = int(l[1])
        paramdata = b''
        if paramsize > 0:
            paramdata = inf.read(paramsize)
            if len(paramdata) != paramsize:
                self.log("Bad read: wanted %d, got %d" %
                         (paramsize, len(paramdata)), 1, 1)
        if PY3:
            paramdata = paramdata.decode('utf-8')

        return (paramname, paramdata)

    def senditem(self, nm, data):
        """Send one data element: a 'name: size' line, then the data."""
        # Under Python 3 the bytes go to the binary layer of the stream
        out = self.outfile.buffer if PY3 else self.outfile
        data = makebytes(data)
        l = len(data)
        out.write(makebytes("%s: %d\n" % (nm, l)))
        self.breakwrite(out, data)

    def answer(self, outfields):
        """Send a complete message made of all the items in outfields."""
        for nm, value in outfields.items():
            self.senditem(nm, value)

        # An empty line signals the end of the message
        print(file=self.outfile)
        self.outfile.flush()

    def processmessage(self, processor, params):
        """Call the processor with the input parameters, send the result."""
        # In normal usage we try to recover from processor errors, but
        # we sometimes want to see the real stack trace when testing
        safeexec = True
        if not safeexec:
            outfields = processor.process(params)
        else:
            try:
                outfields = processor.process(params)
            except Exception as err:
                self.log("processmessage: processor raised: [%s]" % err)
                traceback.print_exc()
                # Report the failure through the status fields
                outfields = {}
                outfields["cmdtalkstatus"] = "1"
                outfields["cmdtalkerrstr"] = str(err)

        self.answer(outfields)

    def mainloop(self, processor):
        """Process messages from our master, forever."""
        while True:
            params = dict()

            # Read at most 10 parameters (normally 1 or 2). The end of the
            # message is signalled by an empty parameter name (empty line).
            for _ in range(10):
                paramname, paramdata = self.readparam()
                if paramname == "":
                    break
                params[paramname] = paramdata

            # Got message, act on it
            self.processmessage(processor, params)
|
||||
|
||||
|
||||
# Common main routine for testing: either run the normal protocol
|
||||
# engine or a local loop.
|
||||
def main(proto, processor):
    """Run the protocol engine, or a single processor call for debugging.

    Without command line arguments, this runs proto.mainloop(processor).
    With arguments, these are taken as 'pname pvalue' pairs, the processor
    is called once with them, and each result item is printed to stdout
    as 'name->' followed by the value.
    """
    if len(sys.argv) == 1:
        proto.mainloop(processor)
        # mainloop does not return. Just in case
        sys.exit(1)

    # Not running the main loop: run one processor call for debugging
    def usage():
        print("Usage: cmdtalk.py pname pvalue [pname pvalue...]",
              file=sys.stderr)
        sys.exit(1)

    def debprint(out, s):
        proto.breakwrite(out, makebytes(s+'\n'))

    args = sys.argv[1:]
    if len(args) == 0 or len(args) % 2 != 0:
        usage()
    # Pair up names (even positions) and values (odd positions). The
    # pair count must be an integer: the original range(len(args)/2)
    # fails under Python 3 where '/' produces a float.
    params = dict()
    for i in range(len(args) // 2):
        params[args[2*i]] = args[2*i+1]
    res = processor.process(params)

    ioout = sys.stdout.buffer if PY3 else sys.stdout

    for nm, value in res.items():
        bdata = makebytes(value)
        debprint(ioout, "%s->" % nm)
        proto.breakwrite(ioout, bdata)
        ioout.write(b'\n')
|
||||
52
src/filters/kosplitter.py
Executable file
52
src/filters/kosplitter.py
Executable file
@ -0,0 +1,52 @@
|
||||
#!/usr/bin/python3
|
||||
#################################
|
||||
# Copyright (C) 2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
########################################################
|
||||
|
||||
#
|
||||
# Interface to the konlpy Korean text analyser: we receive text from
|
||||
# our parent process and have it segmented by the analyser, then
|
||||
# return the results. The analyser startup is very expensive (several
|
||||
# seconds), which is why we can't just execute it from the main
|
||||
# process.
|
||||
#
|
||||
|
||||
import sys
|
||||
import cmdtalk
|
||||
|
||||
from konlpy.tag import Okt
|
||||
|
||||
class Processor(object):
    """Segment Korean text with the konlpy Okt analyser.

    process() receives the text in the 'data' parameter and returns, also
    as 'data', the words which were kept, each followed by a '^' separator.
    """

    # Part-of-speech tags of the words which are returned
    KEPT_TAGS = frozenset(('Noun', 'Verb', 'Adjective', 'Adverb'))

    def __init__(self, proto):
        self.proto = proto
        self.okt = Okt()

    def process(self, params):
        if 'data' not in params:
            msg = 'No data field in parameters'
            # 'cmdtalkstatus' / 'cmdtalkerrstr' are the fields which the
            # protocol uses to report a failure to the master process.
            # The original 'error' field is kept too.
            return {'error': msg,
                    'cmdtalkstatus': '1',
                    'cmdtalkerrstr': msg}
        pos = self.okt.pos(params['data'])
        # Each element of pos is indexed as (word, tag)
        kept = [e[0] + '^' for e in pos if e[1] in self.KEPT_TAGS]
        return {'data': ''.join(kept)}
|
||||
|
||||
# Script entry point: create the protocol engine with its default streams,
# build the processor (this creates the analyser), then hand control to
# the message loop.
proto = cmdtalk.CmdTalk()
processor = Processor(proto)
proto.mainloop(processor)
|
||||
243
src/utils/cmdtalk.cpp
Normal file
243
src/utils/cmdtalk.cpp
Normal file
@ -0,0 +1,243 @@
|
||||
/* Copyright (C) 2016 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation; either version 2.1 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
#include "cmdtalk.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <mutex>
|
||||
|
||||
#include "smallut.h"
|
||||
#include "execmd.h"
|
||||
#ifdef MDU_INCLUDE_LOG
|
||||
#include MDU_INCLUDE_LOG
|
||||
#else
|
||||
#include "log.h"
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
|
||||
class TimeoutExcept {};
|
||||
|
||||
class Canceler : public ExecCmdAdvise {
|
||||
public:
|
||||
Canceler(int tmsecs)
|
||||
: m_timeosecs(tmsecs) {}
|
||||
|
||||
virtual void newData(int cnt) {
|
||||
if (m_starttime && (time(0) - m_starttime) > m_timeosecs) {
|
||||
throw TimeoutExcept();
|
||||
}
|
||||
}
|
||||
|
||||
void reset() {
|
||||
m_starttime = time(0);
|
||||
}
|
||||
int m_timeosecs;
|
||||
time_t m_starttime{0};
|
||||
};
|
||||
|
||||
// Private implementation data and methods for CmdTalk.
class CmdTalk::Internal {
public:
    // The command object. Allocated by CmdTalk::startCmd(), owned by us.
    ExecCmd *cmd{nullptr};
    // Timeout watcher, set as the advise object of the command
    Canceler m_cancel;
    // Held during a whole talk() exchange
    std::mutex mmutex;

    Internal(int timeosecs)
        : m_cancel(timeosecs) {}

    ~Internal() {
        delete cmd;
    }

    // Read one element (name and data) of the answer
    bool readDataElement(string& name, string &data);

    // Send a request made of arg0 (if its name is not empty) and args,
    // and store the elements of the answer in rep
    bool talk(const pair<string, string>& arg0,
              const unordered_map<string, string>& args,
              unordered_map<string, string>& rep);
};
|
||||
|
||||
// Create the private data. timeosecs is handed to the timeout watcher.
// No command is started here, see startCmd().
CmdTalk::CmdTalk(int timeosecs)
    : m(new Internal(timeosecs))
{
}
|
||||
// Release the private data, which in turn deletes the command object.
CmdTalk::~CmdTalk()
{
    delete m;
}
|
||||
|
||||
bool CmdTalk::startCmd(const string& cmdname,
|
||||
const vector<string>& args,
|
||||
const vector<string>& env,
|
||||
const vector<string>& path)
|
||||
{
|
||||
LOGDEB("CmdTalk::startCmd\n" );
|
||||
|
||||
delete m->cmd;
|
||||
m->cmd = new ExecCmd;
|
||||
m->cmd->setAdvise(&m->m_cancel);
|
||||
|
||||
for (const auto& it : env) {
|
||||
m->cmd->putenv(it);
|
||||
}
|
||||
|
||||
string acmdname(cmdname);
|
||||
if (!path.empty()) {
|
||||
string colonpath;
|
||||
for (const auto& it: path) {
|
||||
colonpath += it + ":";
|
||||
}
|
||||
if (!colonpath.empty()) {
|
||||
colonpath.erase(colonpath.size()-1);
|
||||
}
|
||||
LOGDEB("CmdTalk::startCmd: PATH: [" << colonpath << "]\n");
|
||||
ExecCmd::which(cmdname, acmdname, colonpath.c_str());
|
||||
}
|
||||
|
||||
if (m->cmd->startExec(acmdname, args, 1, 1) < 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Messages are made of data elements. Each element is like:
|
||||
// name: len\ndata
|
||||
// An empty line signals the end of the message, so the whole thing
|
||||
// would look like:
|
||||
// Name1: Len1\nData1Name2: Len2\nData2\n
|
||||
// Read one data element (name and data) from the command.
// @param[out] name element name as found on the input line (the caller
//    trims the colon). Empty on return if the end of message (empty
//    line) was read.
// @param[out] data element data.
// @return false for a read error, a timeout, or bad data.
bool CmdTalk::Internal::readDataElement(string& name, string &data)
{
    string ibuf;

    // An empty name is how the end of message is reported: make sure
    // that this does not depend on what the caller passed in.
    name.clear();

    m_cancel.reset();
    // TimeoutExcept is local to this file, nobody else could catch it,
    // so it must not escape. Keep both reads inside the try block:
    // presumably the timeout watcher can be called while the data is
    // received as well as while the first line is read (to be confirmed
    // against ExecCmd).
    try {
        // Read name and length
        if (cmd->getline(ibuf) <= 0) {
            LOGERR("CmdTalk: getline error\n" );
            return false;
        }

        LOGDEB1("CmdTalk:rde: line [" << (ibuf) << "]\n" );

        // Empty line (end of message) ?
        if (!ibuf.compare("\n")) {
            LOGDEB("CmdTalk: Got empty line\n" );
            return true;
        }

        // We're expecting something like Name: len\n
        vector<string> tokens;
        stringToTokens(ibuf, tokens);
        if (tokens.size() != 2) {
            LOGERR("CmdTalk: bad line in filter output: [" << (ibuf) << "]\n" );
            return false;
        }
        vector<string>::iterator it = tokens.begin();
        name = *it++;
        string& slen = *it;
        int len;
        if (sscanf(slen.c_str(), "%d", &len) != 1) {
            LOGERR("CmdTalk: bad line in filter output: [" << (ibuf) << "]\n" );
            return false;
        }

        // Read element data
        data.erase();
        if (len > 0 && cmd->receive(data, len) != len) {
            LOGERR("CmdTalk: expected " << len << " bytes of data, got " <<
                   data.length() << "\n");
            return false;
        }
        LOGDEB1("CmdTalk:rde: got: name [" << name << "] len " << len <<"value ["<<
                (data.size() > 100 ? (data.substr(0, 100) + " ...") : data)<< endl);
    } catch (const TimeoutExcept&) {
        LOGINF("CmdTalk:readDataElement: fatal timeout (" <<
               m_cancel.m_timeosecs << " S)\n");
        return false;
    }
    return true;
}
|
||||
|
||||
bool CmdTalk::Internal::talk(const pair<string, string>& arg0,
|
||||
const unordered_map<string, string>& args,
|
||||
unordered_map<string, string>& rep)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mmutex);
|
||||
if (cmd->getChildPid() <= 0) {
|
||||
LOGERR("CmdTalk::talk: no process\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
ostringstream obuf;
|
||||
if (!arg0.first.empty()) {
|
||||
obuf << arg0.first << ": " << arg0.second.size() << "\n" << arg0.second;
|
||||
}
|
||||
for (const auto& it : args) {
|
||||
obuf << it.first << ": " << it.second.size() << "\n" << it.second;
|
||||
}
|
||||
obuf << "\n";
|
||||
|
||||
if (cmd->send(obuf.str()) < 0) {
|
||||
cmd->zapChild();
|
||||
LOGERR("CmdTalk: send error\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read answer (multiple elements)
|
||||
LOGDEB1("CmdTalk: reading answer\n" );
|
||||
for (;;) {
|
||||
string name, data;
|
||||
if (!readDataElement(name, data)) {
|
||||
cmd->zapChild();
|
||||
return false;
|
||||
}
|
||||
if (name.empty()) {
|
||||
break;
|
||||
}
|
||||
trimstring(name, ":");
|
||||
LOGDEB1("CmdTalk: got [" << name << "] -> [" << data << "]\n");
|
||||
rep[name] = data;
|
||||
}
|
||||
|
||||
if (rep.find("cmdtalkstatus") != rep.end()) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool CmdTalk::running()
|
||||
{
|
||||
return m && m->cmd && m->cmd->getChildPid() > 0;
|
||||
}
|
||||
|
||||
bool CmdTalk::talk(const unordered_map<string, string>& args,
|
||||
unordered_map<string, string>& rep)
|
||||
{
|
||||
return m->talk({"",""}, args, rep);
|
||||
}
|
||||
|
||||
bool CmdTalk::callproc(
|
||||
const string& proc,
|
||||
const unordered_map<std::string, std::string>& args,
|
||||
unordered_map<std::string, std::string>& rep)
|
||||
{
|
||||
return m->talk({"cmdtalk:proc", proc}, args, rep);
|
||||
}
|
||||
|
||||
|
||||
105
src/utils/cmdtalk.h
Normal file
105
src/utils/cmdtalk.h
Normal file
@ -0,0 +1,105 @@
|
||||
/* Copyright (C) 2016 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation; either version 2.1 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
#ifndef _CMDTALK_H_INCLUDED_
|
||||
#define _CMDTALK_H_INCLUDED_
|
||||
|
||||
/**
|
||||
* Execute a command and exchange messages with it.
|
||||
*
|
||||
* A simple stream protocol is used for the dialog. HTTP or some kind
|
||||
* of full-blown RPC could have been used, but there was also good
|
||||
* reason to keep it simple (yet powerful), given the limited context
|
||||
* of dialog through a pipe.
|
||||
*
|
||||
* The data is exchanged in TLV fashion, in a way that should be
|
||||
* usable in most script languages. The basic unit of data has one line
|
||||
* with a data type and a count (both ASCII), followed by the data. A
|
||||
* 'message' is made of one or several units or tags and ends with one empty
|
||||
* line.
|
||||
*
|
||||
* Example:(the message begins before 'Filename' and has 'Filename' and
|
||||
* 'Ipath' tags):
|
||||
*
|
||||
Filename: 24
|
||||
/my/home/mail/somefolderIpath: 2
|
||||
22
|
||||
|
||||
<Message ends here: because of the empty line after '22'
|
||||
|
||||
*
|
||||
* Example answer, with 'Mimetype' and 'Data' tags
|
||||
*
|
||||
Mimetype: 10
|
||||
text/plainData: 10
|
||||
0123456789
|
||||
|
||||
<Message ends here because of empty line
|
||||
|
||||
*
|
||||
* This format is both extensible and reasonably easy to parse.
|
||||
* While it's more fitted for python or perl on the script side, it
|
||||
* should even be sort of usable from the shell (e.g.: use dd to read
|
||||
* the counted data). Most alternatives would need data encoding in
|
||||
* some cases.
|
||||
*
|
||||
* Higher level dialog:
|
||||
* The C++ program is the master and sends request messages to the script.
|
||||
* Both sides of the communication should be prepared to receive and discard
|
||||
* unknown tags.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
class CmdTalk {
public:
    // @param timeosecs time limit, in seconds, handed to the object
    //   which watches over the reading of answers.
    CmdTalk(int timeosecs);
    virtual ~CmdTalk();

    // Not copyable
    CmdTalk(const CmdTalk&) = delete;
    CmdTalk &operator=(const CmdTalk &) = delete;

    // Start the command.
    // @param cmdname command name or path.
    // @param args command arguments.
    // @param env each entry should be of the form name=value. They
    //   augment the subprocess environment.
    // @param path replaces the PATH variable when looking for the command.
    // @return false if the command could not be started.
    virtual bool startCmd(const std::string& cmdname,
                          const std::vector<std::string>& args = {},
                          const std::vector<std::string>& env = {},
                          const std::vector<std::string>& path = {});

    // @return true if the command was started and has a process.
    virtual bool running();

    // Single exchange: send and receive data.
    // @param args the elements of the request, as name/value pairs.
    // @param[out] rep the elements of the answer.
    virtual bool talk(const std::unordered_map<std::string, std::string>& args,
                      std::unordered_map<std::string, std::string>& rep);

    // Specialized version with special argument used by dispatcher to call
    // designated method
    virtual bool callproc(
        const std::string& proc,
        const std::unordered_map<std::string, std::string>& args,
        std::unordered_map<std::string, std::string>& rep);

private:
    // Implementation details are kept out of the header
    class Internal;
    Internal *m{nullptr};
};
|
||||
|
||||
#endif /* _CMDTALK_H_INCLUDED_ */
|
||||
Loading…
x
Reference in New Issue
Block a user