Have RclExecM inherit from the shared CmdTalk, now that the latter is used anyway for the Korean splitter. Main differences: cmdtalk strips the colon from param names and does not lowercase them.

This commit is contained in:
Jean-Francois Dockes 2020-03-27 11:07:51 +01:00
parent 9a3cea2728
commit 90dd64fc61
17 changed files with 101 additions and 184 deletions

View File

@ -46,10 +46,10 @@ else:
############################################
# CmdTalk implements the
# communication protocol with the master process. It calls an external
# method to use the args and produce return data.
class CmdTalk:
# CmdTalk implements the communication protocol with the master
# process. It calls an external method to use the args and produce
# return data.
class CmdTalk(object):
def __init__(self, outfile=sys.stdout, infile=sys.stdin, exitfunc=None):
try:
@ -66,7 +66,15 @@ class CmdTalk:
import msvcrt
msvcrt.setmode(self.outfile.fileno(), os.O_BINARY)
msvcrt.setmode(self.infile.fileno(), os.O_BINARY)
self.debugfile = None
try:
self.debugfile
except:
self.debugfile = None
try:
self.nodecodeinput
except:
self.nodecodeinput = False
if self.debugfile:
self.errfout = open(self.debugfile, "a")
else:
@ -131,7 +139,7 @@ class CmdTalk:
(paramsize, len(paramdata)), 1, 1)
else:
paramdata = b''
if PY3:
if PY3 and not self.nodecodeinput:
paramdata = paramdata.decode('utf-8')
#self.log("paramname [%s] paramsize %d value [%s]" %

View File

@ -58,7 +58,7 @@ class SevenZipExtractor:
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
filename = params["filename:"]
filename = params["filename"]
self.currentindex = -1
self.skiplist = []
@ -78,7 +78,7 @@ class SevenZipExtractor:
return False
def getipath(self, params):
ipath = params["ipath:"]
ipath = params["ipath"]
ok, data, ipath, eof = self.extractone(ipath)
if ok:
return (ok, data, ipath, eof)

View File

@ -37,15 +37,15 @@ class RclBaseHandler(object):
def extractone(self, params):
#self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \
# params["mimetype:"]))
if not "filename:" in params:
#self.em.rclog("extractone fn %s mt %s" % (params["filename"], \
# params["mimetype"]))
if not "filename" in params:
self.em.rclog("extractone: no file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
fn = params["filename"]
if "mimetype:" in params:
self.inputmimetype = params["mimetype:"]
if "mimetype" in params:
self.inputmimetype = params["mimetype"]
else:
self.inputmimetype = None

View File

@ -295,7 +295,7 @@ class rclCHM:
self.currentindex = -1
self.contents = []
filename = params["filename:"]
filename = params["filename"]
if not self.chm.LoadCHM(filename):
self.em.rclog("LoadCHM failed")
return False
@ -336,7 +336,7 @@ class rclCHM:
return True
def getipath(self, params):
return self.extractone(params["ipath:"])
return self.extractone(params["ipath"])
def getnext(self, params):
if self.catenate:

View File

@ -106,7 +106,7 @@ class rclEPUB:
self.currentindex = -1
self.contents = []
try:
self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
self.book = epub.open_epub(params["filename"].decode('UTF-8'))
except Exception as err:
self.em.rclog("openfile: epub.open failed: [%s]" % err)
return False
@ -117,7 +117,7 @@ class rclEPUB:
def getipath(self, params):
return self.extractone(params["ipath:"].decode('UTF-8'))
return self.extractone(params["ipath"].decode('UTF-8'))
def getnext(self, params):
if self.catenate:

View File

@ -92,15 +92,15 @@ class Executor(RclBaseHandler):
return True, data
def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \
# params["mimetype:"]))
#self.em.rclog("extractone %s %s" % (params["filename"], \
# params["mimetype"]))
self.flt.reset()
ok = False
if not "filename:" in params:
if not "filename" in params:
self.em.rclog("extractone: no file name")
return (ok, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
fn = params["filename"]
while True:
cmdseq = self.flt.getCmd(fn)
cmd = cmdseq[0]

View File

@ -1,5 +1,5 @@
#################################
# Copyright (C) 2014 J.F.Dockes
# Copyright (C) 2014-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@ -28,6 +28,7 @@ import tempfile
import shutil
import getopt
import rclconfig
import cmdtalk
PY3 = (sys.version > '3')
_mswindows = (sys.platform == "win32")
@ -66,10 +67,11 @@ def configparamtrue(value):
my_config = rclconfig.RclConfig()
############################################
# RclExecM implements the
# communication protocol with the recollindex process. It calls the
# object specific of the document type to actually get the data.
class RclExecM:
# RclExecM implements the communication protocol with the recollindex
# process. It calls the object specific to the document type to
# actually get the data.
class RclExecM(cmdtalk.CmdTalk):
noteof = 0
eofnext = 1
eofnow = 2
@ -79,62 +81,30 @@ class RclExecM:
fileerror = 2
def __init__(self):
try:
self.myname = os.path.basename(sys.argv[0])
except:
self.myname = "???"
self.mimetype = b""
self.fields = {}
if os.environ.get("RECOLL_FILTER_MAXMEMBERKB"):
self.maxmembersize = \
int(os.environ.get("RECOLL_FILTER_MAXMEMBERKB"))
else:
try:
self.maxmembersize = int(os.environ["RECOLL_FILTER_MAXMEMBERKB"])
except:
self.maxmembersize = 50 * 1024
self.maxmembersize = self.maxmembersize * 1024
if sys.platform == "win32":
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
self.debugfile = my_config.getConfParam("filterdebuglog")
if self.debugfile:
self.errfout = open(self.debugfile, "a")
else:
self.errfout = sys.stderr
def rclog(self, s, doexit = 0, exitvalue = 1):
# On windows, and I think that it changed quite recently (Qt change?)
# we get stdout as stderr. So don't write at all
if self.debugfile or sys.platform != "win32":
print("RCLMFILT: %s: %s" % (self.myname, s), file=self.errfout)
self.errfout.flush()
if doexit:
sys.exit(exitvalue)
def breakwrite(self, outfile, data):
if sys.platform != "win32":
outfile.write(data)
else:
# On windows, writing big chunks can fail with a "not enough space"
# error. Seems a combined windows/python bug, depending on versions.
# See https://bugs.python.org/issue11395
# In any case, just break it up
total = len(data)
bs = 4*1024
offset = 0
while total > 0:
if total < bs:
tow = total
else:
tow = bs
#self.rclog("Total %d Writing %d to stdout: %s" % (total,tow,data[offset:offset+tow]))
outfile.write(data[offset:offset+tow])
offset += tow
total -= tow
# Tell cmdtalk where to log
self.debugfile = my_config.getConfParam("filterdebuglog")
# Some of our params are binary, cmdtalk should not decode them
self.nodecodeinput = True
super().__init__()
def rclog(self, s, doexit = 0, exitvalue = 1):
# On windows, and I think that it changed quite recently (Qt
# change?), we get stdout as stderr?? So don't write at all if
# output not a file until this mystery is solved
if self.debugfile or sys.platform != "win32":
super().log(s, doexit, exitvalue)
# Note: tried replacing this with a multiple replacer according to
# http://stackoverflow.com/a/15221068, which was **10 times** slower
# http://stackoverflow.com/a/15221068, which was **10 times** slower
def htmlescape(self, txt):
# &amp must stay first (it somehow had managed to skip
# after the next replace, with rather interesting results)
@ -153,94 +123,36 @@ class RclExecM:
def setfield(self, nm, value):
self.fields[nm] = value
# Read single parameter from process input: line with param name and size
# followed by data. The param name is returned as str/unicode, the data
# as bytes
def readparam(self):
if PY3:
inf = sys.stdin.buffer
else:
inf = sys.stdin
s = inf.readline()
if s == b'':
sys.exit(0)
s = s.rstrip(b'\n')
if s == b'':
return ('', b'')
l = s.split()
if len(l) != 2:
self.rclog(b'bad line: [' + s + b']', 1, 1)
paramname = l[0].decode('ASCII').lower()
paramsize = int(l[1])
if paramsize > 0:
paramdata = inf.read(paramsize)
if len(paramdata) != paramsize:
self.rclog("Bad read: wanted %d, got %d" %
(paramsize, len(paramdata)), 1, 1)
else:
paramdata = b''
#self.rclog("paramname [%s] paramsize %d value [%s]" %
# (paramname, paramsize, paramdata))
return (paramname, paramdata)
if PY3:
def senditem(self, nm, data):
data = makebytes(data)
l = len(data)
sys.stdout.buffer.write(makebytes("%s: %d\n" % (nm, l)))
self.breakwrite(sys.stdout.buffer, data)
else:
def senditem(self, nm, data):
data = makebytes(data)
l = len(data)
sys.stdout.write(makebytes("%s: %d\n" % (nm, l)))
self.breakwrite(sys.stdout, data)
# Send answer: document, ipath, possible eof.
def answer(self, docdata, ipath, iseof = noteof, iserror = noerror):
if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow:
self.senditem("Document", docdata)
self.fields["Document"] = docdata
if len(ipath):
self.senditem("Ipath", ipath)
self.fields["Ipath"] = ipath
if len(self.mimetype):
self.senditem("Mimetype", self.mimetype)
for nm,value in self.fields.items():
#self.rclog("Senditem: [%s] -> [%s]" % (nm, value))
self.senditem("%s:"%nm, value)
self.fields = {}
self.fields["Mimetype"] = self.mimetype
# If we're at the end of the contents, say so
if iseof == RclExecM.eofnow:
self.senditem("Eofnow", b'')
self.fields["Eofnow"] = b''
elif iseof == RclExecM.eofnext:
self.senditem("Eofnext", b'')
self.fields["Eofnext"] = b''
if iserror == RclExecM.subdocerror:
self.senditem("Subdocerror", b'')
self.fields["Subdocerror"] = b''
elif iserror == RclExecM.fileerror:
self.senditem("Fileerror", b'')
self.fields["Fileerror"] = b''
super().answer(self.fields)
self.fields = {}
# End of message
print()
sys.stdout.flush()
#self.rclog("done writing data")
def processmessage(self, processor, params):
# We must have a filename entry (even empty). Else exit
if "filename:" not in params:
if "filename" not in params:
print("%s" % params, file=sys.stderr)
self.rclog("no filename ??", 1, 1)
# If we're given a file name, open it.
if len(params["filename:"]) != 0:
if len(params["filename"]) != 0:
try:
if not processor.openfile(params):
self.answer("", "", iserror = RclExecM.fileerror)
@ -255,7 +167,7 @@ class RclExecM:
eof = True
self.mimetype = ""
try:
if "ipath:" in params and len(params["ipath:"]):
if "ipath" in params and len(params["ipath"]):
ok, data, ipath, eof = processor.getipath(params)
else:
ok, data, ipath, eof = processor.getnext(params)
@ -270,13 +182,10 @@ class RclExecM:
else:
self.answer("", "", eof, RclExecM.subdocerror)
# Loop on messages from our master
# Main routine: loop on messages from our master
def mainloop(self, processor):
while 1:
#self.rclog("waiting for command")
params = dict()
# Read at most 10 parameters (normally 1 or 2), stop at empty line
# End of message is signalled by empty paramname
for i in range(10):
@ -284,7 +193,6 @@ class RclExecM:
if paramname == "":
break
params[paramname] = paramdata
# Got message, act on it
self.processmessage(processor, params)
@ -326,6 +234,7 @@ def which(program):
return candidate
return None
# Temp dir helper
class SafeTmpDir:
def __init__(self, em):
@ -355,6 +264,7 @@ class SafeTmpDir:
return self.tmpdir
# Common main routine for all python execm filters: either run the
# normal protocol engine or a local loop to test without recollindex
def main(proto, extract):
@ -369,8 +279,7 @@ def main(proto, extract):
def usage():
print("Usage: rclexecm.py [-d] [-s] [-i ipath] <filename>",
file=sys.stderr)
print(" rclexecm.py -w <prog>",
file=sys.stderr)
print(" rclexecm.py -w <prog>", file=sys.stderr)
sys.exit(1)
actAsSingle = False
@ -419,7 +328,7 @@ def main(proto, extract):
if not actAsSingle:
proto.breakwrite(out, makebytes(s+'\n'))
params = {'filename:': makebytes(path)}
params = {'filename' : makebytes(path)}
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer.
# We make a half-assed attempt to emulate:
@ -427,7 +336,7 @@ def main(proto, extract):
if not mimetype and not _mswindows:
mimetype = mimetype_with_file(path)
if mimetype:
params['mimetype:'] = mimetype
params['mimetype'] = mimetype
if not extract.openfile(params):
print("Open error", file=sys.stderr)
@ -438,7 +347,7 @@ def main(proto, extract):
else:
ioout = sys.stdout
if ipath != b"" or actAsSingle:
params['ipath:'] = ipath
params['ipath'] = ipath
ok, data, ipath, eof = extract.getipath(params)
if ok:
debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \

View File

@ -53,7 +53,7 @@ class IcalExtractor:
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.file = params["filename:"]
self.file = params["filename"]
try:
calstr = open(self.file, "rb")
@ -91,10 +91,10 @@ class IcalExtractor:
def getipath(self, params):
try:
if params["ipath:"] == b'':
if params["ipath"] == b'':
index = 0
else:
index = int(params["ipath:"])
index = int(params["ipath"])
except:
return (False, "", "", True)
return self.extractone(index)

View File

@ -50,7 +50,7 @@ class InfoExtractor:
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.file = params["filename:"]
self.file = params["filename"]
if not os.path.isfile(self.file):
self.em.rclog("Openfile: %s is not a file" % self.file)
@ -78,7 +78,7 @@ class InfoExtractor:
# Extract specific node
def getipath(self, params):
try:
index = int(params["ipath:"])
index = int(params["ipath"])
except:
return (False, "", "", True)
return self.extractone(index)

View File

@ -439,7 +439,7 @@ class PDFExtractor:
print("RECFILTERROR HELPERNOTFOUND pdftotext")
sys.exit(1);
self.filename = rclexecm.subprocfile(params["filename:"])
self.filename = rclexecm.subprocfile(params["filename"])
#self.em.rclog("openfile: [%s]" % self.filename)
self.currentindex = -1
@ -458,7 +458,7 @@ class PDFExtractor:
return True
def getipath(self, params):
ipath = params["ipath:"]
ipath = params["ipath"]
ok, data, ipath, eof = self.extractone(ipath)
return (ok, data, ipath, eof)

View File

@ -330,13 +330,13 @@ class PstExtractor(object):
if not self.pffexport:
print("RECFILTERROR HELPERNOTFOUND pffexport")
sys.exit(1);
self.filename = params["filename:"]
self.filename = params["filename"]
self.generator = None
return True
def getipath(self, params):
ipath = met_join(self.target + ".export",
params["ipath:"].decode("UTF-8"))
params["ipath"].decode("UTF-8"))
self.em.rclog("getipath: [%s]" % ipath)
if not self.startCmd(self.filename, ipath=ipath):
return (False, "", "", rclexecm.RclExecM.eofnow)

View File

@ -109,14 +109,14 @@ class RarExtractor:
# wrong on Unix, but I'd have to dig further in the
# lib than I wish to. This is used on Windows anyway,
# where all Recoll paths are utf-8
fn = params["filename:"].decode("UTF-8")
fn = params["filename"].decode("UTF-8")
self.rar = rarfile.RarFile(fn, 'rb')
else:
# The previous versions passed the file name to
# RarFile. But the py3 version of this wants an str as
# input, which is wrong of course, as filenames are
# binary. Circumvented by passing the open file
f = open(params["filename:"], 'rb')
f = open(params["filename"], 'rb')
self.rar = RarFile(f)
return True
except Exception as err:
@ -124,7 +124,7 @@ class RarExtractor:
return False
def getipath(self, params):
ipath = params["ipath:"]
ipath = params["ipath"]
ok, data, ipath, eof = self.extractone(ipath)
if ok:
return (ok, data, ipath, eof)

View File

@ -48,7 +48,7 @@ class TarExtractor:
def openfile(self, params):
self.currentindex = -1
try:
self.tar = tarfile.open(name=params["filename:"], mode='r')
self.tar = tarfile.open(name=params["filename"], mode='r')
#self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]]
@ -57,7 +57,7 @@ class TarExtractor:
return False
def getipath(self, params):
ipath = params["ipath:"]
ipath = params["ipath"]
ok, data, ipath, eof = self.extractone(ipath)
if ok:
return (ok, data, ipath, eof)

View File

@ -39,7 +39,7 @@ class rclTXTLINES:
"""Open the text file, create a contents array"""
self.currentindex = -1
try:
f = open(params["filename:"].decode('UTF-8'), "r")
f = open(params["filename"].decode('UTF-8'), "r")
except Exception as err:
self.em.rclog("openfile: open failed: [%s]" % err)
return False
@ -80,7 +80,7 @@ class rclTXTLINES:
# numbers, but they could be tar archive paths or whatever we
# returned during indexing.
def getipath(self, params):
return self.extractone(int(params["ipath:"]))
return self.extractone(int(params["ipath"]))
# Most handlers factorize common code from getipath() and
# getnext() in an extractone() method, but this is not part of the

View File

@ -31,14 +31,14 @@ class WarExtractor:
def openfile(self, params):
self.currentindex = -1
try:
self.tar = tarfile.open(params["filename:"])
self.tar = tarfile.open(params["filename"])
return True
except Exception as err:
self.em.rclog(str(err))
return False
def getipath(self, params):
ipath = params["ipath:"]
ipath = params["ipath"]
try:
tarinfo = self.tar.getmember(ipath)
except Exception as err:

View File

@ -126,7 +126,7 @@ class ZipExtractor:
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.closefile()
filename = params["filename:"]
filename = params["filename"]
self.filename = filename
self.currentindex = -1
self.skiplist = []
@ -157,7 +157,7 @@ class ZipExtractor:
return False
def getipath(self, params):
ipath = params["ipath:"]
ipath = params["ipath"]
ok, data, ipath, eof = self.extractone(ipath)
if ok:
return (ok, data, ipath, eof)

View File

@ -187,22 +187,22 @@ bool MimeHandlerExecMultiple::next_document()
"]: " << reason << "\n");
}
}
obuf << "FileName: " << m_fn.length() << "\n" << m_fn;
obuf << "filename: " << m_fn.length() << "\n" << m_fn;
// m_filefirst is set to true by set_document_file()
m_filefirst = false;
} else {
obuf << "Filename: " << 0 << "\n";
obuf << "filename: " << 0 << "\n";
}
if (!m_ipath.empty()) {
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
m_ipath << "]\n");
obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath;
obuf << "ipath: " << m_ipath.length() << "\n" << m_ipath;
}
if (!m_dfltInputCharset.empty()) {
obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n"
obuf << "dflincs: " << m_dfltInputCharset.length() << "\n"
<< m_dfltInputCharset;
}
obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
obuf << "mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
obuf << "\n";
if (m_cmd.send(obuf.str()) < 0) {
m_cmd.zapChild();