From 90dd64fc616ccbf22172d97c3bf1b2d0efe89cd7 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 27 Mar 2020 11:07:51 +0100 Subject: [PATCH] Have RclExecM inherit the shared CmdTalk now that the latter is used anyway for the korean splitter. Main diff: cmdtalk strips the colon from param names and does not lowercase them --- src/filters/cmdtalk.py | 20 ++-- src/filters/rcl7z | 4 +- src/filters/rclbasehandler.py | 12 +-- src/filters/rclchm | 4 +- src/filters/rclepub | 4 +- src/filters/rclexec1.py | 8 +- src/filters/rclexecm.py | 183 +++++++++------------------------- src/filters/rclics | 6 +- src/filters/rclinfo | 4 +- src/filters/rclpdf.py | 4 +- src/filters/rclpst.py | 4 +- src/filters/rclrar | 6 +- src/filters/rcltar | 4 +- src/filters/rcltxtlines.py | 4 +- src/filters/rclwar | 4 +- src/filters/rclzip | 4 +- src/internfile/mh_execm.cpp | 10 +- 17 files changed, 101 insertions(+), 184 deletions(-) diff --git a/src/filters/cmdtalk.py b/src/filters/cmdtalk.py index 2949e936..99bcfb36 100644 --- a/src/filters/cmdtalk.py +++ b/src/filters/cmdtalk.py @@ -46,10 +46,10 @@ else: ############################################ -# CmdTalk implements the -# communication protocol with the master process. It calls an external -# method to use the args and produce return data. -class CmdTalk: +# CmdTalk implements the communication protocol with the master +# process. It calls an external method to use the args and produce +# return data. +class CmdTalk(object): def __init__(self, outfile=sys.stdout, infile=sys.stdin, exitfunc=None): try: @@ -66,7 +66,15 @@ class CmdTalk: import msvcrt msvcrt.setmode(self.outfile.fileno(), os.O_BINARY) msvcrt.setmode(self.infile.fileno(), os.O_BINARY) - self.debugfile = None + + try: + self.debugfile + except: + self.debugfile = None + try: + self.nodecodeinput + except: + self.nodecodeinput = False if self.debugfile: self.errfout = open(self.debugfile, "a") else: @@ -131,7 +139,7 @@ class CmdTalk: (paramsize, len(paramdata)), 1, 1) else: paramdata = b'' - if PY3: + if PY3 and not self.nodecodeinput: paramdata = paramdata.decode('utf-8') #self.log("paramname [%s] paramsize %d value [%s]" % diff --git a/src/filters/rcl7z b/src/filters/rcl7z index 446674d5..874ff66d 100755 --- a/src/filters/rcl7z +++ b/src/filters/rcl7z @@ -58,7 +58,7 @@ class SevenZipExtractor: ###### File type handler api, used by rclexecm ----------> def openfile(self, params): - filename = params["filename:"] + filename = params["filename"] self.currentindex = -1 self.skiplist = [] @@ -78,7 +78,7 @@ class SevenZipExtractor: return False def getipath(self, params): - ipath = params["ipath:"] + ipath = params["ipath"] ok, data, ipath, eof = self.extractone(ipath) if ok: return (ok, data, ipath, eof) diff --git a/src/filters/rclbasehandler.py b/src/filters/rclbasehandler.py index 101b7e42..904c30fb 100644 --- a/src/filters/rclbasehandler.py +++ b/src/filters/rclbasehandler.py @@ -37,15 +37,15 @@ class RclBaseHandler(object): def extractone(self, params): - #self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \ - # params["mimetype:"])) - if not "filename:" in params: + #self.em.rclog("extractone fn %s mt %s" % (params["filename"], \ + # params["mimetype"])) + if not "filename" in params: self.em.rclog("extractone: no file name") return (False, "", "", rclexecm.RclExecM.eofnow) - fn = params["filename:"] + fn = params["filename"] - if "mimetype:" in params: - self.inputmimetype = params["mimetype:"] + if "mimetype" in params: + self.inputmimetype = params["mimetype"] else: self.inputmimetype = None diff --git a/src/filters/rclchm b/src/filters/rclchm index ed77bcf7..b7c4bbaf 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -295,7 +295,7 @@ class rclCHM: self.currentindex = -1 self.contents = [] - filename = params["filename:"] + filename = params["filename"] if not self.chm.LoadCHM(filename): self.em.rclog("LoadCHM failed") return False @@ -336,7 +336,7 @@ class rclCHM: return True def getipath(self, params): - return self.extractone(params["ipath:"]) + return self.extractone(params["ipath"]) def getnext(self, params): if self.catenate: diff --git a/src/filters/rclepub b/src/filters/rclepub index 1276fc74..c98d369f 100755 --- a/src/filters/rclepub +++ b/src/filters/rclepub @@ -106,7 +106,7 @@ class rclEPUB: self.currentindex = -1 self.contents = [] try: - self.book = epub.open_epub(params["filename:"].decode('UTF-8')) + self.book = epub.open_epub(params["filename"].decode('UTF-8')) except Exception as err: self.em.rclog("openfile: epub.open failed: [%s]" % err) return False @@ -117,7 +117,7 @@ class rclEPUB: def getipath(self, params): - return self.extractone(params["ipath:"].decode('UTF-8')) + return self.extractone(params["ipath"].decode('UTF-8')) def getnext(self, params): if self.catenate: diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py index afcfa683..03831e29 100644 --- a/src/filters/rclexec1.py +++ b/src/filters/rclexec1.py @@ -92,15 +92,15 @@ class Executor(RclBaseHandler): return True, data def extractone(self, params): - #self.em.rclog("extractone %s %s" % (params["filename:"], \ - # params["mimetype:"])) + #self.em.rclog("extractone %s %s" % (params["filename"], \ + # params["mimetype"])) self.flt.reset() ok = False - if not "filename:" in params: + if not "filename" in params: self.em.rclog("extractone: no file name") return (ok, "", "", rclexecm.RclExecM.eofnow) - fn = params["filename:"] + fn = params["filename"] while True: cmdseq = self.flt.getCmd(fn) cmd = cmdseq[0] diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index cb83a325..b3142acc 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -1,5 +1,5 @@ ################################# -# Copyright (C) 2014 J.F.Dockes +# Copyright (C) 2014-2020 J.F.Dockes # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or @@ -28,6 +28,7 @@ import tempfile import shutil import getopt import rclconfig +import cmdtalk PY3 = (sys.version > '3') _mswindows = (sys.platform == "win32") @@ -66,10 +67,11 @@ def configparamtrue(value): my_config = rclconfig.RclConfig() ############################################ -# RclExecM implements the -# communication protocol with the recollindex process. It calls the -# object specific of the document type to actually get the data. -class RclExecM: +# RclExecM implements the communication protocol with the recollindex +# process. It calls the object specific of the document type to +# actually get the data. + +class RclExecM(cmdtalk.CmdTalk): noteof = 0 eofnext = 1 eofnow = 2 @@ -79,62 +81,30 @@ class RclExecM: fileerror = 2 def __init__(self): - try: - self.myname = os.path.basename(sys.argv[0]) - except: - self.myname = "???" self.mimetype = b"" - self.fields = {} - - if os.environ.get("RECOLL_FILTER_MAXMEMBERKB"): - self.maxmembersize = \ - int(os.environ.get("RECOLL_FILTER_MAXMEMBERKB")) - else: + try: + self.maxmembersize = int(os.environ["RECOLL_FILTER_MAXMEMBERKB"]) + except: self.maxmembersize = 50 * 1024 self.maxmembersize = self.maxmembersize * 1024 - if sys.platform == "win32": - import msvcrt - msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) - self.debugfile = my_config.getConfParam("filterdebuglog") - if self.debugfile: - self.errfout = open(self.debugfile, "a") - else: - self.errfout = sys.stderr - - def rclog(self, s, doexit = 0, exitvalue = 1): - # On windows, and I think that it changed quite recently (Qt change?) - # we get stdout as stderr. So don't write at all - if self.debugfile or sys.platform != "win32": - print("RCLMFILT: %s: %s" % (self.myname, s), file=self.errfout) - self.errfout.flush() - if doexit: - sys.exit(exitvalue) - def breakwrite(self, outfile, data): - if sys.platform != "win32": - outfile.write(data) - else: - # On windows, writing big chunks can fail with a "not enough space" - # error. Seems a combined windows/python bug, depending on versions. - # See https://bugs.python.org/issue11395 - # In any case, just break it up - total = len(data) - bs = 4*1024 - offset = 0 - while total > 0: - if total < bs: - tow = total - else: - tow = bs - #self.rclog("Total %d Writing %d to stdout: %s" % (total,tow,data[offset:offset+tow])) - outfile.write(data[offset:offset+tow]) - offset += tow - total -= tow - + # Tell cmdtalk where to log + self.debugfile = my_config.getConfParam("filterdebuglog") + # Some of our params are binary, cmdtalk should not decode them + self.nodecodeinput = True + + super().__init__() + + def rclog(self, s, doexit = 0, exitvalue = 1): + # On windows, and I think that it changed quite recently (Qt + # change?), we get stdout as stderr?? So don't write at all if + # output not a file until this mystery is solved + if self.debugfile or sys.platform != "win32": + super().log(s, doexit, exitvalue) + # Note: tried replacing this with a multiple replacer according to - # http://stackoverflow.com/a/15221068, which was **10 times** slower + # http://stackoverflow.com/a/15221068, which was **10 times** slower def htmlescape(self, txt): # & must stay first (it somehow had managed to skip # after the next replace, with rather interesting results) @@ -153,94 +123,36 @@ class RclExecM: def setfield(self, nm, value): self.fields[nm] = value - # Read single parameter from process input: line with param name and size - # followed by data. The param name is returned as str/unicode, the data - # as bytes - def readparam(self): - if PY3: - inf = sys.stdin.buffer - else: - inf = sys.stdin - s = inf.readline() - if s == b'': - sys.exit(0) - - s = s.rstrip(b'\n') - - if s == b'': - return ('', b'') - l = s.split() - if len(l) != 2: - self.rclog(b'bad line: [' + s + b']', 1, 1) - - paramname = l[0].decode('ASCII').lower() - paramsize = int(l[1]) - if paramsize > 0: - paramdata = inf.read(paramsize) - if len(paramdata) != paramsize: - self.rclog("Bad read: wanted %d, got %d" % - (paramsize, len(paramdata)), 1, 1) - else: - paramdata = b'' - - #self.rclog("paramname [%s] paramsize %d value [%s]" % - # (paramname, paramsize, paramdata)) - return (paramname, paramdata) - - if PY3: - def senditem(self, nm, data): - data = makebytes(data) - l = len(data) - sys.stdout.buffer.write(makebytes("%s: %d\n" % (nm, l))) - self.breakwrite(sys.stdout.buffer, data) - else: - def senditem(self, nm, data): - data = makebytes(data) - l = len(data) - sys.stdout.write(makebytes("%s: %d\n" % (nm, l))) - self.breakwrite(sys.stdout, data) - # Send answer: document, ipath, possible eof. def answer(self, docdata, ipath, iseof = noteof, iserror = noerror): - if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow: - self.senditem("Document", docdata) - + self.fields["Document"] = docdata if len(ipath): - self.senditem("Ipath", ipath) - + self.fields["Ipath"] = ipath if len(self.mimetype): - self.senditem("Mimetype", self.mimetype) - - for nm,value in self.fields.items(): - #self.rclog("Senditem: [%s] -> [%s]" % (nm, value)) - self.senditem("%s:"%nm, value) - self.fields = {} - + self.fields["Mimetype"] = self.mimetype + # If we're at the end of the contents, say so if iseof == RclExecM.eofnow: - self.senditem("Eofnow", b'') + self.fields["Eofnow"] = b'' elif iseof == RclExecM.eofnext: - self.senditem("Eofnext", b'') + self.fields["Eofnext"] = b'' if iserror == RclExecM.subdocerror: - self.senditem("Subdocerror", b'') + self.fields["Subdocerror"] = b'' elif iserror == RclExecM.fileerror: - self.senditem("Fileerror", b'') + self.fields["Fileerror"] = b'' + + super().answer(self.fields) + self.fields = {} - # End of message - print() - sys.stdout.flush() - #self.rclog("done writing data") def processmessage(self, processor, params): - # We must have a filename entry (even empty). Else exit - if "filename:" not in params: + if "filename" not in params: print("%s" % params, file=sys.stderr) self.rclog("no filename ??", 1, 1) - # If we're given a file name, open it. - if len(params["filename:"]) != 0: + if len(params["filename"]) != 0: try: if not processor.openfile(params): self.answer("", "", iserror = RclExecM.fileerror) @@ -255,7 +167,7 @@ class RclExecM: eof = True self.mimetype = "" try: - if "ipath:" in params and len(params["ipath:"]): + if "ipath" in params and len(params["ipath"]): ok, data, ipath, eof = processor.getipath(params) else: ok, data, ipath, eof = processor.getnext(params) @@ -270,13 +182,10 @@ class RclExecM: else: self.answer("", "", eof, RclExecM.subdocerror) - # Loop on messages from our master + # Main routine: loop on messages from our master def mainloop(self, processor): while 1: - #self.rclog("waiting for command") - params = dict() - # Read at most 10 parameters (normally 1 or 2), stop at empty line # End of message is signalled by empty paramname for i in range(10): @@ -284,7 +193,6 @@ class RclExecM: if paramname == "": break params[paramname] = paramdata - # Got message, act on it self.processmessage(processor, params) @@ -326,6 +234,7 @@ def which(program): return candidate return None + # Temp dir helper class SafeTmpDir: def __init__(self, em): @@ -355,6 +264,7 @@ class SafeTmpDir: return self.tmpdir + # Common main routine for all python execm filters: either run the # normal protocol engine or a local loop to test without recollindex def main(proto, extract): @@ -369,8 +279,7 @@ def main(proto, extract): def usage(): print("Usage: rclexecm.py [-d] [-s] [-i ipath] ", file=sys.stderr) - print(" rclexecm.py -w ", - file=sys.stderr) + print(" rclexecm.py -w ", file=sys.stderr) sys.exit(1) actAsSingle = False @@ -419,7 +328,7 @@ def main(proto, extract): if not actAsSingle: proto.breakwrite(out, makebytes(s+'\n')) - params = {'filename:': makebytes(path)} + params = {'filename' : makebytes(path)} # Some filters (e.g. rclaudio) need/get a MIME type from the indexer. # We make a half-assed attempt to emulate: @@ -427,7 +336,7 @@ def main(proto, extract): if not mimetype and not _mswindows: mimetype = mimetype_with_file(path) if mimetype: - params['mimetype:'] = mimetype + params['mimetype'] = mimetype if not extract.openfile(params): print("Open error", file=sys.stderr) @@ -438,7 +347,7 @@ def main(proto, extract): else: ioout = sys.stdout if ipath != b"" or actAsSingle: - params['ipath:'] = ipath + params['ipath'] = ipath ok, data, ipath, eof = extract.getipath(params) if ok: debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \ diff --git a/src/filters/rclics b/src/filters/rclics index b6c7386a..7512a862 100755 --- a/src/filters/rclics +++ b/src/filters/rclics @@ -53,7 +53,7 @@ class IcalExtractor: ###### File type handler api, used by rclexecm ----------> def openfile(self, params): - self.file = params["filename:"] + self.file = params["filename"] try: calstr = open(self.file, "rb") @@ -91,10 +91,10 @@ class IcalExtractor: def getipath(self, params): try: - if params["ipath:"] == b'': + if params["ipath"] == b'': index = 0 else: - index = int(params["ipath:"]) + index = int(params["ipath"]) except: return (False, "", "", True) return self.extractone(index) diff --git a/src/filters/rclinfo b/src/filters/rclinfo index ebd611e5..05590113 100755 --- a/src/filters/rclinfo +++ b/src/filters/rclinfo @@ -50,7 +50,7 @@ class InfoExtractor: ###### File type handler api, used by rclexecm ----------> def openfile(self, params): - self.file = params["filename:"] + self.file = params["filename"] if not os.path.isfile(self.file): self.em.rclog("Openfile: %s is not a file" % self.file) @@ -78,7 +78,7 @@ class InfoExtractor: # Extract specific node def getipath(self, params): try: - index = int(params["ipath:"]) + index = int(params["ipath"]) except: return (False, "", "", True) return self.extractone(index) diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 98f1e39a..95483b79 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -439,7 +439,7 @@ class PDFExtractor: print("RECFILTERROR HELPERNOTFOUND pdftotext") sys.exit(1); - self.filename = rclexecm.subprocfile(params["filename:"]) + self.filename = rclexecm.subprocfile(params["filename"]) #self.em.rclog("openfile: [%s]" % self.filename) self.currentindex = -1 @@ -458,7 +458,7 @@ class PDFExtractor: return True def getipath(self, params): - ipath = params["ipath:"] + ipath = params["ipath"] ok, data, ipath, eof = self.extractone(ipath) return (ok, data, ipath, eof) diff --git a/src/filters/rclpst.py b/src/filters/rclpst.py index aa2d5d08..d7788093 100755 --- a/src/filters/rclpst.py +++ b/src/filters/rclpst.py @@ -330,13 +330,13 @@ class PstExtractor(object): if not self.pffexport: print("RECFILTERROR HELPERNOTFOUND pffexport") sys.exit(1); - self.filename = params["filename:"] + self.filename = params["filename"] self.generator = None return True def getipath(self, params): ipath = met_join(self.target + ".export", - params["ipath:"].decode("UTF-8")) + params["ipath"].decode("UTF-8")) self.em.rclog("getipath: [%s]" % ipath) if not self.startCmd(self.filename, ipath=ipath): return (False, "", "", rclexecm.RclExecM.eofnow) diff --git a/src/filters/rclrar b/src/filters/rclrar index d3ae69f0..d4fcd2b7 100755 --- a/src/filters/rclrar +++ b/src/filters/rclrar @@ -109,14 +109,14 @@ class RarExtractor: # wrong on Unix, but I'd have to dig further in the # lib than I wish to. This is used on Windows anyway, # where all Recoll paths are utf-8 - fn = params["filename:"].decode("UTF-8") + fn = params["filename"].decode("UTF-8") self.rar = rarfile.RarFile(fn, 'rb') else: # The previous versions passed the file name to # RarFile. But the py3 version of this wants an str as # input, which is wrong of course, as filenames are # binary. Circumvented by passing the open file - f = open(params["filename:"], 'rb') + f = open(params["filename"], 'rb') self.rar = RarFile(f) return True except Exception as err: @@ -124,7 +124,7 @@ class RarExtractor: return False def getipath(self, params): - ipath = params["ipath:"] + ipath = params["ipath"] ok, data, ipath, eof = self.extractone(ipath) if ok: return (ok, data, ipath, eof) diff --git a/src/filters/rcltar b/src/filters/rcltar index dfa7f060..c6f2bf4f 100755 --- a/src/filters/rcltar +++ b/src/filters/rcltar @@ -48,7 +48,7 @@ class TarExtractor: def openfile(self, params): self.currentindex = -1 try: - self.tar = tarfile.open(name=params["filename:"], mode='r') + self.tar = tarfile.open(name=params["filename"], mode='r') #self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())] self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]] @@ -57,7 +57,7 @@ class TarExtractor: return False def getipath(self, params): - ipath = params["ipath:"] + ipath = params["ipath"] ok, data, ipath, eof = self.extractone(ipath) if ok: return (ok, data, ipath, eof) diff --git a/src/filters/rcltxtlines.py b/src/filters/rcltxtlines.py index 982b675e..05746c1d 100755 --- a/src/filters/rcltxtlines.py +++ b/src/filters/rcltxtlines.py @@ -39,7 +39,7 @@ class rclTXTLINES: """Open the text file, create a contents array""" self.currentindex = -1 try: - f = open(params["filename:"].decode('UTF-8'), "r") + f = open(params["filename"].decode('UTF-8'), "r") except Exception as err: self.em.rclog("openfile: open failed: [%s]" % err) return False @@ -80,7 +80,7 @@ class rclTXTLINES: # numbers, but they could be tar archive paths or whatever we # returned during indexing. def getipath(self, params): - return self.extractone(int(params["ipath:"])) + return self.extractone(int(params["ipath"])) # Most handlers factorize common code from getipath() and # getnext() in an extractone() method, but this is not part of the diff --git a/src/filters/rclwar b/src/filters/rclwar index 69b681fd..fa251bc3 100755 --- a/src/filters/rclwar +++ b/src/filters/rclwar @@ -31,14 +31,14 @@ class WarExtractor: def openfile(self, params): self.currentindex = -1 try: - self.tar = tarfile.open(params["filename:"]) + self.tar = tarfile.open(params["filename"]) return True except Exception as err: self.em.rclog(str(err)) return False def getipath(self, params): - ipath = params["ipath:"] + ipath = params["ipath"] try: tarinfo = self.tar.getmember(ipath) except Exception as err: diff --git a/src/filters/rclzip b/src/filters/rclzip index 4103877c..8d27aa10 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -126,7 +126,7 @@ class ZipExtractor: ###### File type handler api, used by rclexecm ----------> def openfile(self, params): self.closefile() - filename = params["filename:"] + filename = params["filename"] self.filename = filename self.currentindex = -1 self.skiplist = [] @@ -157,7 +157,7 @@ class ZipExtractor: return False def getipath(self, params): - ipath = params["ipath:"] + ipath = params["ipath"] ok, data, ipath, eof = self.extractone(ipath) if ok: return (ok, data, ipath, eof) diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index ab3240a6..ab5e02a9 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -187,22 +187,22 @@ bool MimeHandlerExecMultiple::next_document() "]: " << reason << "\n"); } } - obuf << "FileName: " << m_fn.length() << "\n" << m_fn; + obuf << "filename: " << m_fn.length() << "\n" << m_fn; // m_filefirst is set to true by set_document_file() m_filefirst = false; } else { - obuf << "Filename: " << 0 << "\n"; + obuf << "filename: " << 0 << "\n"; } if (!m_ipath.empty()) { LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" << m_ipath << "]\n"); - obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath; + obuf << "ipath: " << m_ipath.length() << "\n" << m_ipath; } if (!m_dfltInputCharset.empty()) { - obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n" + obuf << "dflincs: " << m_dfltInputCharset.length() << "\n" << m_dfltInputCharset; } - obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType; + obuf << "mimetype: " << m_mimeType.length() << "\n" << m_mimeType; obuf << "\n"; if (m_cmd.send(obuf.str()) < 0) { m_cmd.zapChild();