#################################
# Copyright (C) 2014-2020 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
## Recoll multifilter communication module and utilities
#
# All data is binary. This is important for Python3
# All parameter names are converted to and processed as str/unicode

from __future__ import print_function

import sys
import os
import tempfile
import shutil
import getopt

import rclconfig
import cmdtalk

# Compare the numeric major version: comparing sys.version as a string
# would misclassify a two-digit major version ("10..." < "3").
PY3 = (sys.version_info[0] >= 3)
_g_mswindows = (sys.platform == "win32")
_g_execdir = os.path.dirname(sys.argv[0])

_g_config = rclconfig.RclConfig()
_g_debugfile = _g_config.getConfParam("filterdebuglog")
_g_errfout = None


def logmsg(msg):
    """Write msg to the filter debug log, or to stderr if there is none.

    The log file named by the "filterdebuglog" configuration parameter is
    opened lazily, in append mode, on first use. If no log file is
    available the message goes to stderr, except on Windows where nothing
    is written.
    """
    global _g_debugfile, _g_errfout
    if _g_debugfile and not _g_errfout:
        try:
            _g_errfout = open(_g_debugfile, "a")
        except Exception:
            # Best effort: an unusable log file must not break the filter,
            # we fall back to stderr below.
            pass
    if _g_errfout:
        print("%s" % msg, file=_g_errfout)
    elif not _g_mswindows:
        print("%s" % msg, file=sys.stderr)


# Convert to bytes if not already such.
def makebytes(data):
    """Return data encoded as UTF-8 if it is text, else data unchanged."""
    if isinstance(data, type(u'')):
        return data.encode("UTF-8")
    return data


# Possibly decode binary file name for use as subprocess argument,
# depending on platform.
def subprocfile(fn):
    """Return fn in the form expected for a subprocess argument list."""
    # On Windows PY3 the list2cmdline() method in subprocess assumes that
    # all args are str, and we receive file names as UTF-8. So we need
    # to convert.
    # On Unix all list elements get converted to bytes in the C
    # _posixsubprocess module, nothing to do.
    if PY3 and _g_mswindows and not isinstance(fn, str):
        return fn.decode('UTF-8')
    return fn


# Check for truthness of rclconfig value.
def configparamtrue(value):
    """Interpret a configuration value as a boolean.

    Empty/None values are false. Values which convert to an integer are
    true when non-zero. Any other value is true if and only if it begins
    with 't' or 'T'.
    """
    if not value:
        return False
    try:
        return bool(int(value))
    except (ValueError, TypeError):
        first = value[:1]
        if isinstance(first, bytes):
            # Accept binary values too (indexing bytes yields an int on
            # Python 3, which can't be tested against a str).
            first = first.decode('latin-1')
        return first in ('t', 'T')


# Escape special characters in plain text for inclusion in HTML doc.
# Note: tried replacing this with a multiple replacer according to
#  http://stackoverflow.com/a/15221068, which was **10 times** slower
def htmlescape(txt):
    """Return txt with &, <, > and " replaced by HTML entities.

    Works on both binary and text input; the result has the input's type.
    """
    # The '&' replacement must stay first: if it came later it would
    # re-escape the ampersands of the entities produced by the other
    # replacements.
    try:
        txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
              replace(b'>', b'&gt;').replace(b'"', b'&quot;')
    except TypeError:
        # Text (not bytes) input on Python 3.
        txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
              replace(">", "&gt;").replace("\"", "&quot;")
    return txt


############################################
# RclExecM implements the communication protocol with the recollindex
# process. It calls the object specific of the document type to
# actually get the data.
class RclExecM(cmdtalk.CmdTalk):
    """Protocol engine between recollindex and a document handler.

    The handler ("processor") passed to mainloop()/processmessage() must
    provide openfile(params), getipath(params) and getnext(params); the
    last two return an (ok, data, ipath, eof) tuple.
    """
    # End-of-file indications for answer()
    noteof  = 0
    eofnext = 1
    eofnow = 2

    # Error indications for answer()
    noerror = 0
    subdocerror = 1
    fileerror = 2

    def __init__(self):
        self.mimetype = b""
        self.fields = {}
        # The limit is given in KB by the environment, and stored in bytes.
        try:
            self.maxmembersize = int(os.environ["RECOLL_FILTER_MAXMEMBERKB"])
        except (KeyError, ValueError):
            self.maxmembersize = 50 * 1024
        self.maxmembersize = self.maxmembersize * 1024
        # Tell cmdtalk where to log
        self.debugfile = _g_config.getConfParam("filterdebuglog")
        # Some of our params are binary, cmdtalk should not decode them
        self.nodecodeinput = True
        super().__init__()

    def rclog(self, s, doexit = 0, exitvalue = 1):
        """Log message s through cmdtalk, optionally requesting an exit.

        doexit and exitvalue are passed on to the cmdtalk log() method.
        """
        # On windows, and I think that it changed quite recently (Qt
        # change?), we get stdout as stderr?? So don't write at all if
        # output not a file until this mystery is solved
        if self.debugfile or sys.platform != "win32":
            super().log(s, doexit, exitvalue)
        elif doexit:
            # The message is suppressed, but the caller still asked for
            # the process to end.
            # NOTE(review): assumes cmdtalk's log() exits when doexit is
            # set - confirm against cmdtalk.
            sys.exit(exitvalue)

    # Our worker sometimes knows the mime types of the data it sends
    def setmimetype(self, mt):
        """Record the MIME type (stored as bytes) for the next answer."""
        self.mimetype = makebytes(mt)

    def setfield(self, nm, value):
        """Set an additional field to be sent with the next answer."""
        self.fields[nm] = value

    # Send answer: document, ipath, possible eof.
    def answer(self, docdata, ipath, iseof = noteof, iserror = noerror):
        """Send the accumulated fields and, unless failed, the document.

        docdata and ipath are only included when there is neither a file
        error nor an immediate end of file. The fields are reset once the
        answer has been handed to cmdtalk.
        """
        if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow:
            self.fields["Document"] = docdata
            if len(ipath):
                self.fields["Ipath"] = ipath
            if len(self.mimetype):
                self.fields["Mimetype"] = self.mimetype

        # If we're at the end of the contents, say so
        if iseof == RclExecM.eofnow:
            self.fields["Eofnow"] = b''
        elif iseof == RclExecM.eofnext:
            self.fields["Eofnext"] = b''

        if iserror == RclExecM.subdocerror:
            self.fields["Subdocerror"] = b''
        elif iserror == RclExecM.fileerror:
            self.fields["Fileerror"] = b''

        super().answer(self.fields)
        self.fields = {}

    def processmessage(self, processor, params):
        """Handle one request: open the file if named, then fetch an entry.

        params is the dictionary of parameters read for this message. A
        non-empty "filename" causes processor.openfile() to be called. A
        non-empty "ipath" selects processor.getipath(), else
        processor.getnext() is used. Exactly one answer is sent.
        """
        # We must have a filename entry (even empty). Else exit
        if "filename" not in params:
            print("%s" % params, file=sys.stderr)
            self.rclog("no filename ??", 1, 1)

        if len(params["filename"]) != 0:
            try:
                if not processor.openfile(params):
                    self.answer("", "", iserror = RclExecM.fileerror)
                    return
            except Exception as err:
                self.rclog("processmessage: openfile raised: [%s]" % err)
                self.answer("", "", iserror = RclExecM.fileerror)
                return

        # If we have an ipath, that's what we look for, else ask for next entry
        ipath = ""
        eof = True
        # Bytes, like everywhere else the attribute is set.
        self.mimetype = b""
        try:
            if "ipath" in params and len(params["ipath"]):
                ok, data, ipath, eof = processor.getipath(params)
            else:
                ok, data, ipath, eof = processor.getnext(params)
        except Exception as err:
            self.rclog("getipath/next: exception: %s" %err)
            self.answer("", "", eof, RclExecM.fileerror)
            return

        #self.rclog("processmessage: ok %s eof %s ipath %s"%(ok, eof, ipath))
        if ok:
            self.answer(data, ipath, eof)
        else:
            self.answer("", "", eof, RclExecM.subdocerror)

    # Main routine: loop on messages from our master
    def mainloop(self, processor):
        """Read messages forever and process each with processor."""
        while True:
            params = dict()

            # Read at most 10 parameters (normally 1 or 2), stop at empty line
            # End of message is signalled by empty paramname
            for _ in range(10):
                paramname, paramdata = self.readparam()
                if paramname == "":
                    break
                params[paramname] = paramdata

            # Got message, act on it
            self.processmessage(processor, params)


# Helper routine to test for program accessibility
# Note that this works a bit differently from Linux 'which', which
# won't search the PATH if there is a path part in the program name,
# even if not absolute (e.g. will just try subdir/cmd in current
# dir). We will find such a command if it exists in a matching subpath
# of any PATH element.
# This is very useful esp. on Windows so that we can have several bin
# filter directories under filters (to avoid dll clashes). The
# corresponding c++ routine in recoll execcmd works the same.
def which(program):
    """Return the path of an executable file for program, or None.

    An absolute program path is only checked for itself. Otherwise the
    program name (which may include a relative directory part) is looked
    up, in order, under the directory of the running script, the elements
    of the "recollhelperpath" configuration parameter, and the elements of
    PATH. Each candidate is also tried with every PATHEXT extension.
    """
    def is_exe(fpath):
        # Require a regular file: directories usually have the execute
        # (search) permission set and would otherwise match.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    def ext_candidates(fpath):
        yield fpath
        for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
            # An unset PATHEXT yields one empty extension: skip it, fpath
            # itself was already tried.
            if ext:
                yield fpath + ext

    def path_candidates():
        yield os.path.dirname(sys.argv[0])
        rclpath = _g_config.getConfParam("recollhelperpath")
        if rclpath:
            for path in rclpath.split(os.pathsep):
                yield path
        # PATH may be absent from the environment.
        for path in os.environ.get("PATH", "").split(os.pathsep):
            yield path

    if os.path.isabs(program):
        if is_exe(program):
            return program
    else:
        for path in path_candidates():
            exe_file = os.path.join(path, program)
            for candidate in ext_candidates(exe_file):
                if is_exe(candidate):
                    return candidate
    return None


# Execute Python script. cmd is a list with the script name as first elt.
def execPythonScript(icmd):
    """Run a Python script and return its standard output.

    icmd is a list with the script name as first element, followed by the
    script arguments. On Windows the script is run through the current
    interpreter, and a relative script name is taken relative to the
    directory of the running script.
    Raises subprocess.CalledProcessError if the script exits with a
    non-zero status.
    """
    import subprocess
    cmd = list(icmd)
    if _g_mswindows:
        if not os.path.isabs(cmd[0]):
            cmd[0] = os.path.join(_g_execdir, cmd[0])
        cmd = [sys.executable] + cmd
    return subprocess.check_output(cmd)


# Temp dir helper
class SafeTmpDir:
    """Lazily created temporary directory, removed on object deletion.

    The working directory is a subdirectory named tag of a fresh
    'rcltmp*' directory, itself created under $RECOLL_TMPDIR if set, else
    in the default temporary location.
    em, if set, must have an rclog() method: it is used to report cleanup
    failures.
    """
    def __init__(self, tag, em=None):
        self.tag = tag
        self.em = em
        self.toptmp = None
        self.tmpdir = None

    def __del__(self):
        if self.toptmp:
            toptmp = self.toptmp
            try:
                if self.tmpdir:
                    # True: ignore errors while removing the tree
                    shutil.rmtree(self.tmpdir, True)
                os.rmdir(toptmp)
            except Exception:
                if self.em:
                    self.em.rclog("delete dir failed for " + toptmp)
            # Forget the paths so that running the cleanup again is a no-op.
            self.toptmp = None
            self.tmpdir = None

    def vacuumdir(self):
        """Delete the regular files directly inside the temporary directory.

        Subdirectories are left alone. Always returns True.
        """
        if self.tmpdir:
            for fn in os.listdir(self.tmpdir):
                path = os.path.join(self.tmpdir, fn)
                if os.path.isfile(path):
                    os.unlink(path)
        return True

    def getpath(self):
        """Return the temporary directory path, creating it on first call."""
        if not self.tmpdir:
            envrcltmp = os.getenv('RECOLL_TMPDIR')
            if envrcltmp:
                self.toptmp = tempfile.mkdtemp(prefix='rcltmp', dir=envrcltmp)
            else:
                self.toptmp = tempfile.mkdtemp(prefix='rcltmp')

            self.tmpdir = os.path.join(self.toptmp, self.tag)
            os.makedirs(self.tmpdir)

        return self.tmpdir


# Run a helper command given as an argument list (no shell involved, so
# that file names need no quoting) and return its standard output as text.
def _cmd_output(cmd):
    import subprocess
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out = proc.communicate()[0]
    except OSError:
        # Command missing or not executable: same as no output.
        return ''
    if not isinstance(out, str):
        out = out.decode('UTF-8', 'replace')
    return out


# Common main routine for all python execm filters: either run the
# normal protocol engine or a local loop to test without recollindex
def main(proto, extract):
    """Entry point shared by the execm filters.

    proto is the protocol object (it must provide mainloop(), breakwrite()
    and the mimetype and fields attributes). extract is the document
    handler, providing openfile(), getipath() and getnext().
    With no command line arguments, the normal protocol loop is run.
    Else the command line selects a test/single-filter mode:
      -d           dump the document data
      -f           dump the fields set by the handler
      -i ipath     extract the entry with the given ipath
      -s           act as a single filter: output one document and exit
      -w prog      print the path found for prog by which() and exit
      -h           print usage
    This function does not return: it always ends by calling sys.exit().
    """
    if len(sys.argv) == 1:
        proto.mainloop(extract)
        # mainloop does not return. Just in case
        sys.exit(1)

    # Not running the main loop: either acting as single filter (when called
    # from other filter for example), or debugging
    def usage():
        print("Usage: rclexecm.py [-d] [-f] [-s] [-i ipath] <filename>",
              file=sys.stderr)
        print("       rclexecm.py -w <prog>",
              file=sys.stderr)
        sys.exit(1)

    actAsSingle = False
    debugDumpData = False
    debugDumpFields = False
    ipath = b""

    args = sys.argv[1:]
    try:
        opts, args = getopt.getopt(args, "dfhi:sw:")
    except getopt.GetoptError as err:
        # Unknown option or missing option argument
        print("%s\n" % err, file=sys.stderr)
        usage()
    for opt, arg in opts:
        if opt in ['-d']:
            debugDumpData = True
        elif opt in ['-f']:
            debugDumpFields = True
        elif opt in ['-h']:
            usage()
        elif opt in ['-i']:
            ipath = makebytes(arg)
        elif opt in ['-w']:
            ret = which(arg)
            if ret:
                print("%s" % ret)
                sys.exit(0)
            else:
                sys.exit(1)
        elif opt in ['-s']:
            actAsSingle = True
        else:
            print("unknown option %s\n"%opt, file=sys.stderr)
            usage()

    if len(args) != 1:
        usage()

    path = args[0]

    def mimetype_with_file(f):
        # The MIME type is what follows the last ':' in the output, minus
        # any ';'-introduced parameters
        fileout = _cmd_output(['file', '-i', f])
        lst = fileout.split(':')
        mimetype = lst[len(lst)-1].strip()
        lst = mimetype.split(';')
        return makebytes(lst[0].strip())

    def mimetype_with_xdg(f):
        return makebytes(
            _cmd_output(['xdg-mime', 'query', 'filetype', f]).strip())

    def debprint(out, s):
        # No informational output when acting as a single filter: only
        # the document data must be written.
        if not actAsSingle:
            proto.breakwrite(out, makebytes(s+'\n'))

    params = {'filename' : makebytes(path)}

    # Some filters (e.g. rclaudio.py) need/get a MIME type from the indexer.
    # We make a half-assed attempt to emulate:
    mimetype = _g_config.mimeType(path)
    if not mimetype and not _g_mswindows:
        mimetype = mimetype_with_file(path)
    if mimetype:
        params['mimetype'] = mimetype

    if not extract.openfile(params):
        print("Open error", file=sys.stderr)
        sys.exit(1)

    # Document data is binary: write to the binary layer of stdout
    if PY3:
        ioout = sys.stdout.buffer
    else:
        ioout = sys.stdout
    if ipath != b"" or actAsSingle:
        params['ipath'] = ipath
        ok, data, ipath, eof = extract.getipath(params)
        if ok:
            debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \
                  (ipath, proto.mimetype.decode('cp1252')))
            bdata = makebytes(data)
            if debugDumpData or actAsSingle:
                proto.breakwrite(ioout, bdata)
                ioout.write(b'\n')
            sys.exit(0)
        else:
            print("Got error, eof %d"%eof, file=sys.stderr)
            sys.exit(1)

    ecnt = 0
    while 1:
        ok, data, ipath, eof = extract.getnext(params)
        if ok:
            ecnt = ecnt + 1
            bdata = makebytes(data)
            debprint(ioout, "== Entry %d dlen %d ipath %s (mimetype [%s]):" % \
                  (ecnt, len(data), ipath, proto.mimetype.decode('cp1252')))
            if debugDumpFields:
                for k,v in proto.fields.items():
                    debprint(ioout, " %s -> %s" % (k,v))
            # The fields belong to the current entry: don't let them leak
            # into the next one.
            proto.fields = {}
            if debugDumpData:
                proto.breakwrite(ioout, bdata)
                ioout.write(b'\n')
            if eof != RclExecM.noteof:
                sys.exit(0)
        else:
            print("Not ok, eof %d" % eof, file=sys.stderr)
            sys.exit(1)
        # Not sure this makes sense, but going on looping certainly does not
        if actAsSingle:
            sys.exit(0)