Move the single-command Executor class out of rclexecm.py into a new
rclexec1.py module, and let rclexecm.main() act as a single-shot filter.

- rclexec1.py (new): Executor runs an external command on a file and feeds
  each output line to a post-processor supplied by the calling filter.
- rclexecm.py: switch to print_function, drop Executor, and make main()
  parse -h/-d/-s/-i with getopt. Diagnostics go to stderr and error paths
  exit with status 1.
- rcldoc.py, rclrtf.py: use rclexec1.Executor. rcldoc.py runs rclrtf.py
  with -s for the RTF fallback; rclrtf.py reports a missing unrtf helper.

diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
index 0e67239c..4b50ee38 100755
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import rclexecm
+import rclexec1
 import re
 import sys
 import os
@@ -131,10 +132,11 @@ class WordFilter:
             mt = self.mimetype(fn)
             self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
             if mt == "text/plain":
-                return ([python, os.path.join(self.execdir, "rcltext")],
+                return ([python, os.path.join(self.execdir, "rcltext.py")],
                         WordPassData(self.em))
             elif mt == "text/rtf":
-                cmd = [python, os.path.join(self.execdir, "rclrtf.py")]
+                cmd = [python, os.path.join(self.execdir, "rclrtf.py"),
+                       "-s"]
                 self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
                 return (cmd, WordPassData(self.em))
             elif mt == "application/msword":
@@ -159,5 +161,5 @@ if __name__ == '__main__':
         sys.exit(1)
     proto = rclexecm.RclExecM()
     filter = WordFilter(proto, execdir)
-    extract = rclexecm.Executor(proto, filter)
+    extract = rclexec1.Executor(proto, filter)
     rclexecm.main(proto, extract)
diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py
new file mode 100644
index 00000000..13970048
--- /dev/null
+++ b/src/filters/rclexec1.py
@@ -0,0 +1,107 @@
+#################################
+# Copyright (C) 2014 J.F.Dockes
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+########################################################
+
+# Common code for replacing the old shell scripts with Python execm
+# ones: this implements the basic functions for a filter which
+# executes a command to translate a simple file (like rclword with
+# antiword).
+#
+# This was motivated by the Windows port: to replace shell and Unix
+# utility (awk , etc usage). We can't just execute python scripts,
+# this would be too slow. So this helps implementing a permanent script
+# to repeatedly execute single commands.
+
+import subprocess
+import rclexecm
+
+# This class has the code to execute the subprocess and call a
+# data-specific post-processor. Command and processor are supplied by
+# the object which we receive as a parameter, which in turn is defined
+# in the actual executable filter (e.g. rcldoc.py)
+class Executor:
+    def __init__(self, em, flt):
+        self.em = em
+        self.flt = flt
+        self.currentindex = 0
+
+    def runCmd(self, cmd, filename, postproc):
+        ''' Substitute parameters and execute command, process output
+        with the specific postprocessor and return the complete text.
+        We expect cmd as a list of command name + arguments'''
+
+        try:
+            fullcmd = cmd + [filename]
+            proc = subprocess.Popen(fullcmd,
+                                    stdout = subprocess.PIPE)
+            stdout = proc.stdout
+        except subprocess.CalledProcessError as err:
+            self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
+            return (False, "")
+        except OSError as err:
+            self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
+            return (False, "")
+
+        for line in stdout:
+            postproc.takeLine(line.strip())
+
+        proc.wait()
+        if proc.returncode:
+            self.em.rclog("extractone: [%s] returncode %d" % \
+                          (filename, proc.returncode))
+            return False, postproc.wrapData()
+        else:
+            return True, postproc.wrapData()
+
+    def extractone(self, params):
+        #self.em.rclog("extractone %s %s" % (params["filename:"], \
+        #                                   params["mimetype:"]))
+        self.flt.reset()
+        ok = False
+        if "filename:" not in params:
+            self.em.rclog("extractone: no mime or file name")
+            return (ok, "", "", rclexecm.RclExecM.eofnow)
+
+        fn = params["filename:"]
+        while True:
+            cmd, postproc = self.flt.getCmd(fn)
+            if cmd:
+                ok, data = self.runCmd(cmd, fn, postproc)
+                if ok:
+                    break
+            else:
+                break
+        if ok:
+            return (ok, data, "", rclexecm.RclExecM.eofnext)
+        else:
+            return (ok, "", "", rclexecm.RclExecM.eofnow)
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        self.currentindex = 0
+        return True
+
+    def getipath(self, params):
+        return self.extractone(params)
+
+    def getnext(self, params):
+        if self.currentindex >= 1:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        else:
+            ret= self.extractone(params)
+            self.currentindex += 1
+            return ret
diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py
index 65a4e119..61000264 100644
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@@ -17,11 +17,13 @@
 ########################################################
 ## Recoll multifilter communication module and utilities
 
+from __future__ import print_function
+
 import sys
 import os
-import subprocess
 import tempfile
 import shutil
+import getopt
 
 ############################################
 # RclExecM implements the
@@ -54,7 +56,7 @@ class RclExecM:
             msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 
     def rclog(self, s, doexit = 0, exitvalue = 1):
-        print >> sys.stderr, "RCLMFILT:", self.myname, ":", s
+        print("RCLMFILT: %s: %s" % (self.myname, s), file=sys.stderr)
         if doexit:
             sys.exit(exitvalue)
 
@@ -112,29 +114,29 @@ class RclExecM:
             self.rclog("GOT UNICODE for ipath [%s]" % (ipath,))
             docdata = docdata.encode("UTF-8")
 
-        print "Document:", len(docdata)
+        print("Document: %d" % len(docdata))
         sys.stdout.write(docdata)
 
         if len(ipath):
-            print "Ipath:", len(ipath)
+            print("Ipath: %d" % len(ipath))
             sys.stdout.write(ipath)
 
         if len(self.mimetype):
-            print "Mimetype:", len(self.mimetype)
+            print("Mimetype: %d" % len(self.mimetype))
             sys.stdout.write(self.mimetype)
 
         # If we're at the end of the contents, say so
         if iseof == RclExecM.eofnow:
-            print "Eofnow: 0"
+            print("Eofnow: 0")
         elif iseof == RclExecM.eofnext:
-            print "Eofnext: 0"
+            print("Eofnext: 0")
         if iserror == RclExecM.subdocerror:
-            print "Subdocerror: 0"
+            print("Subdocerror: 0")
         elif iserror == RclExecM.fileerror:
-            print "Fileerror: 0"
+            print("Fileerror: 0")
 
         # End of message
-        print
+        print()
         sys.stdout.flush()
         #self.rclog("done writing data")
 
@@ -193,92 +195,6 @@ class RclExecM:
             self.processmessage(processor, params)
 
 
-####################################################################
-# Common code for replacing the shell scripts: this implements the basic
-# functions for a filter which executes a command to translate a
-# simple file (like rclword with antiword).
-#
-# This was motivated by the Windows port: to replace shell and Unix
-# utility (awk , etc usage). We can't just execute python scripts,
-# this would be to slow. So this helps implementing a permanent script
-# to repeatedly execute single commands.
-#
-# This class has the code to execute the subprocess and call a
-# data-specific post-processor. Command and processor are supplied by
-# the object which we receive as a parameter, which in turn is defined
-# in the actual executable filter (e.g. rcldoc)
-class Executor:
-    def __init__(self, em, flt):
-        self.em = em
-        self.flt = flt
-        self.currentindex = 0
-
-    def runCmd(self, cmd, filename, postproc):
-        ''' Substitute parameters and execute command, process output
-        with the specific postprocessor and return the complete text.
-        We expect cmd as a list of command name + arguments'''
-
-        try:
-            fullcmd = cmd + [filename]
-            proc = subprocess.Popen(fullcmd,
-                                    stdout = subprocess.PIPE)
-            stdout = proc.stdout
-        except subprocess.CalledProcessError as err:
-            self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
-            return (False, "")
-        except OSError as err:
-            self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
-            return (False, "")
-
-        for line in stdout:
-            postproc.takeLine(line.strip())
-
-        proc.wait()
-        if proc.returncode:
-            self.em.rclog("extractone: [%s] returncode %d" % (returncode))
-            return False, postproc.wrapData()
-        else:
-            return True, postproc.wrapData()
-
-    def extractone(self, params):
-        #self.em.rclog("extractone %s %s" % (params["filename:"], \
-        #                                   params["mimetype:"]))
-        self.flt.reset()
-        ok = False
-        if not params.has_key("filename:"):
-            self.em.rclog("extractone: no mime or file name")
-            return (ok, "", "", RclExecM.eofnow)
-
-        fn = params["filename:"]
-        while True:
-            cmd, postproc = self.flt.getCmd(fn)
-            if cmd:
-                ok, data = self.runCmd(cmd, fn, postproc)
-                if ok:
-                    break
-            else:
-                break
-        if ok:
-            return (ok, data, "", RclExecM.eofnext)
-        else:
-            return (ok, "", "", RclExecM.eofnow)
-
-    ###### File type handler api, used by rclexecm ---------->
-    def openfile(self, params):
-        self.currentindex = 0
-        return True
-
-    def getipath(self, params):
-        return self.extractone(params)
-
-    def getnext(self, params):
-        if self.currentindex >= 1:
-            return (False, "", "", RclExecM.eofnow)
-        else:
-            ret= self.extractone(params)
-            self.currentindex += 1
-            return ret
-
 # Helper routine to test for program accessibility
 def which(program):
     def is_exe(fpath):
@@ -339,61 +255,101 @@ class SafeTmpDir:
 def main(proto, extract):
     if len(sys.argv) == 1:
         proto.mainloop(extract)
-    else:
-        # Got a file name parameter: TESTING without an execm parent
-        # Loop on all entries or get specific ipath
-        def mimetype_with_file(f):
-            cmd = 'file -i "' + f + '"'
-            fileout = os.popen(cmd).read()
-            lst = fileout.split(':')
-            mimetype = lst[len(lst)-1].strip()
-            lst = mimetype.split(';')
-            return lst[0].strip()
-        def mimetype_with_xdg(f):
-            cmd = 'xdg-mime query filetype "' + f + '"'
-            return os.popen(cmd).read().strip()
-        params = {'filename:': sys.argv[1]}
-        # Some filters (e.g. rclaudio) need/get a MIME type from the indexer
-        mimetype = mimetype_with_xdg(sys.argv[1])
-        params['mimetype:'] = mimetype
-        if not extract.openfile(params):
-            print "Open error"
-            sys.exit(1)
-        ipath = ""
-        if len(sys.argv) == 3:
-            ipath = sys.argv[2]
+        # mainloop does not return. Just in case
+        sys.exit(1)
 
-        if ipath != "":
-            params['ipath:'] = ipath
-            ok, data, ipath, eof = extract.getipath(params)
-            if ok:
-                print "== Found entry for ipath %s (mimetype [%s]):" % \
-                      (ipath, proto.mimetype)
-                if isinstance(data, unicode):
-                    bdata = data.encode("UTF-8")
-                else:
-                    bdata = data
-                sys.stdout.write(bdata)
-                print
+
+    # Not running the main loop: either acting as single filter (when called
+    # from other filter for example), or debugging
+    def usage():
+        print("Usage: rclexecm.py [-d] [-s] [-i ipath] [filename]",
+              file=sys.stderr)
+        sys.exit(1)
+
+    actAsSingle = False
+    debugDumpData = False
+    ipath = ""
+
+    args = sys.argv[1:]
+    opts, args = getopt.getopt(args, "hdsi:")
+    for opt, arg in opts:
+        if opt in ['-h']:
+            usage()
+        elif opt in ['-s']:
+            actAsSingle = True
+        elif opt in ['-i']:
+            ipath = arg
+        elif opt in ['-d']:
+            debugDumpData = True
+        else:
+            print("unknown option %s\n"%opt, file=sys.stderr)
+            usage()
+
+    if len(args) != 1:
+        usage()
+
+    def mimetype_with_file(f):
+        cmd = 'file -i "' + f + '"'
+        fileout = os.popen(cmd).read()
+        lst = fileout.split(':')
+        mimetype = lst[len(lst)-1].strip()
+        lst = mimetype.split(';')
+        return lst[0].strip()
+
+    def mimetype_with_xdg(f):
+        cmd = 'xdg-mime query filetype "' + f + '"'
+        return os.popen(cmd).read().strip()
+
+    def debprint(s):
+        if not actAsSingle:
+            print(s)
+
+    params = {'filename:': args[0]}
+    # Some filters (e.g. rclaudio) need/get a MIME type from the indexer
+    mimetype = mimetype_with_xdg(args[0])
+    params['mimetype:'] = mimetype
+
+    if not extract.openfile(params):
+        print("Open error", file=sys.stderr)
+        sys.exit(1)
+
+    if ipath != "" or actAsSingle:
+        params['ipath:'] = ipath
+        ok, data, ipath, eof = extract.getipath(params)
+        if ok:
+            debprint("== Found entry for ipath %s (mimetype [%s]):" % \
+                     (ipath, proto.mimetype))
+            if isinstance(data, unicode):
+                bdata = data.encode("UTF-8")
             else:
-                print "Got error, eof %d"%eof
+                bdata = data
+            if debugDumpData or actAsSingle:
+                sys.stdout.write(bdata)
+                print()
             sys.exit(0)
+        else:
+            print("Got error, eof %d"%eof, file=sys.stderr)
+            sys.exit(1)
 
-        ecnt = 0
-        while 1:
-            ok, data, ipath, eof = extract.getnext(params)
-            if ok:
-                ecnt = ecnt + 1
-                print "== Entry %d ipath %s (mimetype [%s]):" % \
-                      (ecnt, ipath, proto.mimetype)
-                if isinstance(data, unicode):
-                    bdata = data.encode("UTF-8")
-                else:
-                    bdata = data
-                sys.stdout.write(bdata)
-                print
-                if eof != RclExecM.noteof:
-                    break
+    ecnt = 0
+    while 1:
+        ok, data, ipath, eof = extract.getnext(params)
+        if ok:
+            ecnt = ecnt + 1
+            debprint("== Entry %d ipath %s (mimetype [%s]):" % \
+                     (ecnt, ipath, proto.mimetype))
+            if isinstance(data, unicode):
+                bdata = data.encode("UTF-8")
             else:
-                print "Not ok, eof %d" % eof
-                break
+                bdata = data
+            if debugDumpData:
+                sys.stdout.write(bdata)
+                print()
+            if eof != RclExecM.noteof:
+                sys.exit(0)
+        else:
+            print("Not ok, eof %d" % eof, file=sys.stderr)
+            sys.exit(1)
+        # Not sure this makes sense, but going on looping certainly does not
+        if actAsSingle:
+            sys.exit(0)
diff --git a/src/filters/rclrtf.py b/src/filters/rclrtf.py
index bc560380..07cf6ca9 100644
--- a/src/filters/rclrtf.py
+++ b/src/filters/rclrtf.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import rclexecm
+import rclexec1
 import re
 import sys
 import os
@@ -46,7 +47,10 @@ class RTFFilter:
             return ([],None)
 
 if __name__ == '__main__':
+    if not rclexecm.which("unrtf"):
+        print("RECFILTERROR HELPERNOTFOUND unrtf")
+        sys.exit(1)
     proto = rclexecm.RclExecM()
     filter = RTFFilter(proto)
-    extract = rclexecm.Executor(proto, filter)
+    extract = rclexec1.Executor(proto, filter)
     rclexecm.main(proto, extract)