cleanup in new python filters
--HG-- branch : WINDOWSPORT
This commit is contained in:
parent
330c7fc30d
commit
118982d25e
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import rclexecm
|
||||
import rclexec1
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
@ -131,10 +132,11 @@ class WordFilter:
|
||||
mt = self.mimetype(fn)
|
||||
self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
|
||||
if mt == "text/plain":
|
||||
return ([python, os.path.join(self.execdir, "rcltext")],
|
||||
return ([python, os.path.join(self.execdir, "rcltext.py")],
|
||||
WordPassData(self.em))
|
||||
elif mt == "text/rtf":
|
||||
cmd = [python, os.path.join(self.execdir, "rclrtf.py")]
|
||||
cmd = ["python", os.path.join(self.execdir, "rclrtf.py"),
|
||||
"-s"]
|
||||
self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
|
||||
return (cmd, WordPassData(self.em))
|
||||
elif mt == "application/msword":
|
||||
@ -159,5 +161,5 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
proto = rclexecm.RclExecM()
|
||||
filter = WordFilter(proto, execdir)
|
||||
extract = rclexecm.Executor(proto, filter)
|
||||
extract = rclexec1.Executor(proto, filter)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
107
src/filters/rclexec1.py
Normal file
107
src/filters/rclexec1.py
Normal file
@ -0,0 +1,107 @@
|
||||
#################################
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
########################################################
|
||||
|
||||
# Common code for replacing the old shell scripts with Python execm
|
||||
# ones: this implements the basic functions for a filter which
|
||||
# executes a command to translate a simple file (like rclword with
|
||||
# antiword).
|
||||
#
|
||||
# This was motivated by the Windows port: to replace shell and Unix
|
||||
# utility (awk , etc usage). We can't just execute python scripts,
|
||||
# this would be to slow. So this helps implementing a permanent script
|
||||
# to repeatedly execute single commands.
|
||||
|
||||
import subprocess
|
||||
import rclexecm
|
||||
|
||||
# This class has the code to execute the subprocess and call a
|
||||
# data-specific post-processor. Command and processor are supplied by
|
||||
# the object which we receive as a parameter, which in turn is defined
|
||||
# in the actual executable filter (e.g. rcldoc.py)
|
||||
class Executor:
|
||||
def __init__(self, em, flt):
|
||||
self.em = em
|
||||
self.flt = flt
|
||||
self.currentindex = 0
|
||||
|
||||
def runCmd(self, cmd, filename, postproc):
|
||||
''' Substitute parameters and execute command, process output
|
||||
with the specific postprocessor and return the complete text.
|
||||
We expect cmd as a list of command name + arguments'''
|
||||
|
||||
try:
|
||||
fullcmd = cmd + [filename]
|
||||
proc = subprocess.Popen(fullcmd,
|
||||
stdout = subprocess.PIPE)
|
||||
stdout = proc.stdout
|
||||
except subprocess.CalledProcessError as err:
|
||||
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
except OSError as err:
|
||||
self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
|
||||
for line in stdout:
|
||||
postproc.takeLine(line.strip())
|
||||
|
||||
proc.wait()
|
||||
if proc.returncode:
|
||||
self.em.rclog("extractone: [%s] returncode %d" % \
|
||||
(filename, proc.returncode))
|
||||
return False, postproc.wrapData()
|
||||
else:
|
||||
return True, postproc.wrapData()
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||
# params["mimetype:"]))
|
||||
self.flt.reset()
|
||||
ok = False
|
||||
if not params.has_key("filename:"):
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
fn = params["filename:"]
|
||||
while True:
|
||||
cmd, postproc = self.flt.getCmd(fn)
|
||||
if cmd:
|
||||
ok, data = self.runCmd(cmd, fn, postproc)
|
||||
if ok:
|
||||
break
|
||||
else:
|
||||
break
|
||||
if ok:
|
||||
return (ok, data, "", rclexecm.RclExecM.eofnext)
|
||||
else:
|
||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
@ -17,11 +17,13 @@
|
||||
########################################################
|
||||
## Recoll multifilter communication module and utilities
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import shutil
|
||||
import getopt
|
||||
|
||||
############################################
|
||||
# RclExecM implements the
|
||||
@ -54,7 +56,7 @@ class RclExecM:
|
||||
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
|
||||
|
||||
def rclog(self, s, doexit = 0, exitvalue = 1):
|
||||
print >> sys.stderr, "RCLMFILT:", self.myname, ":", s
|
||||
print("RCLMFILT: %s: %s" % (self.myname, s), file=sys.stderr)
|
||||
if doexit:
|
||||
sys.exit(exitvalue)
|
||||
|
||||
@ -112,29 +114,29 @@ class RclExecM:
|
||||
self.rclog("GOT UNICODE for ipath [%s]" % (ipath,))
|
||||
docdata = docdata.encode("UTF-8")
|
||||
|
||||
print "Document:", len(docdata)
|
||||
print("Document: %d" % len(docdata))
|
||||
sys.stdout.write(docdata)
|
||||
|
||||
if len(ipath):
|
||||
print "Ipath:", len(ipath)
|
||||
print("Ipath: %d" % len(ipath))
|
||||
sys.stdout.write(ipath)
|
||||
|
||||
if len(self.mimetype):
|
||||
print "Mimetype:", len(self.mimetype)
|
||||
print("Mimetype: %d" % len(self.mimetype))
|
||||
sys.stdout.write(self.mimetype)
|
||||
|
||||
# If we're at the end of the contents, say so
|
||||
if iseof == RclExecM.eofnow:
|
||||
print "Eofnow: 0"
|
||||
print("Eofnow: 0")
|
||||
elif iseof == RclExecM.eofnext:
|
||||
print "Eofnext: 0"
|
||||
print("Eofnext: 0")
|
||||
if iserror == RclExecM.subdocerror:
|
||||
print "Subdocerror: 0"
|
||||
print("Subdocerror: 0")
|
||||
elif iserror == RclExecM.fileerror:
|
||||
print "Fileerror: 0"
|
||||
print("Fileerror: 0")
|
||||
|
||||
# End of message
|
||||
print
|
||||
print()
|
||||
sys.stdout.flush()
|
||||
#self.rclog("done writing data")
|
||||
|
||||
@ -193,92 +195,6 @@ class RclExecM:
|
||||
self.processmessage(processor, params)
|
||||
|
||||
|
||||
####################################################################
|
||||
# Common code for replacing the shell scripts: this implements the basic
|
||||
# functions for a filter which executes a command to translate a
|
||||
# simple file (like rclword with antiword).
|
||||
#
|
||||
# This was motivated by the Windows port: to replace shell and Unix
|
||||
# utility (awk , etc usage). We can't just execute python scripts,
|
||||
# this would be to slow. So this helps implementing a permanent script
|
||||
# to repeatedly execute single commands.
|
||||
#
|
||||
# This class has the code to execute the subprocess and call a
|
||||
# data-specific post-processor. Command and processor are supplied by
|
||||
# the object which we receive as a parameter, which in turn is defined
|
||||
# in the actual executable filter (e.g. rcldoc)
|
||||
class Executor:
|
||||
def __init__(self, em, flt):
|
||||
self.em = em
|
||||
self.flt = flt
|
||||
self.currentindex = 0
|
||||
|
||||
def runCmd(self, cmd, filename, postproc):
|
||||
''' Substitute parameters and execute command, process output
|
||||
with the specific postprocessor and return the complete text.
|
||||
We expect cmd as a list of command name + arguments'''
|
||||
|
||||
try:
|
||||
fullcmd = cmd + [filename]
|
||||
proc = subprocess.Popen(fullcmd,
|
||||
stdout = subprocess.PIPE)
|
||||
stdout = proc.stdout
|
||||
except subprocess.CalledProcessError as err:
|
||||
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
except OSError as err:
|
||||
self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
|
||||
for line in stdout:
|
||||
postproc.takeLine(line.strip())
|
||||
|
||||
proc.wait()
|
||||
if proc.returncode:
|
||||
self.em.rclog("extractone: [%s] returncode %d" % (returncode))
|
||||
return False, postproc.wrapData()
|
||||
else:
|
||||
return True, postproc.wrapData()
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||
# params["mimetype:"]))
|
||||
self.flt.reset()
|
||||
ok = False
|
||||
if not params.has_key("filename:"):
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (ok, "", "", RclExecM.eofnow)
|
||||
|
||||
fn = params["filename:"]
|
||||
while True:
|
||||
cmd, postproc = self.flt.getCmd(fn)
|
||||
if cmd:
|
||||
ok, data = self.runCmd(cmd, fn, postproc)
|
||||
if ok:
|
||||
break
|
||||
else:
|
||||
break
|
||||
if ok:
|
||||
return (ok, data, "", RclExecM.eofnext)
|
||||
else:
|
||||
return (ok, "", "", RclExecM.eofnow)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
# Helper routine to test for program accessibility
|
||||
def which(program):
|
||||
def is_exe(fpath):
|
||||
@ -339,61 +255,101 @@ class SafeTmpDir:
|
||||
def main(proto, extract):
|
||||
if len(sys.argv) == 1:
|
||||
proto.mainloop(extract)
|
||||
else:
|
||||
# Got a file name parameter: TESTING without an execm parent
|
||||
# Loop on all entries or get specific ipath
|
||||
def mimetype_with_file(f):
|
||||
cmd = 'file -i "' + f + '"'
|
||||
fileout = os.popen(cmd).read()
|
||||
lst = fileout.split(':')
|
||||
mimetype = lst[len(lst)-1].strip()
|
||||
lst = mimetype.split(';')
|
||||
return lst[0].strip()
|
||||
def mimetype_with_xdg(f):
|
||||
cmd = 'xdg-mime query filetype "' + f + '"'
|
||||
return os.popen(cmd).read().strip()
|
||||
params = {'filename:': sys.argv[1]}
|
||||
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
||||
mimetype = mimetype_with_xdg(sys.argv[1])
|
||||
params['mimetype:'] = mimetype
|
||||
if not extract.openfile(params):
|
||||
print "Open error"
|
||||
sys.exit(1)
|
||||
ipath = ""
|
||||
if len(sys.argv) == 3:
|
||||
ipath = sys.argv[2]
|
||||
# mainloop does not return. Just in case
|
||||
sys.exit(1)
|
||||
|
||||
if ipath != "":
|
||||
params['ipath:'] = ipath
|
||||
ok, data, ipath, eof = extract.getipath(params)
|
||||
if ok:
|
||||
print "== Found entry for ipath %s (mimetype [%s]):" % \
|
||||
(ipath, proto.mimetype)
|
||||
if isinstance(data, unicode):
|
||||
bdata = data.encode("UTF-8")
|
||||
else:
|
||||
bdata = data
|
||||
sys.stdout.write(bdata)
|
||||
print
|
||||
|
||||
# Not running the main loop: either acting as single filter (when called
|
||||
# from other filter for example), or debugging
|
||||
def usage():
|
||||
print("Usage: rclexecm.py [-d] [-s] [-i ipath] [filename]",
|
||||
file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
actAsSingle = False
|
||||
debugDumpData = False
|
||||
ipath = ""
|
||||
|
||||
args = sys.argv[1:]
|
||||
opts, args = getopt.getopt(args, "hdsi:")
|
||||
for opt, arg in opts:
|
||||
if opt in ['-h']:
|
||||
usage()
|
||||
elif opt in ['-s']:
|
||||
actAsSingle = True
|
||||
elif opt in ['-i']:
|
||||
ipath = arg
|
||||
elif opt in ['-d']:
|
||||
debugDumpData = True
|
||||
else:
|
||||
print("unknown option %s\n"%opt, file=sys.stderr)
|
||||
usage()
|
||||
|
||||
if len(args) != 1:
|
||||
usage()
|
||||
|
||||
def mimetype_with_file(f):
|
||||
cmd = 'file -i "' + f + '"'
|
||||
fileout = os.popen(cmd).read()
|
||||
lst = fileout.split(':')
|
||||
mimetype = lst[len(lst)-1].strip()
|
||||
lst = mimetype.split(';')
|
||||
return lst[0].strip()
|
||||
|
||||
def mimetype_with_xdg(f):
|
||||
cmd = 'xdg-mime query filetype "' + f + '"'
|
||||
return os.popen(cmd).read().strip()
|
||||
|
||||
def debprint(s):
|
||||
if not actAsSingle:
|
||||
print(s)
|
||||
|
||||
params = {'filename:': args[0]}
|
||||
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
||||
mimetype = mimetype_with_xdg(args[0])
|
||||
params['mimetype:'] = mimetype
|
||||
|
||||
if not extract.openfile(params):
|
||||
print("Open error", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if ipath != "" or actAsSingle:
|
||||
params['ipath:'] = ipath
|
||||
ok, data, ipath, eof = extract.getipath(params)
|
||||
if ok:
|
||||
debprint("== Found entry for ipath %s (mimetype [%s]):" % \
|
||||
(ipath, proto.mimetype))
|
||||
if isinstance(data, unicode):
|
||||
bdata = data.encode("UTF-8")
|
||||
else:
|
||||
print "Got error, eof %d"%eof
|
||||
bdata = data
|
||||
if debugDumpData or actAsSingle:
|
||||
sys.stdout.write(bdata)
|
||||
print()
|
||||
sys.exit(0)
|
||||
else:
|
||||
print("Got error, eof %d"%eof, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
ecnt = 0
|
||||
while 1:
|
||||
ok, data, ipath, eof = extract.getnext(params)
|
||||
if ok:
|
||||
ecnt = ecnt + 1
|
||||
print "== Entry %d ipath %s (mimetype [%s]):" % \
|
||||
(ecnt, ipath, proto.mimetype)
|
||||
if isinstance(data, unicode):
|
||||
bdata = data.encode("UTF-8")
|
||||
else:
|
||||
bdata = data
|
||||
sys.stdout.write(bdata)
|
||||
print
|
||||
if eof != RclExecM.noteof:
|
||||
break
|
||||
ecnt = 0
|
||||
while 1:
|
||||
ok, data, ipath, eof = extract.getnext(params)
|
||||
if ok:
|
||||
ecnt = ecnt + 1
|
||||
debprint("== Entry %d ipath %s (mimetype [%s]):" % \
|
||||
(ecnt, ipath, proto.mimetype))
|
||||
if isinstance(data, unicode):
|
||||
bdata = data.encode("UTF-8")
|
||||
else:
|
||||
print "Not ok, eof %d" % eof
|
||||
break
|
||||
bdata = data
|
||||
if debugDumpData:
|
||||
sys.stdout.write(bdata)
|
||||
print()
|
||||
if eof != RclExecM.noteof:
|
||||
sys.exit(0)
|
||||
else:
|
||||
print("Not ok, eof %d" % eof, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
# Not sure this makes sense, but going on looping certainly does not
|
||||
if actAsSingle:
|
||||
sys.exit(0)
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import rclexecm
|
||||
import rclexec1
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
@ -46,7 +47,10 @@ class RTFFilter:
|
||||
return ([],None)
|
||||
|
||||
if __name__ == '__main__':
|
||||
if not rclexecm.which("unrtf"):
|
||||
print("RECFILTERROR HELPERNOTFOUND antiword")
|
||||
sys.exit(1)
|
||||
proto = rclexecm.RclExecM()
|
||||
filter = RTFFilter(proto)
|
||||
extract = rclexecm.Executor(proto, filter)
|
||||
extract = rclexec1.Executor(proto, filter)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user