cleanup in new python filters
--HG-- branch : WINDOWSPORT
This commit is contained in:
parent
330c7fc30d
commit
118982d25e
@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
import rclexec1
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@ -131,10 +132,11 @@ class WordFilter:
|
|||||||
mt = self.mimetype(fn)
|
mt = self.mimetype(fn)
|
||||||
self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
|
self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
|
||||||
if mt == "text/plain":
|
if mt == "text/plain":
|
||||||
return ([python, os.path.join(self.execdir, "rcltext")],
|
return ([python, os.path.join(self.execdir, "rcltext.py")],
|
||||||
WordPassData(self.em))
|
WordPassData(self.em))
|
||||||
elif mt == "text/rtf":
|
elif mt == "text/rtf":
|
||||||
cmd = [python, os.path.join(self.execdir, "rclrtf.py")]
|
cmd = ["python", os.path.join(self.execdir, "rclrtf.py"),
|
||||||
|
"-s"]
|
||||||
self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
|
self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
|
||||||
return (cmd, WordPassData(self.em))
|
return (cmd, WordPassData(self.em))
|
||||||
elif mt == "application/msword":
|
elif mt == "application/msword":
|
||||||
@ -159,5 +161,5 @@ if __name__ == '__main__':
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
filter = WordFilter(proto, execdir)
|
filter = WordFilter(proto, execdir)
|
||||||
extract = rclexecm.Executor(proto, filter)
|
extract = rclexec1.Executor(proto, filter)
|
||||||
rclexecm.main(proto, extract)
|
rclexecm.main(proto, extract)
|
||||||
|
|||||||
107
src/filters/rclexec1.py
Normal file
107
src/filters/rclexec1.py
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
#################################
|
||||||
|
# Copyright (C) 2014 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
########################################################
|
||||||
|
|
||||||
|
# Common code for replacing the old shell scripts with Python execm
|
||||||
|
# ones: this implements the basic functions for a filter which
|
||||||
|
# executes a command to translate a simple file (like rclword with
|
||||||
|
# antiword).
|
||||||
|
#
|
||||||
|
# This was motivated by the Windows port: to replace shell and Unix
|
||||||
|
# utility usage (awk, etc.). We can't just execute Python scripts,
|
||||||
|
# this would be too slow. So this helps implement a permanent script
|
||||||
|
# to repeatedly execute single commands.
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import rclexecm
|
||||||
|
|
||||||
|
# This class has the code to execute the subprocess and call a
|
||||||
|
# data-specific post-processor. Command and processor are supplied by
|
||||||
|
# the object which we receive as a parameter, which in turn is defined
|
||||||
|
# in the actual executable filter (e.g. rcldoc.py)
|
||||||
|
class Executor:
    '''Run an external command on a file and post-process its output.

    The command and the post-processor are supplied by the filter
    object (flt), through its getCmd() method. The protocol object (em)
    is only used here for logging, through its rclog() method.

    The openfile() / getipath() / getnext() methods form the file type
    handler api used by rclexecm.
    '''

    def __init__(self, em, flt):
        # em: protocol/communication object, must provide rclog()
        # flt: filter object, must provide reset() and getCmd(filename)
        self.em = em
        self.flt = flt
        # Count of documents already returned by getnext() for the
        # current file (there is only ever one).
        self.currentindex = 0

    def runCmd(self, cmd, filename, postproc):
        ''' Substitute parameters and execute command, process output
        with the specific postprocessor and return the complete text.
        We expect cmd as a list of command name + arguments.

        The file name is appended to cmd as the last argument. Each
        output line is stripped and handed to postproc.takeLine().

        Returns (False, "") if the command could not be started, else
        (status, postproc.wrapData()) where status is True only if the
        command exit code was 0.'''

        fullcmd = cmd + [filename]
        try:
            proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
        except OSError as err:
            # Popen() reports a missing or non-executable command with
            # OSError (it never raises CalledProcessError).
            self.em.rclog("extractone: Popen(%s) OS error: %s" %
                          (fullcmd, err))
            return (False, "")

        try:
            for line in proc.stdout:
                postproc.takeLine(line.strip())
        finally:
            # Always release the pipe and reap the child, even if the
            # post-processor raised.
            proc.stdout.close()
            proc.wait()

        if proc.returncode:
            self.em.rclog("extractone: [%s] returncode %d" %
                          (filename, proc.returncode))
            return False, postproc.wrapData()
        return True, postproc.wrapData()

    def extractone(self, params):
        '''Extract the text for the file named by params["filename:"].

        The filter is asked for commands until one succeeds or it
        returns an empty command. Returns an (ok, data, ipath, eof)
        tuple, with an empty ipath.'''
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #                                    params["mimetype:"]))
        self.flt.reset()
        ok = False
        data = ""
        # "in" instead of has_key(): works with both Python 2 and 3
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (ok, "", "", rclexecm.RclExecM.eofnow)

        fn = params["filename:"]
        while True:
            cmd, postproc = self.flt.getCmd(fn)
            if not cmd:
                # The filter has nothing (more) to try
                break
            ok, data = self.runCmd(cmd, fn, postproc)
            if ok:
                break

        if ok:
            return (ok, data, "", rclexecm.RclExecM.eofnext)
        return (ok, "", "", rclexecm.RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        '''Start processing a new file: reset the document counter.'''
        self.currentindex = 0
        return True

    def getipath(self, params):
        '''Return the document for the file (the ipath is not used).'''
        return self.extractone(params)

    def getnext(self, params):
        '''Return the single document on the first call after
        openfile(), then an eofnow error tuple.'''
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
|
||||||
@ -17,11 +17,13 @@
|
|||||||
########################################################
|
########################################################
|
||||||
## Recoll multifilter communication module and utilities
|
## Recoll multifilter communication module and utilities
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import subprocess
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
|
import getopt
|
||||||
|
|
||||||
############################################
|
############################################
|
||||||
# RclExecM implements the
|
# RclExecM implements the
|
||||||
@ -54,7 +56,7 @@ class RclExecM:
|
|||||||
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
|
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
|
||||||
|
|
||||||
def rclog(self, s, doexit = 0, exitvalue = 1):
|
def rclog(self, s, doexit = 0, exitvalue = 1):
|
||||||
print >> sys.stderr, "RCLMFILT:", self.myname, ":", s
|
print("RCLMFILT: %s: %s" % (self.myname, s), file=sys.stderr)
|
||||||
if doexit:
|
if doexit:
|
||||||
sys.exit(exitvalue)
|
sys.exit(exitvalue)
|
||||||
|
|
||||||
@ -112,29 +114,29 @@ class RclExecM:
|
|||||||
self.rclog("GOT UNICODE for ipath [%s]" % (ipath,))
|
self.rclog("GOT UNICODE for ipath [%s]" % (ipath,))
|
||||||
docdata = docdata.encode("UTF-8")
|
docdata = docdata.encode("UTF-8")
|
||||||
|
|
||||||
print "Document:", len(docdata)
|
print("Document: %d" % len(docdata))
|
||||||
sys.stdout.write(docdata)
|
sys.stdout.write(docdata)
|
||||||
|
|
||||||
if len(ipath):
|
if len(ipath):
|
||||||
print "Ipath:", len(ipath)
|
print("Ipath: %d" % len(ipath))
|
||||||
sys.stdout.write(ipath)
|
sys.stdout.write(ipath)
|
||||||
|
|
||||||
if len(self.mimetype):
|
if len(self.mimetype):
|
||||||
print "Mimetype:", len(self.mimetype)
|
print("Mimetype: %d" % len(self.mimetype))
|
||||||
sys.stdout.write(self.mimetype)
|
sys.stdout.write(self.mimetype)
|
||||||
|
|
||||||
# If we're at the end of the contents, say so
|
# If we're at the end of the contents, say so
|
||||||
if iseof == RclExecM.eofnow:
|
if iseof == RclExecM.eofnow:
|
||||||
print "Eofnow: 0"
|
print("Eofnow: 0")
|
||||||
elif iseof == RclExecM.eofnext:
|
elif iseof == RclExecM.eofnext:
|
||||||
print "Eofnext: 0"
|
print("Eofnext: 0")
|
||||||
if iserror == RclExecM.subdocerror:
|
if iserror == RclExecM.subdocerror:
|
||||||
print "Subdocerror: 0"
|
print("Subdocerror: 0")
|
||||||
elif iserror == RclExecM.fileerror:
|
elif iserror == RclExecM.fileerror:
|
||||||
print "Fileerror: 0"
|
print("Fileerror: 0")
|
||||||
|
|
||||||
# End of message
|
# End of message
|
||||||
print
|
print()
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
#self.rclog("done writing data")
|
#self.rclog("done writing data")
|
||||||
|
|
||||||
@ -193,92 +195,6 @@ class RclExecM:
|
|||||||
self.processmessage(processor, params)
|
self.processmessage(processor, params)
|
||||||
|
|
||||||
|
|
||||||
####################################################################
|
|
||||||
# Common code for replacing the shell scripts: this implements the basic
|
|
||||||
# functions for a filter which executes a command to translate a
|
|
||||||
# simple file (like rclword with antiword).
|
|
||||||
#
|
|
||||||
# This was motivated by the Windows port: to replace shell and Unix
|
|
||||||
# utility (awk , etc usage). We can't just execute python scripts,
|
|
||||||
# this would be to slow. So this helps implementing a permanent script
|
|
||||||
# to repeatedly execute single commands.
|
|
||||||
#
|
|
||||||
# This class has the code to execute the subprocess and call a
|
|
||||||
# data-specific post-processor. Command and processor are supplied by
|
|
||||||
# the object which we receive as a parameter, which in turn is defined
|
|
||||||
# in the actual executable filter (e.g. rcldoc)
|
|
||||||
class Executor:
|
|
||||||
def __init__(self, em, flt):
|
|
||||||
self.em = em
|
|
||||||
self.flt = flt
|
|
||||||
self.currentindex = 0
|
|
||||||
|
|
||||||
def runCmd(self, cmd, filename, postproc):
|
|
||||||
''' Substitute parameters and execute command, process output
|
|
||||||
with the specific postprocessor and return the complete text.
|
|
||||||
We expect cmd as a list of command name + arguments'''
|
|
||||||
|
|
||||||
try:
|
|
||||||
fullcmd = cmd + [filename]
|
|
||||||
proc = subprocess.Popen(fullcmd,
|
|
||||||
stdout = subprocess.PIPE)
|
|
||||||
stdout = proc.stdout
|
|
||||||
except subprocess.CalledProcessError as err:
|
|
||||||
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
|
|
||||||
return (False, "")
|
|
||||||
except OSError as err:
|
|
||||||
self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
|
|
||||||
return (False, "")
|
|
||||||
|
|
||||||
for line in stdout:
|
|
||||||
postproc.takeLine(line.strip())
|
|
||||||
|
|
||||||
proc.wait()
|
|
||||||
if proc.returncode:
|
|
||||||
self.em.rclog("extractone: [%s] returncode %d" % (returncode))
|
|
||||||
return False, postproc.wrapData()
|
|
||||||
else:
|
|
||||||
return True, postproc.wrapData()
|
|
||||||
|
|
||||||
def extractone(self, params):
|
|
||||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
|
||||||
# params["mimetype:"]))
|
|
||||||
self.flt.reset()
|
|
||||||
ok = False
|
|
||||||
if not params.has_key("filename:"):
|
|
||||||
self.em.rclog("extractone: no mime or file name")
|
|
||||||
return (ok, "", "", RclExecM.eofnow)
|
|
||||||
|
|
||||||
fn = params["filename:"]
|
|
||||||
while True:
|
|
||||||
cmd, postproc = self.flt.getCmd(fn)
|
|
||||||
if cmd:
|
|
||||||
ok, data = self.runCmd(cmd, fn, postproc)
|
|
||||||
if ok:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
if ok:
|
|
||||||
return (ok, data, "", RclExecM.eofnext)
|
|
||||||
else:
|
|
||||||
return (ok, "", "", RclExecM.eofnow)
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
|
||||||
def openfile(self, params):
|
|
||||||
self.currentindex = 0
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
return self.extractone(params)
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
if self.currentindex >= 1:
|
|
||||||
return (False, "", "", RclExecM.eofnow)
|
|
||||||
else:
|
|
||||||
ret= self.extractone(params)
|
|
||||||
self.currentindex += 1
|
|
||||||
return ret
|
|
||||||
|
|
||||||
# Helper routine to test for program accessibility
|
# Helper routine to test for program accessibility
|
||||||
def which(program):
|
def which(program):
|
||||||
def is_exe(fpath):
|
def is_exe(fpath):
|
||||||
@ -339,61 +255,101 @@ class SafeTmpDir:
|
|||||||
def main(proto, extract):
|
def main(proto, extract):
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
proto.mainloop(extract)
|
proto.mainloop(extract)
|
||||||
else:
|
# mainloop does not return. Just in case
|
||||||
# Got a file name parameter: TESTING without an execm parent
|
sys.exit(1)
|
||||||
# Loop on all entries or get specific ipath
|
|
||||||
def mimetype_with_file(f):
|
|
||||||
cmd = 'file -i "' + f + '"'
|
|
||||||
fileout = os.popen(cmd).read()
|
|
||||||
lst = fileout.split(':')
|
|
||||||
mimetype = lst[len(lst)-1].strip()
|
|
||||||
lst = mimetype.split(';')
|
|
||||||
return lst[0].strip()
|
|
||||||
def mimetype_with_xdg(f):
|
|
||||||
cmd = 'xdg-mime query filetype "' + f + '"'
|
|
||||||
return os.popen(cmd).read().strip()
|
|
||||||
params = {'filename:': sys.argv[1]}
|
|
||||||
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
|
||||||
mimetype = mimetype_with_xdg(sys.argv[1])
|
|
||||||
params['mimetype:'] = mimetype
|
|
||||||
if not extract.openfile(params):
|
|
||||||
print "Open error"
|
|
||||||
sys.exit(1)
|
|
||||||
ipath = ""
|
|
||||||
if len(sys.argv) == 3:
|
|
||||||
ipath = sys.argv[2]
|
|
||||||
|
|
||||||
if ipath != "":
|
|
||||||
params['ipath:'] = ipath
|
# Not running the main loop: either acting as single filter (when called
|
||||||
ok, data, ipath, eof = extract.getipath(params)
|
# from other filter for example), or debugging
|
||||||
if ok:
|
def usage():
|
||||||
print "== Found entry for ipath %s (mimetype [%s]):" % \
|
print("Usage: rclexecm.py [-d] [-s] [-i ipath] [filename]",
|
||||||
(ipath, proto.mimetype)
|
file=sys.stderr)
|
||||||
if isinstance(data, unicode):
|
sys.exit(1)
|
||||||
bdata = data.encode("UTF-8")
|
|
||||||
else:
|
actAsSingle = False
|
||||||
bdata = data
|
debugDumpData = False
|
||||||
sys.stdout.write(bdata)
|
ipath = ""
|
||||||
print
|
|
||||||
|
args = sys.argv[1:]
|
||||||
|
opts, args = getopt.getopt(args, "hdsi:")
|
||||||
|
for opt, arg in opts:
|
||||||
|
if opt in ['-h']:
|
||||||
|
usage()
|
||||||
|
elif opt in ['-s']:
|
||||||
|
actAsSingle = True
|
||||||
|
elif opt in ['-i']:
|
||||||
|
ipath = arg
|
||||||
|
elif opt in ['-d']:
|
||||||
|
debugDumpData = True
|
||||||
|
else:
|
||||||
|
print("unknown option %s\n"%opt, file=sys.stderr)
|
||||||
|
usage()
|
||||||
|
|
||||||
|
if len(args) != 1:
|
||||||
|
usage()
|
||||||
|
|
||||||
|
def mimetype_with_file(f):
|
||||||
|
cmd = 'file -i "' + f + '"'
|
||||||
|
fileout = os.popen(cmd).read()
|
||||||
|
lst = fileout.split(':')
|
||||||
|
mimetype = lst[len(lst)-1].strip()
|
||||||
|
lst = mimetype.split(';')
|
||||||
|
return lst[0].strip()
|
||||||
|
|
||||||
|
def mimetype_with_xdg(f):
|
||||||
|
cmd = 'xdg-mime query filetype "' + f + '"'
|
||||||
|
return os.popen(cmd).read().strip()
|
||||||
|
|
||||||
|
def debprint(s):
|
||||||
|
if not actAsSingle:
|
||||||
|
print(s)
|
||||||
|
|
||||||
|
params = {'filename:': args[0]}
|
||||||
|
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
||||||
|
mimetype = mimetype_with_xdg(args[0])
|
||||||
|
params['mimetype:'] = mimetype
|
||||||
|
|
||||||
|
if not extract.openfile(params):
|
||||||
|
print("Open error", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if ipath != "" or actAsSingle:
|
||||||
|
params['ipath:'] = ipath
|
||||||
|
ok, data, ipath, eof = extract.getipath(params)
|
||||||
|
if ok:
|
||||||
|
debprint("== Found entry for ipath %s (mimetype [%s]):" % \
|
||||||
|
(ipath, proto.mimetype))
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
bdata = data.encode("UTF-8")
|
||||||
else:
|
else:
|
||||||
print "Got error, eof %d"%eof
|
bdata = data
|
||||||
|
if debugDumpData or actAsSingle:
|
||||||
|
sys.stdout.write(bdata)
|
||||||
|
print()
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print("Got error, eof %d"%eof, file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
ecnt = 0
|
ecnt = 0
|
||||||
while 1:
|
while 1:
|
||||||
ok, data, ipath, eof = extract.getnext(params)
|
ok, data, ipath, eof = extract.getnext(params)
|
||||||
if ok:
|
if ok:
|
||||||
ecnt = ecnt + 1
|
ecnt = ecnt + 1
|
||||||
print "== Entry %d ipath %s (mimetype [%s]):" % \
|
debprint("== Entry %d ipath %s (mimetype [%s]):" % \
|
||||||
(ecnt, ipath, proto.mimetype)
|
(ecnt, ipath, proto.mimetype))
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
bdata = data.encode("UTF-8")
|
bdata = data.encode("UTF-8")
|
||||||
else:
|
|
||||||
bdata = data
|
|
||||||
sys.stdout.write(bdata)
|
|
||||||
print
|
|
||||||
if eof != RclExecM.noteof:
|
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
print "Not ok, eof %d" % eof
|
bdata = data
|
||||||
break
|
if debugDumpData:
|
||||||
|
sys.stdout.write(bdata)
|
||||||
|
print()
|
||||||
|
if eof != RclExecM.noteof:
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print("Not ok, eof %d" % eof, file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
# Not sure this makes sense, but going on looping certainly does not
|
||||||
|
if actAsSingle:
|
||||||
|
sys.exit(0)
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
import rclexec1
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@ -46,7 +47,10 @@ class RTFFilter:
|
|||||||
return ([],None)
|
return ([],None)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
# The RTF filter depends on the unrtf helper: report that one (not
# antiword, which is what rcldoc.py needs) if it cannot be found.
if not rclexecm.which("unrtf"):
    print("RECFILTERROR HELPERNOTFOUND unrtf")
    sys.exit(1)
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
filter = RTFFilter(proto)
|
filter = RTFFilter(proto)
|
||||||
extract = rclexecm.Executor(proto, filter)
|
extract = rclexec1.Executor(proto, filter)
|
||||||
rclexecm.main(proto, extract)
|
rclexecm.main(proto, extract)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user