430 lines
14 KiB
Python
430 lines
14 KiB
Python
#################################
|
|
# Copyright (C) 2014 J.F.Dockes
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the
|
|
# Free Software Foundation, Inc.,
|
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
########################################################
|
|
## Recoll multifilter communication module and utilities
|
|
#
|
|
# All data is binary. This is important for Python3
|
|
# All parameter names are converted to and processed as str/unicode
|
|
|
|
from __future__ import print_function
|
|
|
|
import sys
|
|
import os
|
|
import tempfile
|
|
import shutil
|
|
import getopt
|
|
import rclconfig
|
|
|
|
PY3 = sys.version > '3'
|
|
|
|
if PY3:
|
|
def makebytes(data):
|
|
if isinstance(data, bytes):
|
|
return data
|
|
else:
|
|
return data.encode("UTF-8")
|
|
else:
|
|
def makebytes(data):
|
|
if isinstance(data, unicode):
|
|
return data.encode("UTF-8")
|
|
else:
|
|
return data
|
|
|
|
my_config = rclconfig.RclConfig()
|
|
|
|
############################################
|
|
# RclExecM implements the
|
|
# communication protocol with the recollindex process. It calls the
|
|
# object specific of the document type to actually get the data.
|
|
class RclExecM:
|
|
noteof = 0
|
|
eofnext = 1
|
|
eofnow = 2
|
|
|
|
noerror = 0
|
|
subdocerror = 1
|
|
fileerror = 2
|
|
|
|
def __init__(self):
|
|
try:
|
|
self.myname = os.path.basename(sys.argv[0])
|
|
except:
|
|
self.myname = "???"
|
|
self.mimetype = b""
|
|
|
|
if os.environ.get("RECOLL_FILTER_MAXMEMBERKB"):
|
|
self.maxmembersize = \
|
|
int(os.environ.get("RECOLL_FILTER_MAXMEMBERKB"))
|
|
else:
|
|
self.maxmembersize = 50 * 1024
|
|
self.maxmembersize = self.maxmembersize * 1024
|
|
if sys.platform == "win32":
|
|
import msvcrt
|
|
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
|
|
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
|
|
self.debugfile = None
|
|
if self.debugfile:
|
|
self.errfout = open(self.debugfile, "a")
|
|
else:
|
|
self.errfout = sys.stderr
|
|
|
|
def rclog(self, s, doexit = 0, exitvalue = 1):
|
|
print("RCLMFILT: %s: %s" % (self.myname, s), file=self.errfout)
|
|
if doexit:
|
|
sys.exit(exitvalue)
|
|
|
|
def breakwrite(self, outfile, data):
|
|
if sys.platform != "win32":
|
|
outfile.write(data)
|
|
else:
|
|
# On windows, writing big chunks can fail with a "not enough space"
|
|
# error. Seems a combined windows/python bug, depending on versions.
|
|
# See https://bugs.python.org/issue11395
|
|
# In any case, just break it up
|
|
total = len(data)
|
|
bs = 4*1024
|
|
offset = 0
|
|
while total > 0:
|
|
if total < bs:
|
|
tow = total
|
|
else:
|
|
tow = bs
|
|
#self.rclog("Total %d Writing %d to stdout: %s" % (total,tow,data[offset:offset+tow]))
|
|
outfile.write(data[offset:offset+tow])
|
|
offset += tow
|
|
total -= tow
|
|
|
|
# Note: tried replacing this with a multiple replacer according to
|
|
# http://stackoverflow.com/a/15221068, which was **10 times** slower
|
|
def htmlescape(self, txt):
|
|
# & must stay first (it somehow had managed to skip
|
|
# after the next replace, with rather interesting results)
|
|
try:
|
|
txt = txt.replace(b'&', b'&').replace(b'<', b'<').\
|
|
replace(b'>', b'>').replace(b'"', b'"')
|
|
except:
|
|
txt = txt.replace("&", "&").replace("<", "<").\
|
|
replace(">", ">").replace("\"", """)
|
|
return txt
|
|
|
|
# Our worker sometimes knows the mime types of the data it sends
|
|
def setmimetype(self, mt):
|
|
self.mimetype = makebytes(mt)
|
|
|
|
# Read single parameter from process input: line with param name and size
|
|
# followed by data. The param name is returned as str/unicode, the data
|
|
# as bytes
|
|
def readparam(self):
|
|
if PY3:
|
|
inf = sys.stdin.buffer
|
|
else:
|
|
inf = sys.stdin
|
|
s = inf.readline()
|
|
if s == b'':
|
|
sys.exit(0)
|
|
|
|
s = s.rstrip(b'\n')
|
|
|
|
if s == b'':
|
|
return ('', b'')
|
|
l = s.split()
|
|
if len(l) != 2:
|
|
self.rclog(b'bad line: [' + s + b']', 1, 1)
|
|
|
|
paramname = l[0].decode('ASCII').lower()
|
|
paramsize = int(l[1])
|
|
if paramsize > 0:
|
|
paramdata = inf.read(paramsize)
|
|
if len(paramdata) != paramsize:
|
|
self.rclog("Bad read: wanted %d, got %d" %
|
|
(paramsize, len(paramdata)), 1, 1)
|
|
else:
|
|
paramdata = b''
|
|
|
|
#self.rclog("paramname [%s] paramsize %d value [%s]" %
|
|
# (paramname, paramsize, paramdata))
|
|
return (paramname, paramdata)
|
|
|
|
if PY3:
|
|
def senditem(self, nm, len, data):
|
|
sys.stdout.buffer.write(makebytes("%s: %d\n" % (nm, len)))
|
|
self.breakwrite(sys.stdout.buffer, makebytes(data))
|
|
else:
|
|
def senditem(self, nm, len, data):
|
|
sys.stdout.write(makebytes("%s: %d\n" % (nm, len)))
|
|
self.breakwrite(sys.stdout, makebytes(data))
|
|
|
|
# Send answer: document, ipath, possible eof.
|
|
def answer(self, docdata, ipath, iseof = noteof, iserror = noerror):
|
|
|
|
if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow:
|
|
self.senditem("Document", len(docdata), docdata)
|
|
|
|
if len(ipath):
|
|
self.senditem("Ipath", len(ipath), ipath)
|
|
|
|
if len(self.mimetype):
|
|
self.senditem("Mimetype", len(self.mimetype), self.mimetype)
|
|
|
|
# If we're at the end of the contents, say so
|
|
if iseof == RclExecM.eofnow:
|
|
self.senditem("Eofnow", 0, b'')
|
|
elif iseof == RclExecM.eofnext:
|
|
self.senditem("Eofnext", 0, b'')
|
|
if iserror == RclExecM.subdocerror:
|
|
self.senditem("Subdocerror", 0, b'')
|
|
elif iserror == RclExecM.fileerror:
|
|
self.senditem("Fileerror", 0, b'')
|
|
|
|
# End of message
|
|
print()
|
|
sys.stdout.flush()
|
|
#self.rclog("done writing data")
|
|
|
|
def processmessage(self, processor, params):
|
|
|
|
# We must have a filename entry (even empty). Else exit
|
|
if "filename:" not in params:
|
|
print("%s" % params, file=sys.stderr)
|
|
self.rclog("no filename ??", 1, 1)
|
|
|
|
# If we're given a file name, open it.
|
|
if len(params["filename:"]) != 0:
|
|
try:
|
|
if not processor.openfile(params):
|
|
self.answer("", "", iserror = RclExecM.fileerror)
|
|
return
|
|
except Exception as err:
|
|
self.rclog("processmessage: openfile raised: [%s]" % err)
|
|
self.answer("", "", iserror = RclExecM.fileerror)
|
|
return
|
|
|
|
# If we have an ipath, that's what we look for, else ask for next entry
|
|
ipath = ""
|
|
eof = True
|
|
self.mimetype = ""
|
|
try:
|
|
if "ipath:" in params and len(params["ipath:"]):
|
|
ok, data, ipath, eof = processor.getipath(params)
|
|
else:
|
|
ok, data, ipath, eof = processor.getnext(params)
|
|
except Exception as err:
|
|
self.answer("", "", eof, RclExecM.fileerror)
|
|
return
|
|
|
|
#self.rclog("processmessage: ok %s eof %s ipath %s"%(ok, eof, ipath))
|
|
if ok:
|
|
self.answer(data, ipath, eof)
|
|
else:
|
|
self.answer("", "", eof, RclExecM.subdocerror)
|
|
|
|
# Loop on messages from our master
|
|
def mainloop(self, processor):
|
|
while 1:
|
|
#self.rclog("waiting for command")
|
|
|
|
params = dict()
|
|
|
|
# Read at most 10 parameters (normally 1 or 2), stop at empty line
|
|
# End of message is signalled by empty paramname
|
|
for i in range(10):
|
|
paramname, paramdata = self.readparam()
|
|
if paramname == "":
|
|
break
|
|
params[paramname] = paramdata
|
|
|
|
# Got message, act on it
|
|
self.processmessage(processor, params)
|
|
|
|
|
|
# Helper routine to test for program accessibility
|
|
# Note that this works a bit differently from Linux 'which', which
|
|
# won't search the PATH if there is a path part in the program name,
|
|
# even if not absolute (e.g. will just try subdir/cmd in current
|
|
# dir). We will find such a command if it exists in a matching subpath
|
|
# of any PATH element.
|
|
# This is very useful esp. on Windows so that we can have several bin
|
|
# filter directories under filters (to avoid dll clashes). The
|
|
# corresponding c++ routine in recoll execcmd works the same.
|
|
def which(program):
|
|
def is_exe(fpath):
|
|
return os.path.exists(fpath) and os.access(fpath, os.X_OK)
|
|
def ext_candidates(fpath):
|
|
yield fpath
|
|
for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
|
|
yield fpath + ext
|
|
|
|
def path_candidates():
|
|
yield os.path.dirname(sys.argv[0])
|
|
rclpath = my_config.getConfParam("recollhelperpath")
|
|
if rclpath:
|
|
for path in rclpath.split(os.pathsep):
|
|
yield path
|
|
for path in os.environ["PATH"].split(os.pathsep):
|
|
yield path
|
|
|
|
if os.path.isabs(program):
|
|
if is_exe(program):
|
|
return program
|
|
else:
|
|
for path in path_candidates():
|
|
exe_file = os.path.join(path, program)
|
|
for candidate in ext_candidates(exe_file):
|
|
if is_exe(candidate):
|
|
return candidate
|
|
return None
|
|
|
|
# Temp dir helper
|
|
class SafeTmpDir:
|
|
def __init__(self, em):
|
|
self.em = em
|
|
self.toptmp = ""
|
|
self.tmpdir = ""
|
|
|
|
def __del__(self):
|
|
try:
|
|
if self.toptmp:
|
|
shutil.rmtree(self.tmpdir, True)
|
|
os.rmdir(self.toptmp)
|
|
except Exception as err:
|
|
self.em.rclog("delete dir failed for " + self.toptmp)
|
|
|
|
def getpath(self):
|
|
if not self.tmpdir:
|
|
envrcltmp = os.getenv('RECOLL_TMPDIR')
|
|
if envrcltmp:
|
|
self.toptmp = tempfile.mkdtemp(prefix='rcltmp', dir=envrcltmp)
|
|
else:
|
|
self.toptmp = tempfile.mkdtemp(prefix='rcltmp')
|
|
|
|
self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp')
|
|
os.makedirs(self.tmpdir)
|
|
|
|
return self.tmpdir
|
|
|
|
|
|
# Common main routine for all python execm filters: either run the
|
|
# normal protocol engine or a local loop to test without recollindex
|
|
def main(proto, extract):
|
|
if len(sys.argv) == 1:
|
|
proto.mainloop(extract)
|
|
# mainloop does not return. Just in case
|
|
sys.exit(1)
|
|
|
|
|
|
# Not running the main loop: either acting as single filter (when called
|
|
# from other filter for example), or debugging
|
|
def usage():
|
|
print("Usage: rclexecm.py [-d] [-s] [-i ipath] <filename>",
|
|
file=sys.stderr)
|
|
print(" rclexecm.py -w <prog>",
|
|
file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
actAsSingle = False
|
|
debugDumpData = False
|
|
ipath = b""
|
|
|
|
args = sys.argv[1:]
|
|
opts, args = getopt.getopt(args, "hdsi:w:")
|
|
for opt, arg in opts:
|
|
if opt in ['-h']:
|
|
usage()
|
|
elif opt in ['-s']:
|
|
actAsSingle = True
|
|
elif opt in ['-i']:
|
|
ipath = makebytes(arg)
|
|
elif opt in ['-w']:
|
|
ret = which(arg)
|
|
if ret:
|
|
print("%s" % ret)
|
|
sys.exit(0)
|
|
else:
|
|
sys.exit(1)
|
|
elif opt in ['-d']:
|
|
debugDumpData = True
|
|
else:
|
|
print("unknown option %s\n"%opt, file=sys.stderr)
|
|
usage()
|
|
|
|
if len(args) != 1:
|
|
usage()
|
|
|
|
def mimetype_with_file(f):
|
|
cmd = 'file -i "' + f + '"'
|
|
fileout = os.popen(cmd).read()
|
|
lst = fileout.split(':')
|
|
mimetype = lst[len(lst)-1].strip()
|
|
lst = mimetype.split(';')
|
|
return makebytes(lst[0].strip())
|
|
|
|
def mimetype_with_xdg(f):
|
|
cmd = 'xdg-mime query filetype "' + f + '"'
|
|
return makebytes(os.popen(cmd).read().strip())
|
|
|
|
def debprint(out, s):
|
|
if not actAsSingle:
|
|
proto.breakwrite(out, makebytes(s+'\n'))
|
|
|
|
params = {'filename:': makebytes(args[0])}
|
|
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
|
mimetype = mimetype_with_xdg(args[0])
|
|
params['mimetype:'] = mimetype
|
|
|
|
if not extract.openfile(params):
|
|
print("Open error", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
if PY3:
|
|
ioout = sys.stdout.buffer
|
|
else:
|
|
ioout = sys.stdout
|
|
if ipath != b"" or actAsSingle:
|
|
params['ipath:'] = ipath
|
|
ok, data, ipath, eof = extract.getipath(params)
|
|
if ok:
|
|
debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \
|
|
(ipath, proto.mimetype))
|
|
bdata = makebytes(data)
|
|
if debugDumpData or actAsSingle:
|
|
proto.breakwrite(ioout, bdata)
|
|
ioout.write(b'\n')
|
|
sys.exit(0)
|
|
else:
|
|
print("Got error, eof %d"%eof, file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
ecnt = 0
|
|
while 1:
|
|
ok, data, ipath, eof = extract.getnext(params)
|
|
if ok:
|
|
ecnt = ecnt + 1
|
|
bdata = makebytes(data)
|
|
debprint(ioout, "== Entry %d dlen %d ipath %s (mimetype [%s]):" % \
|
|
(ecnt, len(data), ipath, proto.mimetype))
|
|
if debugDumpData:
|
|
proto.breakwrite(ioout, bdata)
|
|
ioout.write(b'\n')
|
|
if eof != RclExecM.noteof:
|
|
sys.exit(0)
|
|
else:
|
|
print("Not ok, eof %d" % eof, file=sys.stderr)
|
|
sys.exit(1)
|
|
# Not sure this makes sense, but going on looping certainly does not
|
|
if actAsSingle:
|
|
sys.exit(0)
|