recoll/src/filters/rclexecm.py

#################################
# Copyright (C) 2014 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
########################################################
## Recoll multifilter communication module and utilities
#
# All data is binary. This is important for Python3
# All parameter names are converted to and processed as str/unicode

from __future__ import print_function

import sys
import os
import tempfile
import shutil
import getopt
import rclconfig

# True when running under Python 3 or later. The numeric version tuple is
# tested rather than the sys.version string: a string comparison is
# lexicographic and would, for example, sort '10.0' before '3'.
PY3 = sys.version_info[0] >= 3

if PY3:
    def makebytes(data):
        """Return data as bytes.

        A bytes value is returned unchanged, anything else (normally a
        str) is encoded as UTF-8.
        """
        return data if isinstance(data, bytes) else data.encode("UTF-8")
else:
    def makebytes(data):
        """Return data as a byte string.

        A unicode value is encoded as UTF-8, anything else (normally a
        Python 2 str, which already is a byte string) is returned
        unchanged.
        """
        return data.encode("UTF-8") if isinstance(data, unicode) else data

# Recoll configuration object, created once when the module is imported.
# which() reads the "recollhelperpath" parameter from it.
my_config = rclconfig.RclConfig()

############################################
# RclExecM implements the communication protocol with the recollindex
# process. It calls the handler object specific to the document type to
# actually get the data.
class RclExecM:
    """Protocol engine between a filter process and recollindex.

    Requests are read from stdin as a sequence of parameters (see
    readparam()) and answers are written to stdout (see answer()). The
    data extraction itself is delegated to a 'processor' object, which
    must supply openfile(params), getipath(params) and getnext(params).
    """

    # Values for the 'iseof' argument of answer()
    noteof = 0
    eofnext = 1
    eofnow = 2

    # Values for the 'iserror' argument of answer()
    noerror = 0
    subdocerror = 1
    fileerror = 2

    def __init__(self):
        try:
            self.myname = os.path.basename(sys.argv[0])
        except Exception:
            self.myname = "???"
        self.mimetype = b""

        # The environment value is expressed in KB, the default is 50 MB.
        # The attribute itself is stored in bytes.
        maxkb = os.environ.get("RECOLL_FILTER_MAXMEMBERKB")
        if maxkb:
            self.maxmembersize = int(maxkb)
        else:
            self.maxmembersize = 50 * 1024
        self.maxmembersize = self.maxmembersize * 1024
        if sys.platform == "win32":
            # Switch the stdout and stdin file descriptors to binary mode
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
        # Set debugfile to a file name to have the log messages appended
        # to this file instead of being written to stderr.
        self.debugfile = None
        if self.debugfile:
            self.errfout = open(self.debugfile, "a")
        else:
            self.errfout = sys.stderr

    def rclog(self, s, doexit = 0, exitvalue = 1):
        """Write a message to the log output, prefixed with the program
        name. If doexit is set, exit the process with status exitvalue."""
        print("RCLMFILT: %s: %s" % (self.myname, s), file=self.errfout)
        if doexit:
            sys.exit(exitvalue)

    def breakwrite(self, outfile, data):
        """Write data to outfile, in 4 KB pieces when running on Windows."""
        if sys.platform != "win32":
            outfile.write(data)
            return
        # On windows, writing big chunks can fail with a "not enough space"
        # error. Seems a combined windows/python bug, depending on versions.
        # See https://bugs.python.org/issue11395
        # In any case, just break it up
        bs = 4*1024
        for offset in range(0, len(data), bs):
            outfile.write(data[offset:offset+bs])

    # Note: tried replacing this with a multiple replacer according to
    #  http://stackoverflow.com/a/15221068, which was **10 times** slower
    def htmlescape(self, txt):
        """Return txt with the &, <, > and " characters replaced by HTML
        entities. Accepts a byte string or a text string, and returns a
        value of the same kind."""
        # The & replacement must stay first, else the ampersands of the
        # entities produced by the other replacements would be escaped too.
        try:
            txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
                  replace(b'>', b'&gt;').replace(b'"', b'&quot;')
        except TypeError:
            # Text string under Python 3: bytes arguments are rejected
            txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
                  replace(">", "&gt;").replace("\"", "&quot;")
        return txt

    # Our worker sometimes knows the mime types of the data it sends
    def setmimetype(self, mt):
        """Record the MIME type (stored as bytes) sent with the answers."""
        self.mimetype = makebytes(mt)

    def readparam(self):
        """Read a single parameter from the process input.

        The input is a line holding the parameter name and the data size,
        followed by the data. Returns a (name, data) pair: the name is
        lowercased and returned as str/unicode, the data as bytes. An
        empty line yields ('', b''). Exits the process with status 0 on
        end of input, and with status 1 on a malformed line or short read.
        """
        if PY3:
            inf = sys.stdin.buffer
        else:
            inf = sys.stdin
        s = inf.readline()
        if s == b'':
            # End of file on input: our master is gone
            sys.exit(0)

        s = s.rstrip(b'\n')

        if s == b'':
            return ('', b'')
        l = s.split()
        if len(l) != 2:
            # Format through %r so that the message is readable text under
            # both Python 2 and Python 3 whatever the line contents.
            self.rclog("bad line: [%r]" % (s,), 1, 1)

        paramname = l[0].decode('ASCII').lower()
        paramsize = int(l[1])
        if paramsize > 0:
            paramdata = inf.read(paramsize)
            if len(paramdata) != paramsize:
                self.rclog("Bad read: wanted %d, got %d" %
                      (paramsize, len(paramdata)), 1, 1)
        else:
            paramdata = b''

        #self.rclog("paramname [%s] paramsize %d value [%s]" %
        #          (paramname, paramsize, paramdata))
        return (paramname, paramdata)

    def senditem(self, nm, len, data):
        """Write one answer element to stdout: a "name: size" header line
        followed by the data converted to bytes.

        'len' is the size announced on the header line. It should be the
        size in bytes of the data as written. (The parameter shadows the
        builtin: the name is kept for compatibility with existing callers).
        """
        if PY3:
            out = sys.stdout.buffer
        else:
            out = sys.stdout
        out.write(makebytes("%s: %d\n" % (nm, len)))
        self.breakwrite(out, makebytes(data))

    def answer(self, docdata, ipath, iseof = noteof, iserror = noerror):
        """Send an answer message: document, ipath, MIME type, and the
        possible end of data and error indicators.

        iseof is one of noteof, eofnext, eofnow and iserror one of noerror,
        subdocerror, fileerror. No document data is sent for a fileerror
        or eofnow answer.
        """
        if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow:
            # Convert to bytes before computing the sizes: the announced
            # size must be the count of bytes actually written, which
            # differs from the character count for non-ASCII text.
            docdata = makebytes(docdata)
            self.senditem("Document", len(docdata), docdata)

            ipath = makebytes(ipath)
            if len(ipath):
                self.senditem("Ipath", len(ipath), ipath)

            if len(self.mimetype):
                self.senditem("Mimetype", len(self.mimetype), self.mimetype)

        # If we're at the end of the contents, say so
        if iseof == RclExecM.eofnow:
            self.senditem("Eofnow", 0, b'')
        elif iseof == RclExecM.eofnext:
            self.senditem("Eofnext", 0, b'')
        if iserror == RclExecM.subdocerror:
            self.senditem("Subdocerror", 0, b'')
        elif iserror == RclExecM.fileerror:
            self.senditem("Fileerror", 0, b'')

        # End of message: empty line
        print()
        sys.stdout.flush()
        #self.rclog("done writing data")

    def processmessage(self, processor, params):
        """Process one request and send the answer.

        params is the dict built by mainloop(). It must hold a "filename:"
        entry (possibly empty), else the process exits. A non-empty file
        name causes a call to processor.openfile(params). The data is then
        obtained from processor.getipath(params) if a non-empty "ipath:"
        entry is present, else from processor.getnext(params). Both must
        return an (ok, data, ipath, eof) tuple.
        """
        # We must have a filename entry (even empty). Else exit
        if "filename:" not in params:
            print("%s" % params, file=sys.stderr)
            self.rclog("no filename ??", 1, 1)

        # If we're given a file name, open it.
        if len(params["filename:"]) != 0:
            try:
                if not processor.openfile(params):
                    self.answer("", "", iserror = RclExecM.fileerror)
                    return
            except Exception as err:
                self.rclog("processmessage: openfile raised: [%s]" % err)
                self.answer("", "", iserror = RclExecM.fileerror)
                return

        # If we have an ipath, that's what we look for, else ask for next entry
        ipath = ""
        eof = True
        # Forget the MIME type of the previous document. Kept as bytes,
        # like the values set by __init__() and setmimetype().
        self.mimetype = b""
        try:
            if "ipath:" in params and len(params["ipath:"]):
                ok, data, ipath, eof = processor.getipath(params)
            else:
                ok, data, ipath, eof = processor.getnext(params)
        except Exception as err:
            # Log the error, like what is done for openfile. eof still has
            # its initial True value here, which compares equal to eofnext.
            self.rclog("processmessage: getipath/getnext raised: [%s]" % err)
            self.answer("", "", eof, RclExecM.fileerror)
            return

        #self.rclog("processmessage: ok %s eof %s ipath %s"%(ok, eof, ipath))
        if ok:
            self.answer(data, ipath, eof)
        else:
            self.answer("", "", eof, RclExecM.subdocerror)

    def mainloop(self, processor):
        """Loop on messages from our master: read the parameters of each
        request, then have it processed. Only ends through the process
        exits performed by readparam() or processmessage()."""
        while 1:
            #self.rclog("waiting for command")

            params = dict()

            # Read at most 10 parameters (normally 1 or 2), stop at empty line
            # End of message is signalled by empty paramname
            for _ in range(10):
                paramname, paramdata = self.readparam()
                if paramname == "":
                    break
                params[paramname] = paramdata

            # Got message, act on it
            self.processmessage(processor, params)


# Helper routine to test for program accessibility
# Note that this works a bit differently from Linux 'which', which
# won't search the PATH if there is a path part in the program name,
# even if not absolute (e.g. will just try subdir/cmd in current
# dir). We will find such a command if it exists in a matching subpath
# of any PATH element.
# This is very useful esp. on Windows so that we can have several bin
# filter directories under filters (to avoid dll clashes). The
# corresponding c++ routine in recoll execcmd works the same.
def which(program):
    """Return the path of an executable program, or None if not found.

    An absolute program path is only checked for being an executable
    file. Else the program is looked for, in order, in the directory of
    the running script (from sys.argv[0]), the directories listed in the
    "recollhelperpath" configuration parameter, and the directories from
    the PATH environment variable. In each directory the name is tried
    unchanged, then with each of the PATHEXT extensions appended.
    """
    def is_exe(fpath):
        # Must be a regular file: directories normally have the execute
        # (search) permission set too, and they are not programs.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    def ext_candidates(fpath):
        yield fpath
        for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
            # Skip empty elements, which would just repeat fpath
            if ext:
                yield fpath + ext

    def path_candidates():
        yield os.path.dirname(sys.argv[0])
        rclpath = my_config.getConfParam("recollhelperpath")
        if rclpath:
            for path in rclpath.split(os.pathsep):
                yield path
        # PATH may be absent from the environment
        envpath = os.environ.get("PATH")
        if envpath is not None:
            for path in envpath.split(os.pathsep):
                yield path

    if os.path.isabs(program):
        if is_exe(program):
            return program
    else:
        for path in path_candidates():
            exe_file = os.path.join(path, program)
            for candidate in ext_candidates(exe_file):
                if is_exe(candidate):
                    return candidate
    return None

# Temp dir helper
class SafeTmpDir:
    """Lazily created temporary directory, removed when the object is
    deleted.

    'em' is the object used for logging a cleanup failure: its rclog()
    method is called with the message.
    """

    def __init__(self, em):
        self.em = em
        # Both stay empty until the first getpath() call
        self.toptmp = self.tmpdir = ""

    def __del__(self):
        try:
            if not self.toptmp:
                # Nothing was ever created
                return
            # Errors are ignored while emptying the working directory, the
            # final rmdir will fail if something could not be removed.
            shutil.rmtree(self.tmpdir, True)
            os.rmdir(self.toptmp)
        except Exception:
            self.em.rclog("delete dir failed for " + self.toptmp)

    def getpath(self):
        """Return the path of the working directory, creating it on the
        first call. The top directory is created under RECOLL_TMPDIR if
        this is set in the environment, else in the default location
        chosen by the tempfile module."""
        if self.tmpdir:
            return self.tmpdir

        # An unset or empty variable selects the tempfile default
        where = os.getenv('RECOLL_TMPDIR') or None
        self.toptmp = tempfile.mkdtemp(prefix='rcltmp', dir=where)

        self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp')
        os.makedirs(self.tmpdir)
        return self.tmpdir


# Common main routine for all python execm filters: either run the
# normal protocol engine or a local loop to test without recollindex
def main(proto, extract):
    """Common main routine for the python execm filters.

    proto is the RclExecM protocol object and extract the document
    handler, which must supply openfile(params), getipath(params) and
    getnext(params).

    Without command line arguments, run the protocol loop. Else act as a
    single-shot filter or test driver for the file named on the command
    line, depending on the options:
      -h        print the usage message
      -s        act as a single filter: output the data for the ipath
                (empty if -i is not given) without any decoration
      -i ipath  extract the entry with the given ipath
      -d        dump the data of the extracted entries
      -w prog   print the path found by which() for prog, and exit
    Never returns: all code paths end with sys.exit().
    """
    # Function-scope import: only needed by the MIME type helpers below
    import subprocess

    if len(sys.argv) == 1:
        proto.mainloop(extract)
        # mainloop does not return. Just in case
        sys.exit(1)

    # Not running the main loop: either acting as single filter (when called
    # from other filter for example), or debugging
    def usage():
        print("Usage: rclexecm.py [-d] [-s] [-i ipath] <filename>",
              file=sys.stderr)
        print("       rclexecm.py -w <prog>",
              file=sys.stderr)
        sys.exit(1)

    actAsSingle = False
    debugDumpData = False
    ipath = b""

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hdsi:w:")
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: explain instead of
        # dying with a traceback
        print("%s" % err, file=sys.stderr)
        usage()
    for opt, arg in opts:
        if opt in ['-h']:
            usage()
        elif opt in ['-s']:
            actAsSingle = True
        elif opt in ['-i']:
            ipath = makebytes(arg)
        elif opt in ['-w']:
            ret = which(arg)
            if ret:
                print("%s" % ret)
                sys.exit(0)
            else:
                sys.exit(1)
        elif opt in ['-d']:
            debugDumpData = True
        else:
            print("unknown option %s\n"%opt, file=sys.stderr)
            usage()

    if len(args) != 1:
        usage()

    def run_capture(argv):
        # Run the command and return its standard output as bytes, or an
        # empty value if it can't be started. No shell is involved, so the
        # file name needs no quoting whatever characters it contains.
        try:
            proc = subprocess.Popen(argv, stdout=subprocess.PIPE)
            return proc.communicate()[0]
        except OSError:
            return b""

    # Alternative MIME type detection using the 'file' command. Not
    # called at the moment.
    def mimetype_with_file(f):
        fileout = run_capture(['file', '-i', f])
        # The value follows the last colon and a possible further
        # parameter is separated from it by a semicolon.
        value = fileout.split(b':')[-1].strip()
        return value.split(b';')[0].strip()

    def mimetype_with_xdg(f):
        return run_capture(['xdg-mime', 'query', 'filetype', f]).strip()

    def debprint(out, s):
        # The decoration lines are not wanted when acting as single filter
        if not actAsSingle:
            proto.breakwrite(out, makebytes(s+'\n'))

    params = {'filename:': makebytes(args[0])}
    # Some filters (e.g. rclaudio) need/get a MIME type from the indexer
    mimetype = mimetype_with_xdg(args[0])
    params['mimetype:'] = mimetype

    if not extract.openfile(params):
        print("Open error", file=sys.stderr)
        sys.exit(1)

    if PY3:
        ioout = sys.stdout.buffer
    else:
        ioout = sys.stdout
    if ipath != b"" or actAsSingle:
        # Single entry requested. This branch always exits.
        params['ipath:'] = ipath
        ok, data, ipath, eof = extract.getipath(params)
        if ok:
            debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \
                  (ipath, proto.mimetype))
            bdata = makebytes(data)
            if debugDumpData or actAsSingle:
                proto.breakwrite(ioout, bdata)
                ioout.write(b'\n')
            sys.exit(0)
        else:
            print("Got error, eof %d"%eof, file=sys.stderr)
            sys.exit(1)

    # Walk all the entries. actAsSingle is always False here, as the
    # single filter case is fully handled by the branch above.
    ecnt = 0
    while 1:
        ok, data, ipath, eof = extract.getnext(params)
        if ok:
            ecnt = ecnt + 1
            bdata = makebytes(data)
            debprint(ioout, "== Entry %d dlen %d ipath %s (mimetype [%s]):" % \
                  (ecnt, len(data), ipath, proto.mimetype))
            if debugDumpData:
                proto.breakwrite(ioout, bdata)
                ioout.write(b'\n')
            if eof != RclExecM.noteof:
                sys.exit(0)
        else:
            print("Not ok, eof %d" % eof, file=sys.stderr)
            sys.exit(1)