recoll/src/filters/rclzip.py

#!/usr/bin/env python3
# Copyright (C) 2014 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

# Zip file extractor for Recoll

from __future__ import print_function

import os
import posixpath
import fnmatch
import datetime

import rclexecm
from zipfile import ZipFile

try:
    from recoll import rclconfig
    from recoll import conftree
    hasrclconfig = True
except:
    hasrclconfig = False
# As a temporary measure, we also look for rclconfig as a bare
# module. This is so that the intermediate releases of the filter can
# ship and use rclconfig.py with the filter code
if not hasrclconfig:
    try:
        import rclconfig
        hasrclconfig = True
    except:
        pass

# Note about file names (python 2.6. 2.7, don't know about 3.)
#
# There is a bit in zip entries to indicate if the filename is encoded
# as utf-8 or not. If the bit is set, zipfile decodes the file name
# and stores it in the catalog as an unicode object. Else it uses the
# binary string, which it decodes as CP437 (zip standard).
#
# When reading the file, the input file name is used by rclzip.py
# directly as an index into the catalog.
#
# When we send the file name data to the indexer, we have to serialize
# it as byte string, we can't pass unicode objects to and fro. This
# means that we have to test if the name is unicode. If it is, we send
# the string encoded as utf-8. When reading, if the input is utf-8, we
# turn it to unicode and use this to access the zip member, else we
# use the binary string.
#
# In the case where an archive member name is a valid non-ascii utf-8
# string, but the flag is not set (which could probably happen if the
# archiver did not try to detect utf-8 file names), this will fail,
# because we'll convert back the utf-8 string to unicode and pass this
# to zipfile, but an utf-8 string, not a unicode object is actually in
# the catalog in this case, so the access will fail (will be seen at
# preview or open time). This does not affect ascii file names because
# the representation is the same anyway.
#
# To avoid this problem, we'd need to pass a separate bit of
# information indicating that encoding was performed, not just rely on
# the utf-8 validity test (ie have a 1st char switch), but this would be
# incompatible with existing indexes. Instead we try both ways...
#
# Also, some zip files contain file names which are not encoded as
# CP437 (Ex: EUC-KR which was the test case). Python produces garbage
# paths in this case (this does not affect the ipath validity, just
# the display), which is expected, but unzip succeeds in guessing the
# correct encoding, I have no idea how, but apparently the magic
# occurs in process.c:GetUnicodeData(), which succeeds in finding an
# utf-8 string which zipfile does not see (to be checked: was a quick look).
# Anyway: this is a python zipfile issue.
class ZipExtractor:
    def __init__(self, em):
        self.filename = None
        self.f = None
        self.zip = None
        self.currentindex = 0
        self.em = em

    def closefile(self):
        #self.em.rclog("Closing %s" % self.filename)
        if self.zip:
            self.zip.close()
        if self.f:
            self.f.close()
        self.f = None
        self.zip = None

    def extractone(self, ipath):
        #self.em.rclog("extractone: [%s]" % ipath)
        docdata = ""
        try:
            info = self.zip.getinfo(ipath)
            # There could be a 4GB Iso in the zip. We have to set a limit
            if info.file_size > self.em.maxmembersize:
                self.em.rclog("extractone: entry %s size %d too big" %
                              (ipath, info.file_size))
                docdata = ""
                #raise BadZipfile()
            else:
                docdata = self.zip.read(ipath)
            try:
                # We are assuming here that the zip uses forward slash
                # separators, which is not necessarily the case. At
                # worse, we'll get a wrong or no file name, which is
                # no big deal (the ipath is the important data
                # element).
                filename = posixpath.basename(ipath)
                self.em.setfield("filename", filename)
                dt = datetime.datetime(*info.date_time)
                self.em.setfield("modificationdate", str(int(dt.timestamp())))
            except:
                pass
            ok = True
        except Exception as err:
            self.em.rclog("extractone: failed: [%s]" % err)
            ok = False
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.zip.namelist()) -1:
            self.closefile()
            iseof = rclexecm.RclExecM.eofnext
        return (ok, docdata, rclexecm.makebytes(ipath), iseof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.closefile()
        filename = params["filename"]
        self.filename = filename
        self.currentindex = -1
        self.skiplist = []

        if hasrclconfig:
            config = rclconfig.RclConfig()
            config.setKeyDir(os.path.dirname(filename))
            usebaseskipped = config.getConfParam("zipUseSkippedNames")
            if usebaseskipped:
                skipped = config.getConfParam("skippedNames")
                self.em.rclog("skippedNames: %s"%self.skiplist)
                self.skiplist += conftree.stringToStrings(skipped)
            skipped = config.getConfParam("zipSkippedNames")
            if skipped is not None:
                self.skiplist += conftree.stringToStrings(skipped)
        try:
            if rclexecm.PY3:
                # Note: py3 ZipFile wants an str file name, which
                # is wrong: file names are binary. But it accepts an
                # open file, and open() has no such restriction
                self.f = open(filename, 'rb')
                self.zip = ZipFile(self.f)
            else:
                self.zip = ZipFile(filename)
            return True
        except Exception as err:
            self.em.rclog("openfile: failed: [%s]" % err)
            return False

    def getipath(self, params):
        ipath = params["ipath"]
        ok, data, ipath, eof = self.extractone(ipath)
        if ok:
            return (ok, data, ipath, eof)
        # Not found. Maybe we need to decode the path?
        try:
            ipath = ipath.decode("utf-8")
            return self.extractone(ipath)
        except Exception as err:
            return (ok, data, ipath, eof)

    def getnext(self, params):
        if self.currentindex == -1:
            # Return "self" doc
            self.currentindex = 0
            self.em.setmimetype('text/plain')
            if len(self.zip.namelist()) == 0:
                self.closefile()
                eof = rclexecm.RclExecM.eofnext
            else:
                eof = rclexecm.RclExecM.noteof
            return (True, "", "", eof)

        if self.currentindex >= len(self.zip.namelist()):
            #self.em.rclog("getnext: EOF hit")
            self.closefile()
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            entryname = self.zip.namelist()[self.currentindex]
            # This is how we'd fix a badly decoded entry, but then
            # this can't be used as ipath any more
            #fixedname = entryname.encode('cp437').decode('euc-kr')
            #self.em.rclog("REENCODED: %s"%fixedname)

            if hasrclconfig and len(self.skiplist) != 0:
                while self.currentindex < len(self.zip.namelist()):
                    entryname = self.zip.namelist()[self.currentindex]
                    for pat in self.skiplist:
                        if fnmatch.fnmatch(entryname, pat):
                            entryname = None
                            break
                    if entryname is not None:
                        break
                    self.currentindex += 1
                if entryname is None:
                    self.closefile()
                    return (False, "", "", rclexecm.RclExecM.eofnow)

            ret= self.extractone(entryname)
            self.currentindex += 1
            return ret

# Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM()
extract = ZipExtractor(proto)
rclexecm.main(proto, extract)