#!/usr/bin/env python3
# Copyright (C) 2014 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

# Zip file extractor for Recoll

from __future__ import print_function

import os
import posixpath
import fnmatch
import datetime
import rclexecm
from zipfile import ZipFile

try:
    from recoll import rclconfig
    from recoll import conftree
    hasrclconfig = True
except:
    hasrclconfig = False
# As a temporary measure, we also look for rclconfig as a bare
# module. This is so that the intermediate releases of the filter can
# ship and use rclconfig.py with the filter code
if not hasrclconfig:
    try:
        import rclconfig
        # The extractor also needs conftree.stringToStrings() to split the
        # skip lists: only advertise the configuration support if both
        # modules could be loaded.
        import conftree
        hasrclconfig = True
    except:
        pass

# Note about file names (python 2.6. 2.7, don't know about 3.)
#
# There is a bit in zip entries to indicate if the filename is encoded
# as utf-8 or not. If the bit is set, zipfile decodes the file name
# and stores it in the catalog as an unicode object. Else it uses the
# binary string, which it decodes as CP437 (zip standard).
#
# When reading the file, the input file name is used by rclzip.py
# directly as an index into the catalog.
#
# When we send the file name data to the indexer, we have to serialize
# it as byte string, we can't pass unicode objects to and fro. This
# means that we have to test if the name is unicode. If it is, we send
# the string encoded as utf-8.
# When reading, if the input is utf-8, we
# turn it to unicode and use this to access the zip member, else we
# use the binary string.
#
# In the case where an archive member name is a valid non-ascii utf-8
# string, but the flag is not set (which could probably happen if the
# archiver did not try to detect utf-8 file names), this will fail,
# because we'll convert back the utf-8 string to unicode and pass this
# to zipfile, but an utf-8 string, not a unicode object is actually in
# the catalog in this case, so the access will fail (will be seen at
# preview or open time). This does not affect ascii file names because
# the representation is the same anyway.
#
# To avoid this problem, we'd need to pass a separate bit of
# information indicating that encoding was performed, not just rely on
# the utf-8 validity test (ie have a 1st char switch), but this would be
# incompatible with existing indexes. Instead we try both ways...
#
# Also, some zip files contain file names which are not encoded as
# CP437 (Ex: EUC-KR which was the test case). Python produces garbage
# paths in this case (this does not affect the ipath validity, just
# the display), which is expected, but unzip succeeds in guessing the
# correct encoding, I have no idea how, but apparently the magic
# occurs in process.c:GetUnicodeData(), which succeeds in finding an
# utf-8 string which zipfile does not see (to be checked: was a quick look).
# Anyway: this is a python zipfile issue.
class ZipExtractor:
    """Extract the members of a zip archive one document at a time.

    The object is driven through openfile() / getipath() / getnext(),
    which all return their result as an (ok, data, ipath, eof) tuple,
    where eof is one of the rclexecm.RclExecM noteof / eofnext / eofnow
    values.

    The 'em' object received at construction is used for logging
    (rclog), for setting document fields (setfield, setmimetype) and
    for reading the member size limit (maxmembersize).
    """

    def __init__(self, em):
        self.filename = None
        # Underlying binary file object (only used with Python 3)
        self.f = None
        # Open ZipFile, or None when no archive is currently open
        self.zip = None
        # Index of the next member to return, -1 means that the "self"
        # document for the archive was not returned yet
        self.currentindex = 0
        # fnmatch patterns for the member names which should be skipped
        self.skiplist = []
        self.em = em

    def closefile(self):
        """Close the archive and the underlying file, if they are open."""
        if self.zip:
            self.zip.close()
        if self.f:
            self.f.close()
        self.f = None
        self.zip = None

    def _isskipped(self, entryname):
        """Tell if entryname matches one of the skip list patterns."""
        return any(fnmatch.fnmatch(entryname, pat) for pat in self.skiplist)

    def extractone(self, ipath):
        """Read the archive member named ipath.

        Returns (ok, data, ipath, eof). ok is False if the member could
        not be read (the error is logged), and the data is empty for the
        members which are bigger than em.maxmembersize. The returned
        ipath is the input one passed through rclexecm.makebytes().
        """
        docdata = ""
        try:
            info = self.zip.getinfo(ipath)
            # There could be a 4GB Iso in the zip. We have to set a limit
            if info.file_size > self.em.maxmembersize:
                self.em.rclog("extractone: entry %s size %d too big" %
                              (ipath, info.file_size))
                docdata = ""
            else:
                docdata = self.zip.read(ipath)
                try:
                    # We are assuming here that the zip uses forward slash
                    # separators, which is not necessarily the case. At
                    # worse, we'll get a wrong or no file name, which is
                    # no big deal (the ipath is the important data
                    # element).
                    filename = posixpath.basename(ipath)
                    self.em.setfield("filename", filename)
                    dt = datetime.datetime(*info.date_time)
                    self.em.setfield("modificationdate",
                                     str(int(dt.timestamp())))
                except Exception:
                    # The file name and date are nice to have only: never
                    # fail the extraction because of them. Not a bare
                    # except, so that KeyboardInterrupt/SystemExit still
                    # get through.
                    pass
            ok = True
        except Exception as err:
            self.em.rclog("extractone: failed: [%s]" % err)
            ok = False

        iseof = rclexecm.RclExecM.noteof
        # The archive may already be closed here (ex: second try from
        # getipath() on an empty archive): there is nothing more to
        # return then, and we must not dereference self.zip.
        # infolist() is used instead of namelist() for the count because
        # it does not build a new list on every call.
        if self.zip is None or \
           self.currentindex >= len(self.zip.infolist()) - 1:
            self.closefile()
            iseof = rclexecm.RclExecM.eofnext
        return (ok, docdata, rclexecm.makebytes(ipath), iseof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Open the archive named by params["filename"].

        Also computes the list of skipped name patterns from the
        zipUseSkippedNames, skippedNames and zipSkippedNames
        configuration parameters when the configuration modules are
        available. Returns True if the archive could be opened, else
        False (the error is logged).
        """
        self.closefile()
        filename = params["filename"]
        self.filename = filename
        self.currentindex = -1
        self.skiplist = []
        if hasrclconfig:
            config = rclconfig.RclConfig()
            config.setKeyDir(os.path.dirname(filename))
            # NOTE(review): any non-empty value enables this, presumably
            # including "0" if getConfParam() returns the raw string --
            # to be confirmed against rclconfig.
            usebaseskipped = config.getConfParam("zipUseSkippedNames")
            if usebaseskipped:
                skipped = config.getConfParam("skippedNames")
                if skipped is not None:
                    self.skiplist += conftree.stringToStrings(skipped)
                # Log after the update, else the list is always empty
                self.em.rclog("skippedNames: %s"%self.skiplist)
            skipped = config.getConfParam("zipSkippedNames")
            if skipped is not None:
                self.skiplist += conftree.stringToStrings(skipped)
        try:
            if rclexecm.PY3:
                # Note: py3 ZipFile wants an str file name, which
                # is wrong: file names are binary. But it accepts an
                # open file, and open() has no such restriction
                self.f = open(filename, 'rb')
                self.zip = ZipFile(self.f)
            else:
                self.zip = ZipFile(filename)
            return True
        except Exception as err:
            self.em.rclog("openfile: failed: [%s]" % err)
            # Do not keep the file open if ZipFile() rejected it
            self.closefile()
            return False

    def getipath(self, params):
        """Extract the member designated by params["ipath"].

        The name is first tried as received, then decoded from utf-8
        (see the note about file names at the top of the file). Returns
        (ok, data, ipath, eof), which is the result of the first try if
        the second one could not be performed.
        """
        ipath = params["ipath"]
        ok, data, ipath, eof = self.extractone(ipath)
        if ok:
            return (ok, data, ipath, eof)
        # Not found. Maybe we need to decode the path?
        try:
            ipath = ipath.decode("utf-8")
            return self.extractone(ipath)
        except Exception:
            return (ok, data, ipath, eof)

    def getnext(self, params):
        """Return the next document from the archive.

        The first call after openfile() returns an empty text/plain
        document for the archive itself, the next ones return the
        members in catalog order, except for those which match the skip
        list. Returns (ok, data, ipath, eof), with ok False and eof set
        to eofnow when there is nothing left to return.
        """
        if self.zip is None:
            # No open archive: failed open, or call after the end
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # The catalog entries, in the same order as namelist(), but
        # without building a new list for every access.
        entries = self.zip.infolist()

        if self.currentindex == -1:
            # Return "self" doc
            self.currentindex = 0
            self.em.setmimetype('text/plain')
            if len(entries) == 0:
                self.closefile()
                eof = rclexecm.RclExecM.eofnext
            else:
                eof = rclexecm.RclExecM.noteof
            return (True, "", "", eof)

        # Step over the entries which match the skip list. The list can
        # only be non-empty if the configuration modules were found.
        while self.currentindex < len(entries) and \
              self._isskipped(entries[self.currentindex].filename):
            self.currentindex += 1

        if self.currentindex >= len(entries):
            #self.em.rclog("getnext: EOF hit")
            self.closefile()
            return (False, "", "", rclexecm.RclExecM.eofnow)

        entryname = entries[self.currentindex].filename
        # This is how we'd fix a badly decoded entry, but then
        # this can't be used as ipath any more
        #fixedname = entryname.encode('cp437').decode('euc-kr')
        #self.em.rclog("REENCODED: %s"%fixedname)
        ret = self.extractone(entryname)
        self.currentindex += 1
        return ret


# Main program: create protocol handler and extractor and run them. Only
# done when run as a script, so that the class can be imported.
if __name__ == "__main__":
    proto = rclexecm.RclExecM()
    extract = ZipExtractor(proto)
    rclexecm.main(proto, extract)