diff --git a/src/filters/rclzip b/src/filters/rclzip index a35e0519..3b29d8e1 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -5,14 +5,42 @@ import rclexecm from zipfile import ZipFile +# Note about file names (python 2.6, 2.7; don't know about 3.) +# +# There is a bit in zip entries to indicate if the filename is encoded +# as utf-8 or not. If the bit is set, zipfile decodes the file name +# and stores it in the catalog as a unicode object. Else it uses a +# binary string. +# +# When reading the file, the input file name is used directly as an +# index into the catalog. +# +# When we send the file name data to the indexer, we have to serialize +# it as a byte string: we can't pass unicode objects to and fro. This +# means that we have to test if the name is unicode. If it is, we send +# the string encoded as utf-8. When reading, if the input is utf-8, we +# turn it to unicode and use this to access the zip member, else we +# use the binary string. +# +# In the case where an archive member name is a valid non-ascii utf-8 +# string, but the flag is not set (which could probably happen if the +# archiver did not try to detect utf-8 file names), this will fail, +# because we'll convert back the utf-8 string to unicode and pass this +# to zipfile, but a utf-8 string, not a unicode object, is actually in +# the catalog in this case, so the access will fail (will be seen at +# preview or open time). This does not affect ascii file names because +# the representation is the same anyway. +# +# To avoid this problem, we'd need to pass a separate bit of +# information indicating that encoding was performed, not just rely on +# the utf-8 validity test (i.e. have a 1st char switch), but this would be +# incompatible with existing indexes. Instead we try both ways... 
+# class ZipExtractor: def __init__(self, em): self.currentindex = 0 self.em = em - def extractzipentry(self, name): - return (ret, data) - def extractone(self, ipath): #self.em.rclog("extractone: [%s]" % ipath) docdata = "" @@ -20,11 +48,13 @@ class ZipExtractor: docdata = self.zip.read(ipath) ok = True except Exception, err: - self.em.rclog("extractone: failed: [%s]" % err) +# self.em.rclog("extractone: failed: [%s]" % err) ok = False iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.zip.namelist()) -1: iseof = rclexecm.RclExecM.eofnext + if isinstance(ipath, unicode): + ipath = ipath.encode("utf-8") return (ok, docdata, ipath, iseof) ###### File type handler api, used by rclexecm ----------> @@ -37,7 +67,16 @@ class ZipExtractor: return False def getipath(self, params): - return self.extractone(params["ipath:"]) + ipath = params["ipath:"] + ok, data, ipath, eof = self.extractone(ipath) + if ok: + return (ok, data, ipath, eof) + # Not found. Maybe we need to decode the path? + try: + ipath = ipath.decode("utf-8") + return self.extractone(ipath) + except Exception, err: + return (ok, data, ipath, eof) def getnext(self, params): if self.currentindex >= len(self.zip.namelist()):