try to handle the special handling of utf-8 paths inside zipfile
This commit is contained in:
parent
7eb182f53c
commit
205fdde5a9
@ -5,14 +5,42 @@
|
||||
import rclexecm
|
||||
from zipfile import ZipFile
|
||||
|
||||
# Note about file names (python 2.6, 2.7; don't know about 3.x)
|
||||
#
|
||||
# There is a bit in zip entries to indicate if the filename is encoded
|
||||
# as utf-8 or not. If the bit is set, zipfile decodes the file name
|
||||
# and stores it in the catalog as a unicode object. Else it uses a
|
||||
# binary string.
|
||||
#
|
||||
# When reading the file, the input file name is used directly as an
|
||||
# index into the catalog.
|
||||
#
|
||||
# When we send the file name data to the indexer, we have to serialize
|
||||
# it as a byte string; we can't pass unicode objects to and fro. This
|
||||
# means that we have to test if the name is unicode. If it is, we send
|
||||
# the string encoded as utf-8. When reading, if the input is utf-8, we
|
||||
# turn it to unicode and use this to access the zip member, else we
|
||||
# use the binary string.
|
||||
#
|
||||
# In the case where an archive member name is a valid non-ascii utf-8
|
||||
# string, but the flag is not set (which could probably happen if the
|
||||
# archiver did not try to detect utf-8 file names), this will fail,
|
||||
# because we'll convert back the utf-8 string to unicode and pass this
|
||||
# to zipfile, but a utf-8 string, not a unicode object, is actually in
|
||||
# the catalog in this case, so the access will fail (will be seen at
|
||||
# preview or open time). This does not affect ascii file names because
|
||||
# the representation is the same anyway.
|
||||
#
|
||||
# To avoid this problem, we'd need to pass a separate bit of
|
||||
# information indicating that encoding was performed, not just rely on
|
||||
# the utf-8 validity test (i.e. have a 1st char switch), but this would be
|
||||
# incompatible with existing indexes. Instead we try both ways...
|
||||
#
|
||||
class ZipExtractor:
|
||||
def __init__(self, em):
|
||||
self.currentindex = 0
|
||||
self.em = em
|
||||
|
||||
def extractzipentry(self, name):
|
||||
return (ret, data)
|
||||
|
||||
def extractone(self, ipath):
|
||||
#self.em.rclog("extractone: [%s]" % ipath)
|
||||
docdata = ""
|
||||
@ -20,11 +48,13 @@ class ZipExtractor:
|
||||
docdata = self.zip.read(ipath)
|
||||
ok = True
|
||||
except Exception, err:
|
||||
self.em.rclog("extractone: failed: [%s]" % err)
|
||||
# self.em.rclog("extractone: failed: [%s]" % err)
|
||||
ok = False
|
||||
iseof = rclexecm.RclExecM.noteof
|
||||
if self.currentindex >= len(self.zip.namelist()) -1:
|
||||
iseof = rclexecm.RclExecM.eofnext
|
||||
if isinstance(ipath, unicode):
|
||||
ipath = ipath.encode("utf-8")
|
||||
return (ok, docdata, ipath, iseof)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
@ -37,7 +67,16 @@ class ZipExtractor:
|
||||
return False
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params["ipath:"])
|
||||
ipath = params["ipath:"]
|
||||
ok, data, ipath, eof = self.extractone(ipath)
|
||||
if ok:
|
||||
return (ok, data, ipath, eof)
|
||||
# Not found. Maybe we need to decode the path?
|
||||
try:
|
||||
ipath = ipath.decode("utf-8")
|
||||
return self.extractone(ipath)
|
||||
except Exception, err:
|
||||
return (ok, data, ipath, eof)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= len(self.zip.namelist()):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user