From c38db0f160606cf8c9148d1932335e78c3383696 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 18 Apr 2020 09:15:45 +0200 Subject: [PATCH] comment --- src/filters/rclzip | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/filters/rclzip b/src/filters/rclzip index 8d27aa10..04fc0262 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -46,11 +46,11 @@ if not hasrclconfig: # # There is a bit in zip entries to indicate if the filename is encoded # as utf-8 or not. If the bit is set, zipfile decodes the file name -# and stores it in the catalog as an unicode object. Else it uses a -# binary string. +# and stores it in the catalog as a unicode object. Else it uses the +# binary string, which it decodes as CP437 (zip standard). # -# When reading the file, the input file name is used directly as an -# index into the catalog. +# When reading the file, the input file name is used by rclzip +# directly as an index into the catalog. # # When we send the file name data to the indexer, we have to serialize # it as byte string, we can't pass unicode objects to and fro. This @@ -73,6 +73,14 @@ if not hasrclconfig: # the utf-8 validity test (ie have a 1st char switch), but this would be # incompatible with existing indexes. Instead we try both ways... # +# Also, some zip files contain file names which are not encoded as +# CP437 (Ex: EUC-KR which was the test case). Python produces garbage +# paths in this case (this does not affect the ipath validity, just +# the display), which is expected, but unzip succeeds in guessing the +# correct encoding. I have no idea how, but apparently the magic +# occurs in process.c:GetUnicodeData(), which succeeds in finding a +# utf-8 string which zipfile does not see (to be checked: this was a quick look). +# Anyway: this is a python zipfile issue. class ZipExtractor: def __init__(self, em): self.filename = None