From c38db0f160606cf8c9148d1932335e78c3383696 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Sat, 18 Apr 2020 09:15:45 +0200
Subject: [PATCH] comment

---
 src/filters/rclzip | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/filters/rclzip b/src/filters/rclzip
index 8d27aa10..04fc0262 100755
--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@@ -46,11 +46,11 @@ if not hasrclconfig:
 #
 # There is a bit in zip entries to indicate if the filename is encoded
 # as utf-8 or not. If the bit is set, zipfile decodes the file name
-# and stores it in the catalog as an unicode object. Else it uses a
-# binary string.
+# and stores it in the catalog as a unicode object. Else it uses the
+# binary string, which it decodes as CP437 (zip standard).
 #
-# When reading the file, the input file name is used directly as an
-# index into the catalog.
+# When reading the file, the input file name is used by rclzip
+# directly as an index into the catalog.
 #
 # When we send the file name data to the indexer, we have to serialize
 # it as byte string, we can't pass unicode objects to and fro. This
@@ -73,6 +73,14 @@ if not hasrclconfig:
 # the utf-8 validity test (ie have a 1st char switch), but this would be
 # incompatible with existing indexes. Instead we try both ways...
 #
+# Also, some zip files contain file names which are not encoded as
+# CP437 (e.g. EUC-KR, which was the test case). Python produces garbage
+# paths in this case, which is expected (this only affects the
+# display, not the ipath validity). However, unzip does succeed in
+# guessing the correct encoding. I have no idea how, but the magic
+# apparently occurs in process.c:GetUnicodeData(), which finds a
+# UTF-8 string which zipfile does not see (to be checked: this was
+# only a quick look). Anyway: this is a Python zipfile issue.
 class ZipExtractor:
     def __init__(self, em):
         self.filename = None