From cedff8ce7cc5a60b6fe891f3147ef092eb2363be Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 8 Apr 2018 10:53:15 +0200 Subject: [PATCH] rclchm: python3 modifications --- src/filters/rclchm | 64 +++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/filters/rclchm b/src/filters/rclchm index 49f8728a..f9811c37 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -17,9 +17,11 @@ PY3 = sys.version > '3' if PY3: from urllib.parse import unquote as urllib_unquote from urllib.parse import urlparse as urlparse_urlparse + from html.parser import HTMLParser else: from urlparse import urlparse as urlparse_urlparse from urllib import unquote as urllib_unquote + from HTMLParser import HTMLParser import subprocess @@ -32,15 +34,11 @@ except: print("RECFILTERROR HELPERNOTFOUND python:chm") sys.exit(1); -try: - from HTMLParser import HTMLParser -except: - print("RECFILTERROR HELPERNOTFOUND python:HTMLParser") - sys.exit(1); - # Small helper routines def getfile(chmfile, path): """Extract internal file text from chm object, given path""" + if type(path) != type(b''): + raise Exception("Chm:getfile: must be called with path as bytes") res, ui = chmfile.ResolveObject(path) if res != chmlib.CHM_RESOLVE_SUCCESS: #print("ResolveObject failed: %s" % path, file=sys.stderr) @@ -51,8 +49,10 @@ def getfile(chmfile, path): return "" return doc -def peekfile(chmfile, path): +def peekfile(chmfile, path, charset): """Check that path resolves in chm object""" + if type(path) == type(u''): + path = path.encode(charset) res, ui = chmfile.ResolveObject(path) if res != chmlib.CHM_RESOLVE_SUCCESS: return False @@ -120,14 +120,14 @@ class ChmTopicsParser(HTMLParser): # not work if the file is renamed. Just check that the internal # path resolves. Old: if ll[-3] == self.rclchm.sfn: localpath = ll[-1] - if not peekfile(self.rclchm.chm, localpath): + if not peekfile(self.rclchm.chm, localpath, self.rclchm.charset): #self.em.rclog("SKIPPING %s" % ll[-3]) localpath = "" if len(localpath) != 0 and localpath.find("#") == -1: if localpath[0] != '/': localpath = "/" + localpath - self.rclchm.contents.append(localpath) + self.rclchm.contents.append(localpath.encode(self.rclchm.charset)) # Used when there is no Topics node. Walk the links tree @@ -141,6 +141,8 @@ class ChmWalker(HTMLParser): self.rclchm = rclchm self.chm = rclchm.chm self.contents = contents + if type(path) == type(u''): + path = path.encode(self.rclchm.charset) self.path = posixpath.normpath(path) self.dir = posixpath.dirname(self.path) contents.append(self.path) @@ -164,7 +166,7 @@ class ChmWalker(HTMLParser): # know this never happens because there was a runtime error # in this path path = lpath[2] - if not peekfile(self.chm, path): + if not peekfile(self.chm, path, self.rclchm.charset): path = "" elif len(lpath) == 1: path = lpath[0] @@ -173,10 +175,11 @@ class ChmWalker(HTMLParser): if path: #print "got path", path, "me", self.path, "dir", self.dir - if path[0] == "/": - npath = posixpath.normpath(path) + bpath = path.encode(self.rclchm.charset) + if path[0] == "/"[0]: + npath = posixpath.normpath(bpath) else: - npath = posixpath.normpath(posixpath.join(self.dir, path)) + npath = posixpath.normpath(posixpath.join(self.dir, bpath)) if not npath in self.contents: #print("Going into [%s] paths [%s]\n" % #(npath,str(self.contents))) @@ -184,7 +187,8 @@ class ChmWalker(HTMLParser): if text: try: newwalker = ChmWalker(self.rclchm, npath, self.contents) - newwalker.feed(self.rclchm.fixencoding(text)) + t,c = self.rclchm.fixencoding(text) + newwalker.feed(t) except: pass @@ -204,15 +208,17 @@ class rclCHM: self.em.setmimetype("text/plain") else: self.em.setmimetype(rclchm_html_mtype) - expr = r'()' + expr = b'''()''' self.asciito1252re = re.compile(expr, re.IGNORECASE) - expr = r'' + expr = b'''''' self.findcharsetre = re.compile(expr, re.IGNORECASE) def extractone(self, path): """Extract one path-named internal file from the chm file""" #self.em.rclog("extractone: [%s]" % (path,)) + if type(path) == type(u''): + path = path.encode(self.charset) iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.contents) -1: iseof = rclexecm.RclExecM.eofnext @@ -225,8 +231,8 @@ class rclCHM: res, doc = self.chm.RetrieveObject(ui) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) if res > 0: - doc = re.sub('''''', doc) + doc = re.sub(b'''''', doc) self.em.setmimetype(rclchm_html_mtype) return (True, doc, path, iseof) return (False, "", path, iseof) @@ -261,14 +267,17 @@ class rclCHM: # HTML5 charset tag ? # - if isinstance(text, str): + if type(text) == type(b''): # Fix an ascii charset decl to windows-1252 - text = self.asciito1252re.sub(r"\1windows-1252\4", text, 1) + text = self.asciito1252re.sub(b'''\1windows-1252\4''', text, 1) # Convert to unicode according to charset decl m = self.findcharsetre.search(text) if m: - text = text.decode(m.group(1)) - return text + charset = m.group(1).decode('cp1252') + else: + charset = 'cp1252' + text = text.decode(charset, errors='replace') + return text, charset def openfile(self, params): """Open the chm file and build the contents list by extracting and @@ -286,11 +295,13 @@ class rclCHM: # (self.chm.home, self.chm.topics, self.chm.title)) self.topics = self.chm.GetTopicsTree() + self.charset = 'cp1252' if self.topics: # Parse Topics file and extract list of internal nodes #self.em.rclog("Got topics"); tp = ChmTopicsParser(self) - tp.feed(self.fixencoding(self.topics)) + text,self.charset = self.fixencoding(self.topics) + tp.feed(text) tp.close() else: # No topics. If there is a home, let's try to walk the tree @@ -299,14 +310,15 @@ class rclCHM: self.em.rclog("No topics and no home") return False home = self.chm.home - if home[0] != '/': - home = "/" + home + if home[0] != b'/'[0]: + home = b"/" + home text = getfile(self.chm, home) if not text: self.em.rclog("No topics and no home content") return False walker = ChmWalker(self, self.chm.home, self.contents) - walker.feed(self.fixencoding(text)) + text,self.charset = self.fixencoding(text) + walker.feed(text) walker.close() #self.em.rclog("Contents size %d" % len(self.contents))