diff --git a/src/filters/rclchm b/src/filters/rclchm index 1bf45f5b..b05e7898 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -1,25 +1,13 @@ #!/usr/bin/env python3 -"""Extract Html files from a Microsoft Compiled Html Help file (.chm) -Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" - -from __future__ import print_function - -rclchm_html_mtype = "text/html" +"""Extract Html files from a Microsoft Compiled Html Help file (.chm)""" import sys import os import re import posixpath -PY3 = sys.version > '3' -if PY3: - from urllib.parse import unquote as urllib_unquote - from urllib.parse import urlparse as urlparse_urlparse - from html.parser import HTMLParser -else: - from urlparse import urlparse as urlparse_urlparse - from urllib import unquote as urllib_unquote - from HTMLParser import HTMLParser - +from urllib.parse import unquote as urllib_unquote +from urllib.parse import urlparse as urlparse_urlparse +from html.parser import HTMLParser import subprocess import rclconfig @@ -40,6 +28,9 @@ except: print("RECFILTERROR HELPERNOTFOUND python3:chm") sys.exit(1); +def _deb(s): + print("%s"%s, file=sys.stderr) + # Small helper routines def getfile(chmfile, path): """Extract internal file text from chm object, given path""" @@ -47,11 +38,11 @@ def getfile(chmfile, path): raise Exception("Chm:getfile: must be called with path as bytes") res, ui = chmfile.ResolveObject(path) if res != chmlib.CHM_RESOLVE_SUCCESS: - #print("ResolveObject failed: %s" % path, file=sys.stderr) + #_deb("ResolveObject failed: %s" % path) return "" res, doc = chmfile.RetrieveObject(ui) if not res: - print("RetrieveObject failed: %s" % path, file=sys.stderr) + _deb("RetrieveObject failed: %s" % path) return "" return doc @@ -180,15 +171,13 @@ class ChmWalker(HTMLParser): path = "" if path: - #print "got path", path, "me", self.path, "dir", self.dir bpath = path.encode(self.rclchm.charset) if path[0] == "/"[0]: npath = posixpath.normpath(bpath) else: npath = posixpath.normpath(posixpath.join(self.dir, bpath)) if not npath in self.contents: - #print("Going into [%s] paths [%s]\n" % - #(npath,str(self.contents))) + #_deb("Going into [%s] paths [%s]\n" % (npath,str(self.contents))) text = getfile(self.chm, npath) if text: try: @@ -197,7 +186,8 @@ class ChmWalker(HTMLParser): newwalker.feed(t) except: pass - + + class rclCHM: """RclExecM slave worker for extracting all files from an Msoft chm file. We first extract the list of internal nodes, and them return them @@ -210,16 +200,17 @@ class rclCHM: cf = rclconfig.RclConfig() self.catenate = cf.getConfParam("chmcatenate") self.catenate = int(self.catenate) if self.catenate else False - if self.catenate: - self.em.setmimetype("text/plain") - else: - self.em.setmimetype(rclchm_html_mtype) + self.em.setmimetype("text/html") expr = b'''()''' self.asciito1252re = re.compile(expr, re.IGNORECASE) expr = b'''''' self.findcharsetre = re.compile(expr, re.IGNORECASE) + self._headtagre = re.compile(b'', re.IGNORECASE) + self._headerre = re.compile(b'(