Chm: fix catenate mode which was broken a long time ago

This commit is contained in:
Jean-Francois Dockes 2021-05-01 10:29:44 +02:00
parent 2fd485366a
commit e42a4e9669

View File

@ -1,25 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Extract Html files from a Microsoft Compiled Html Help file (.chm) """Extract Html files from a Microsoft Compiled Html Help file (.chm)"""
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
from __future__ import print_function
rclchm_html_mtype = "text/html"
import sys import sys
import os import os
import re import re
import posixpath import posixpath
PY3 = sys.version > '3' from urllib.parse import unquote as urllib_unquote
if PY3: from urllib.parse import urlparse as urlparse_urlparse
from urllib.parse import unquote as urllib_unquote from html.parser import HTMLParser
from urllib.parse import urlparse as urlparse_urlparse
from html.parser import HTMLParser
else:
from urlparse import urlparse as urlparse_urlparse
from urllib import unquote as urllib_unquote
from HTMLParser import HTMLParser
import subprocess import subprocess
import rclconfig import rclconfig
@ -40,6 +28,9 @@ except:
print("RECFILTERROR HELPERNOTFOUND python3:chm") print("RECFILTERROR HELPERNOTFOUND python3:chm")
sys.exit(1); sys.exit(1);
def _deb(s):
    """Debug helper: write message s, followed by a newline, to stderr."""
    sys.stderr.write("%s\n" % s)
# Small helper routines # Small helper routines
def getfile(chmfile, path): def getfile(chmfile, path):
"""Extract internal file text from chm object, given path""" """Extract internal file text from chm object, given path"""
@ -47,11 +38,11 @@ def getfile(chmfile, path):
raise Exception("Chm:getfile: must be called with path as bytes") raise Exception("Chm:getfile: must be called with path as bytes")
res, ui = chmfile.ResolveObject(path) res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS: if res != chmlib.CHM_RESOLVE_SUCCESS:
#print("ResolveObject failed: %s" % path, file=sys.stderr) #_deb("ResolveObject failed: %s" % path)
return "" return ""
res, doc = chmfile.RetrieveObject(ui) res, doc = chmfile.RetrieveObject(ui)
if not res: if not res:
print("RetrieveObject failed: %s" % path, file=sys.stderr) _deb("RetrieveObject failed: %s" % path)
return "" return ""
return doc return doc
@ -180,15 +171,13 @@ class ChmWalker(HTMLParser):
path = "" path = ""
if path: if path:
#print "got path", path, "me", self.path, "dir", self.dir
bpath = path.encode(self.rclchm.charset) bpath = path.encode(self.rclchm.charset)
if path[0] == "/"[0]: if path[0] == "/"[0]:
npath = posixpath.normpath(bpath) npath = posixpath.normpath(bpath)
else: else:
npath = posixpath.normpath(posixpath.join(self.dir, bpath)) npath = posixpath.normpath(posixpath.join(self.dir, bpath))
if not npath in self.contents: if not npath in self.contents:
#print("Going into [%s] paths [%s]\n" % #_deb("Going into [%s] paths [%s]\n" % (npath,str(self.contents)))
#(npath,str(self.contents)))
text = getfile(self.chm, npath) text = getfile(self.chm, npath)
if text: if text:
try: try:
@ -197,7 +186,8 @@ class ChmWalker(HTMLParser):
newwalker.feed(t) newwalker.feed(t)
except: except:
pass pass
class rclCHM: class rclCHM:
"""RclExecM slave worker for extracting all files from an Msoft chm """RclExecM slave worker for extracting all files from an Msoft chm
file. We first extract the list of internal nodes, and them return them file. We first extract the list of internal nodes, and them return them
@ -210,16 +200,17 @@ class rclCHM:
cf = rclconfig.RclConfig() cf = rclconfig.RclConfig()
self.catenate = cf.getConfParam("chmcatenate") self.catenate = cf.getConfParam("chmcatenate")
self.catenate = int(self.catenate) if self.catenate else False self.catenate = int(self.catenate) if self.catenate else False
if self.catenate: self.em.setmimetype("text/html")
self.em.setmimetype("text/plain")
else:
self.em.setmimetype(rclchm_html_mtype)
expr = b'''(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)''' expr = b'''(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)'''
self.asciito1252re = re.compile(expr, re.IGNORECASE) self.asciito1252re = re.compile(expr, re.IGNORECASE)
expr = b'''<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>''' expr = b'''<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>'''
self.findcharsetre = re.compile(expr, re.IGNORECASE) self.findcharsetre = re.compile(expr, re.IGNORECASE)
self._headtagre = re.compile(b'</head>', re.IGNORECASE)
self._headerre = re.compile(b'(<head.*</head>)', re.IGNORECASE|re.DOTALL)
self._bodyre = re.compile(b'<body[^>]*>(.*)</body>', re.IGNORECASE|re.DOTALL)
def extractone(self, path):
def extractone(self, path, norclaptag=False):
"""Extract one path-named internal file from the chm file""" """Extract one path-named internal file from the chm file"""
#self.em.rclog("extractone: [%s]" % (path,)) #self.em.rclog("extractone: [%s]" % (path,))
@ -237,34 +228,45 @@ class rclCHM:
res, doc = self.chm.RetrieveObject(ui) res, doc = self.chm.RetrieveObject(ui)
#self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc))
if res > 0: if res > 0:
doc = re.sub(b'''</[hH][eE][aA][dD]''', if not norclaptag:
b'''<meta name="rclaptg" content="chm"></head>''', doc) doc = self._headtagre.sub(b'''<meta name="rclaptg" content="chm"></head>''', doc)
self.em.setmimetype(rclchm_html_mtype)
return (True, doc, path, iseof) return (True, doc, path, iseof)
return (False, "", path, iseof) return (False, "", path, iseof)
def dumpall(self):
    """Concatenate all internal documents into a single HTML document.

    Used in catenate mode. The <head> section of the first document which
    can be extracted becomes the header of the result, with its <title>
    replaced by the title of the chm file. The contents of the <body>
    element of every extracted document are then appended, in
    self.contents order.

    Returns the resulting HTML as bytes, or b"" if no document could be
    extracted (so that the caller's emptiness test behaves as expected).
    """
    header = None
    bodies = []
    for pth in self.contents:
        # norclaptag=True: do not insert the rclaptg meta tag into each
        # part, we only keep one header for the whole result.
        ret, doc, _path, _iseof = self.extractone(pth, norclaptag=True)
        if not ret:
            continue

        if header is None:
            # First usable document: build the common header from it.
            title = self.chm.title
            if isinstance(title, str):
                title = title.encode(self.charset)
            elif not title:
                # NOTE(review): assumes a missing title shows up as a
                # falsy value (None or b"") -- confirm against chmlib.
                title = b""
            newtitle = b"<title>" + title + b"</title>"

            headmatch = self._headerre.search(doc)
            if headmatch:
                # Substitute inside the extracted header only (not the
                # whole document). flags must be passed by keyword: the
                # 4th positional parameter of re.sub() is count. A
                # callable replacement keeps backslashes in the title
                # from being interpreted as template escapes.
                header = re.sub(b"<title.*?</title>",
                                lambda _m: newtitle,
                                headmatch.group(1), count=1,
                                flags=re.IGNORECASE | re.DOTALL)
            else:
                # No usable <head> in the first document: synthesize one.
                header = b"<head>" + newtitle + b"</head>"

        bodymatch = self._bodyre.search(doc)
        if bodymatch:
            #_deb("BODY [%s]" % bodymatch.group(1)[0:200])
            bodies.append(bodymatch.group(1))

    if header is None:
        # Nothing could be extracted.
        return b""

    # Join once instead of repeated bytes concatenation.
    return (b"<html>" + header + b"<body>" + b"".join(bodies) +
            b"</body></html>")
def fixencoding(self, text): def fixencoding(self, text):
"""Fix encoding for supposedly html document. We do 2 things here: """Fix encoding for supposedly html document. We do 2 things here:
- Change any 'ASCII' charset decl to windows-1252 because windows - Change any 'ASCII' charset decl to windows-1252
people can't learn and we have to cope. - Decode the string if it's originally bytes because
- Decode the string to unicode if it's originally an str because
that's what Python HTMLParser actually expects even if it does not that's what Python HTMLParser actually expects even if it does not
really say so. See http://bugs.python.org/issue3932. really say so. See http://bugs.python.org/issue3932.
""" """
@ -330,9 +332,15 @@ class rclCHM:
walker.feed(text) walker.feed(text)
walker.close() walker.close()
#self.em.rclog("Contents size %d" % len(self.contents)) # Eliminate duplicates but keep order (can't directly use set)
uniq = set(self.contents) u = set()
self.contents = list(uniq) ct = []
for t in self.contents:
if t not in u:
ct.append(t)
u.add(t)
self.contents = ct
#self.em.rclog("Contents size %d contents %s" % (len(self.contents), self.contents))
return True return True
def getipath(self, params): def getipath(self, params):
@ -340,7 +348,6 @@ class rclCHM:
def getnext(self, params): def getnext(self, params):
if self.catenate: if self.catenate:
self.em.setmimetype("text/plain")
alltxt = self.dumpall() alltxt = self.dumpall()
self.closefile() self.closefile()
if alltxt: if alltxt: