rclchm: python3 modifications

This commit is contained in:
Jean-Francois Dockes 2018-04-08 10:53:15 +02:00
parent 2aeb66854c
commit cedff8ce7c

View File

@ -17,9 +17,11 @@ PY3 = sys.version > '3'
if PY3: if PY3:
from urllib.parse import unquote as urllib_unquote from urllib.parse import unquote as urllib_unquote
from urllib.parse import urlparse as urlparse_urlparse from urllib.parse import urlparse as urlparse_urlparse
from html.parser import HTMLParser
else: else:
from urlparse import urlparse as urlparse_urlparse from urlparse import urlparse as urlparse_urlparse
from urllib import unquote as urllib_unquote from urllib import unquote as urllib_unquote
from HTMLParser import HTMLParser
import subprocess import subprocess
@ -32,15 +34,11 @@ except:
print("RECFILTERROR HELPERNOTFOUND python:chm") print("RECFILTERROR HELPERNOTFOUND python:chm")
sys.exit(1); sys.exit(1);
try:
from HTMLParser import HTMLParser
except:
print("RECFILTERROR HELPERNOTFOUND python:HTMLParser")
sys.exit(1);
# Small helper routines # Small helper routines
def getfile(chmfile, path): def getfile(chmfile, path):
"""Extract internal file text from chm object, given path""" """Extract internal file text from chm object, given path"""
if type(path) != type(b''):
raise Exception("Chm:getfile: must be called with path as bytes")
res, ui = chmfile.ResolveObject(path) res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS: if res != chmlib.CHM_RESOLVE_SUCCESS:
#print("ResolveObject failed: %s" % path, file=sys.stderr) #print("ResolveObject failed: %s" % path, file=sys.stderr)
@ -51,8 +49,10 @@ def getfile(chmfile, path):
return "" return ""
return doc return doc
def peekfile(chmfile, path): def peekfile(chmfile, path, charset):
"""Check that path resolves in chm object""" """Check that path resolves in chm object"""
if type(path) == type(u''):
path = path.encode(charset)
res, ui = chmfile.ResolveObject(path) res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS: if res != chmlib.CHM_RESOLVE_SUCCESS:
return False return False
@ -120,14 +120,14 @@ class ChmTopicsParser(HTMLParser):
# not work if the file is renamed. Just check that the internal # not work if the file is renamed. Just check that the internal
# path resolves. Old: if ll[-3] == self.rclchm.sfn: # path resolves. Old: if ll[-3] == self.rclchm.sfn:
localpath = ll[-1] localpath = ll[-1]
if not peekfile(self.rclchm.chm, localpath): if not peekfile(self.rclchm.chm, localpath, self.rclchm.charset):
#self.em.rclog("SKIPPING %s" % ll[-3]) #self.em.rclog("SKIPPING %s" % ll[-3])
localpath = "" localpath = ""
if len(localpath) != 0 and localpath.find("#") == -1: if len(localpath) != 0 and localpath.find("#") == -1:
if localpath[0] != '/': if localpath[0] != '/':
localpath = "/" + localpath localpath = "/" + localpath
self.rclchm.contents.append(localpath) self.rclchm.contents.append(localpath.encode(self.rclchm.charset))
# Used when there is no Topics node. Walk the links tree # Used when there is no Topics node. Walk the links tree
@ -141,6 +141,8 @@ class ChmWalker(HTMLParser):
self.rclchm = rclchm self.rclchm = rclchm
self.chm = rclchm.chm self.chm = rclchm.chm
self.contents = contents self.contents = contents
if type(path) == type(u''):
path = path.encode(self.rclchm.charset)
self.path = posixpath.normpath(path) self.path = posixpath.normpath(path)
self.dir = posixpath.dirname(self.path) self.dir = posixpath.dirname(self.path)
contents.append(self.path) contents.append(self.path)
@ -164,7 +166,7 @@ class ChmWalker(HTMLParser):
# know this never happens because there was a runtime error # know this never happens because there was a runtime error
# in this path # in this path
path = lpath[2] path = lpath[2]
if not peekfile(self.chm, path): if not peekfile(self.chm, path, self.rclchm.charset):
path = "" path = ""
elif len(lpath) == 1: elif len(lpath) == 1:
path = lpath[0] path = lpath[0]
@ -173,10 +175,11 @@ class ChmWalker(HTMLParser):
if path: if path:
#print "got path", path, "me", self.path, "dir", self.dir #print "got path", path, "me", self.path, "dir", self.dir
if path[0] == "/": bpath = path.encode(self.rclchm.charset)
npath = posixpath.normpath(path) if path[0] == "/"[0]:
npath = posixpath.normpath(bpath)
else: else:
npath = posixpath.normpath(posixpath.join(self.dir, path)) npath = posixpath.normpath(posixpath.join(self.dir, bpath))
if not npath in self.contents: if not npath in self.contents:
#print("Going into [%s] paths [%s]\n" % #print("Going into [%s] paths [%s]\n" %
#(npath,str(self.contents))) #(npath,str(self.contents)))
@ -184,7 +187,8 @@ class ChmWalker(HTMLParser):
if text: if text:
try: try:
newwalker = ChmWalker(self.rclchm, npath, self.contents) newwalker = ChmWalker(self.rclchm, npath, self.contents)
newwalker.feed(self.rclchm.fixencoding(text)) t,c = self.rclchm.fixencoding(text)
newwalker.feed(t)
except: except:
pass pass
@ -204,15 +208,17 @@ class rclCHM:
self.em.setmimetype("text/plain") self.em.setmimetype("text/plain")
else: else:
self.em.setmimetype(rclchm_html_mtype) self.em.setmimetype(rclchm_html_mtype)
expr = r'(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)' expr = b'''(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)'''
self.asciito1252re = re.compile(expr, re.IGNORECASE) self.asciito1252re = re.compile(expr, re.IGNORECASE)
expr = r'<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>' expr = b'''<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>'''
self.findcharsetre = re.compile(expr, re.IGNORECASE) self.findcharsetre = re.compile(expr, re.IGNORECASE)
def extractone(self, path): def extractone(self, path):
"""Extract one path-named internal file from the chm file""" """Extract one path-named internal file from the chm file"""
#self.em.rclog("extractone: [%s]" % (path,)) #self.em.rclog("extractone: [%s]" % (path,))
if type(path) == type(u''):
path = path.encode(self.charset)
iseof = rclexecm.RclExecM.noteof iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.contents) -1: if self.currentindex >= len(self.contents) -1:
iseof = rclexecm.RclExecM.eofnext iseof = rclexecm.RclExecM.eofnext
@ -225,8 +231,8 @@ class rclCHM:
res, doc = self.chm.RetrieveObject(ui) res, doc = self.chm.RetrieveObject(ui)
#self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc))
if res > 0: if res > 0:
doc = re.sub('''</[hH][eE][aA][dD]''', doc = re.sub(b'''</[hH][eE][aA][dD]''',
'''<meta name="rclaptg" content="chm"></head>''', doc) b'''<meta name="rclaptg" content="chm"></head>''', doc)
self.em.setmimetype(rclchm_html_mtype) self.em.setmimetype(rclchm_html_mtype)
return (True, doc, path, iseof) return (True, doc, path, iseof)
return (False, "", path, iseof) return (False, "", path, iseof)
@ -261,14 +267,17 @@ class rclCHM:
# HTML5 charset tag ? # HTML5 charset tag ?
#<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=US-ASCII"> #<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=US-ASCII">
if isinstance(text, str): if type(text) == type(b''):
# Fix an ascii charset decl to windows-1252 # Fix an ascii charset decl to windows-1252
text = self.asciito1252re.sub(r"\1windows-1252\4", text, 1) text = self.asciito1252re.sub(b'''\1windows-1252\4''', text, 1)
# Convert to unicode according to charset decl # Convert to unicode according to charset decl
m = self.findcharsetre.search(text) m = self.findcharsetre.search(text)
if m: if m:
text = text.decode(m.group(1)) charset = m.group(1).decode('cp1252')
return text else:
charset = 'cp1252'
text = text.decode(charset, errors='replace')
return text, charset
def openfile(self, params): def openfile(self, params):
"""Open the chm file and build the contents list by extracting and """Open the chm file and build the contents list by extracting and
@ -286,11 +295,13 @@ class rclCHM:
# (self.chm.home, self.chm.topics, self.chm.title)) # (self.chm.home, self.chm.topics, self.chm.title))
self.topics = self.chm.GetTopicsTree() self.topics = self.chm.GetTopicsTree()
self.charset = 'cp1252'
if self.topics: if self.topics:
# Parse Topics file and extract list of internal nodes # Parse Topics file and extract list of internal nodes
#self.em.rclog("Got topics"); #self.em.rclog("Got topics");
tp = ChmTopicsParser(self) tp = ChmTopicsParser(self)
tp.feed(self.fixencoding(self.topics)) text,self.charset = self.fixencoding(self.topics)
tp.feed(text)
tp.close() tp.close()
else: else:
# No topics. If there is a home, let's try to walk the tree # No topics. If there is a home, let's try to walk the tree
@ -299,14 +310,15 @@ class rclCHM:
self.em.rclog("No topics and no home") self.em.rclog("No topics and no home")
return False return False
home = self.chm.home home = self.chm.home
if home[0] != '/': if home[0] != b'/'[0]:
home = "/" + home home = b"/" + home
text = getfile(self.chm, home) text = getfile(self.chm, home)
if not text: if not text:
self.em.rclog("No topics and no home content") self.em.rclog("No topics and no home content")
return False return False
walker = ChmWalker(self, self.chm.home, self.contents) walker = ChmWalker(self, self.chm.home, self.contents)
walker.feed(self.fixencoding(text)) text,self.charset = self.fixencoding(text)
walker.feed(text)
walker.close() walker.close()
#self.em.rclog("Contents size %d" % len(self.contents)) #self.em.rclog("Contents size %d" % len(self.contents))