Implement a workaround for character encoding issues in CHM files and Python's HTMLParser
This commit is contained in:
parent
a3cbe32d62
commit
e24bd240f9
@ -178,7 +178,7 @@ class ChmWalker(HTMLParser):
|
|||||||
if text:
|
if text:
|
||||||
try:
|
try:
|
||||||
newwalker = ChmWalker(self.rclchm, npath, self.contents)
|
newwalker = ChmWalker(self.rclchm, npath, self.contents)
|
||||||
newwalker.feed(text)
|
newwalker.feed(self.rclchm.fixencoding(text))
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -195,6 +195,10 @@ class rclCHM:
|
|||||||
self.em.setmimetype("text/plain")
|
self.em.setmimetype("text/plain")
|
||||||
else:
|
else:
|
||||||
self.em.setmimetype(rclchm_html_mtype)
|
self.em.setmimetype(rclchm_html_mtype)
|
||||||
|
expr = r'(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)'
|
||||||
|
self.asciito1252re = re.compile(expr, re.IGNORECASE)
|
||||||
|
expr = r'<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>'
|
||||||
|
self.findcharsetre = re.compile(expr, re.IGNORECASE)
|
||||||
|
|
||||||
def extractone(self, path):
|
def extractone(self, path):
|
||||||
"""Extract one path-named internal file from the chm file"""
|
"""Extract one path-named internal file from the chm file"""
|
||||||
@ -235,6 +239,28 @@ class rclCHM:
|
|||||||
alltxt += txt
|
alltxt += txt
|
||||||
return alltxt
|
return alltxt
|
||||||
|
|
||||||
|
def fixencoding(self, text):
    """Prepare a supposedly-HTML document for feeding to HTMLParser.

    Two things are done when *text* is a byte string:

    - The first 'ascii' / 'us-ascii' charset declaration matched by
      self.asciito1252re is rewritten to 'windows-1252'.
    - The document is decoded to unicode using the charset found by
      self.findcharsetre, because HTMLParser really wants unicode
      input. See http://bugs.python.org/issue3932.

    If the declared charset is unknown to Python, or the data is not
    valid in that charset, the document is decoded as windows-1252 with
    undecodable bytes replaced, instead of raising: not all callers
    guard this call with a try block.

    Returns *text* unchanged if it is not a byte string, a byte string
    if no charset declaration was found, and unicode otherwise.
    """

    # Memo. Charset decl example. Maybe we should also process the
    # HTML5 charset tag ?
    #<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=US-ASCII">

    # 'bytes' is an alias for 'str' on Python 2, so this is the same
    # test as before there; already-decoded text passes through.
    if not isinstance(text, bytes):
        return text

    # Run the regexps over a latin-1 view of the data: latin-1 maps
    # every byte to exactly one code point, so the round trip back to
    # bytes is lossless whatever the real encoding is.
    probe = text.decode("latin-1")

    # Fix an ascii charset decl to windows-1252
    probe = self.asciito1252re.sub(r"\1windows-1252\4", probe, 1)
    raw = probe.encode("latin-1")

    # Convert to unicode according to charset decl
    m = self.findcharsetre.search(probe)
    if not m:
        return raw

    try:
        return raw.decode(m.group(1))
    except (LookupError, UnicodeError):
        # Unknown charset name (LookupError) or bytes which are not
        # valid for the declared charset (UnicodeError): best effort.
        return raw.decode("windows-1252", "replace")
|
||||||
|
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
"""Open the chm file and build the contents list by extracting and
|
"""Open the chm file and build the contents list by extracting and
|
||||||
parsing the Topics object"""
|
parsing the Topics object"""
|
||||||
@ -255,7 +281,7 @@ class rclCHM:
|
|||||||
# Parse Topics file and extract list of internal nodes
|
# Parse Topics file and extract list of internal nodes
|
||||||
#self.em.rclog("Got topics");
|
#self.em.rclog("Got topics");
|
||||||
tp = ChmTopicsParser(self)
|
tp = ChmTopicsParser(self)
|
||||||
tp.feed(self.topics)
|
tp.feed(self.fixencoding(self.topics))
|
||||||
tp.close()
|
tp.close()
|
||||||
else:
|
else:
|
||||||
# No topics. If there is a home, let's try to walk the tree
|
# No topics. If there is a home, let's try to walk the tree
|
||||||
@ -271,7 +297,7 @@ class rclCHM:
|
|||||||
self.em.rclog("No topics and no home content")
|
self.em.rclog("No topics and no home content")
|
||||||
return False
|
return False
|
||||||
walker = ChmWalker(self, self.chm.home, self.contents)
|
walker = ChmWalker(self, self.chm.home, self.contents)
|
||||||
walker.feed(text)
|
walker.feed(self.fixencoding(text))
|
||||||
walker.close()
|
walker.close()
|
||||||
|
|
||||||
#self.em.rclog("Contents size %d" % len(self.contents))
|
#self.em.rclog("Contents size %d" % len(self.contents))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user