Chm: fix catenate mode which was broken a long time ago
This commit is contained in:
parent
2fd485366a
commit
e42a4e9669
@ -1,25 +1,13 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
|
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)"""
|
||||||
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
|
|
||||||
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
rclchm_html_mtype = "text/html"
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import posixpath
|
import posixpath
|
||||||
PY3 = sys.version > '3'
|
from urllib.parse import unquote as urllib_unquote
|
||||||
if PY3:
|
from urllib.parse import urlparse as urlparse_urlparse
|
||||||
from urllib.parse import unquote as urllib_unquote
|
from html.parser import HTMLParser
|
||||||
from urllib.parse import urlparse as urlparse_urlparse
|
|
||||||
from html.parser import HTMLParser
|
|
||||||
else:
|
|
||||||
from urlparse import urlparse as urlparse_urlparse
|
|
||||||
from urllib import unquote as urllib_unquote
|
|
||||||
from HTMLParser import HTMLParser
|
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import rclconfig
|
import rclconfig
|
||||||
@ -40,6 +28,9 @@ except:
|
|||||||
print("RECFILTERROR HELPERNOTFOUND python3:chm")
|
print("RECFILTERROR HELPERNOTFOUND python3:chm")
|
||||||
sys.exit(1);
|
sys.exit(1);
|
||||||
|
|
||||||
|
def _deb(s):
|
||||||
|
print("%s"%s, file=sys.stderr)
|
||||||
|
|
||||||
# Small helper routines
|
# Small helper routines
|
||||||
def getfile(chmfile, path):
|
def getfile(chmfile, path):
|
||||||
"""Extract internal file text from chm object, given path"""
|
"""Extract internal file text from chm object, given path"""
|
||||||
@ -47,11 +38,11 @@ def getfile(chmfile, path):
|
|||||||
raise Exception("Chm:getfile: must be called with path as bytes")
|
raise Exception("Chm:getfile: must be called with path as bytes")
|
||||||
res, ui = chmfile.ResolveObject(path)
|
res, ui = chmfile.ResolveObject(path)
|
||||||
if res != chmlib.CHM_RESOLVE_SUCCESS:
|
if res != chmlib.CHM_RESOLVE_SUCCESS:
|
||||||
#print("ResolveObject failed: %s" % path, file=sys.stderr)
|
#_deb("ResolveObject failed: %s" % path)
|
||||||
return ""
|
return ""
|
||||||
res, doc = chmfile.RetrieveObject(ui)
|
res, doc = chmfile.RetrieveObject(ui)
|
||||||
if not res:
|
if not res:
|
||||||
print("RetrieveObject failed: %s" % path, file=sys.stderr)
|
_deb("RetrieveObject failed: %s" % path)
|
||||||
return ""
|
return ""
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@ -180,15 +171,13 @@ class ChmWalker(HTMLParser):
|
|||||||
path = ""
|
path = ""
|
||||||
|
|
||||||
if path:
|
if path:
|
||||||
#print "got path", path, "me", self.path, "dir", self.dir
|
|
||||||
bpath = path.encode(self.rclchm.charset)
|
bpath = path.encode(self.rclchm.charset)
|
||||||
if path[0] == "/"[0]:
|
if path[0] == "/"[0]:
|
||||||
npath = posixpath.normpath(bpath)
|
npath = posixpath.normpath(bpath)
|
||||||
else:
|
else:
|
||||||
npath = posixpath.normpath(posixpath.join(self.dir, bpath))
|
npath = posixpath.normpath(posixpath.join(self.dir, bpath))
|
||||||
if not npath in self.contents:
|
if not npath in self.contents:
|
||||||
#print("Going into [%s] paths [%s]\n" %
|
#_deb("Going into [%s] paths [%s]\n" % (npath,str(self.contents)))
|
||||||
#(npath,str(self.contents)))
|
|
||||||
text = getfile(self.chm, npath)
|
text = getfile(self.chm, npath)
|
||||||
if text:
|
if text:
|
||||||
try:
|
try:
|
||||||
@ -197,7 +186,8 @@ class ChmWalker(HTMLParser):
|
|||||||
newwalker.feed(t)
|
newwalker.feed(t)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class rclCHM:
|
class rclCHM:
|
||||||
"""RclExecM slave worker for extracting all files from an Msoft chm
|
"""RclExecM slave worker for extracting all files from an Msoft chm
|
||||||
file. We first extract the list of internal nodes, and then return them
|
file. We first extract the list of internal nodes, and then return them
|
||||||
@ -210,16 +200,17 @@ class rclCHM:
|
|||||||
cf = rclconfig.RclConfig()
|
cf = rclconfig.RclConfig()
|
||||||
self.catenate = cf.getConfParam("chmcatenate")
|
self.catenate = cf.getConfParam("chmcatenate")
|
||||||
self.catenate = int(self.catenate) if self.catenate else False
|
self.catenate = int(self.catenate) if self.catenate else False
|
||||||
if self.catenate:
|
self.em.setmimetype("text/html")
|
||||||
self.em.setmimetype("text/plain")
|
|
||||||
else:
|
|
||||||
self.em.setmimetype(rclchm_html_mtype)
|
|
||||||
expr = b'''(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)'''
|
expr = b'''(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)'''
|
||||||
self.asciito1252re = re.compile(expr, re.IGNORECASE)
|
self.asciito1252re = re.compile(expr, re.IGNORECASE)
|
||||||
expr = b'''<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>'''
|
expr = b'''<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>'''
|
||||||
self.findcharsetre = re.compile(expr, re.IGNORECASE)
|
self.findcharsetre = re.compile(expr, re.IGNORECASE)
|
||||||
|
self._headtagre = re.compile(b'</head>', re.IGNORECASE)
|
||||||
|
self._headerre = re.compile(b'(<head.*</head>)', re.IGNORECASE|re.DOTALL)
|
||||||
|
self._bodyre = re.compile(b'<body[^>]*>(.*)</body>', re.IGNORECASE|re.DOTALL)
|
||||||
|
|
||||||
def extractone(self, path):
|
|
||||||
|
def extractone(self, path, norclaptag=False):
|
||||||
"""Extract one path-named internal file from the chm file"""
|
"""Extract one path-named internal file from the chm file"""
|
||||||
|
|
||||||
#self.em.rclog("extractone: [%s]" % (path,))
|
#self.em.rclog("extractone: [%s]" % (path,))
|
||||||
@ -237,34 +228,45 @@ class rclCHM:
|
|||||||
res, doc = self.chm.RetrieveObject(ui)
|
res, doc = self.chm.RetrieveObject(ui)
|
||||||
#self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc))
|
#self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc))
|
||||||
if res > 0:
|
if res > 0:
|
||||||
doc = re.sub(b'''</[hH][eE][aA][dD]''',
|
if not norclaptag:
|
||||||
b'''<meta name="rclaptg" content="chm"></head>''', doc)
|
doc = self._headtagre.sub(b'''<meta name="rclaptg" content="chm"></head>''', doc)
|
||||||
self.em.setmimetype(rclchm_html_mtype)
|
|
||||||
return (True, doc, path, iseof)
|
return (True, doc, path, iseof)
|
||||||
return (False, "", path, iseof)
|
return (False, "", path, iseof)
|
||||||
|
|
||||||
def dumpall(self):
|
def dumpall(self):
|
||||||
alltxt=b""
|
alltxt=b""
|
||||||
|
first = True
|
||||||
for pth in self.contents:
|
for pth in self.contents:
|
||||||
ret,doc,path,iseof = self.extractone(pth)
|
ret,doc,path,iseof = self.extractone(pth, norclaptag=True)
|
||||||
if not ret:
|
if not ret:
|
||||||
continue
|
continue
|
||||||
# Feed doc to lynx
|
if first:
|
||||||
process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
|
# Save a header
|
||||||
"-display_charset=utf8",
|
headmatch = self._headerre.search(doc)
|
||||||
"-force_html"],
|
if headmatch:
|
||||||
stdin=subprocess.PIPE,
|
header = headmatch[1]
|
||||||
stdout=subprocess.PIPE
|
#_deb("HEADER [%s]" % header)
|
||||||
)
|
#_deb("type(self.chm.title) %s" % type(self.chm.title))
|
||||||
txt,err = process.communicate(doc)
|
if type(self.chm.title) == type(""):
|
||||||
alltxt += txt
|
title = self.chm.title.encode(self.charset)
|
||||||
|
else:
|
||||||
|
title = self.chm.title
|
||||||
|
header = re.sub(b"<title.*</title>", b"<title>" + title + b"</title>",
|
||||||
|
doc, re.IGNORECASE|re.DOTALL)
|
||||||
|
first = False
|
||||||
|
alltxt += header + b"<body>"
|
||||||
|
body = self._bodyre.search(doc)
|
||||||
|
if body:
|
||||||
|
body = body[1]
|
||||||
|
#_deb("BODY [%s]" % body[0:200])
|
||||||
|
alltxt += body
|
||||||
|
alltxt += b"</body></html>"
|
||||||
return alltxt
|
return alltxt
|
||||||
|
|
||||||
def fixencoding(self, text):
|
def fixencoding(self, text):
|
||||||
"""Fix encoding for supposedly html document. We do 2 things here:
|
"""Fix encoding for supposedly html document. We do 2 things here:
|
||||||
- Change any 'ASCII' charset decl to windows-1252 because windows
|
- Change any 'ASCII' charset decl to windows-1252
|
||||||
people can't learn and we have to cope.
|
- Decode the string if it's originally bytes because
|
||||||
- Decode the string to unicode if it's originally an str because
|
|
||||||
that's what Python HTMLParser actually expects even if it does not
|
that's what Python HTMLParser actually expects even if it does not
|
||||||
really say so. See http://bugs.python.org/issue3932.
|
really say so. See http://bugs.python.org/issue3932.
|
||||||
"""
|
"""
|
||||||
@ -330,9 +332,15 @@ class rclCHM:
|
|||||||
walker.feed(text)
|
walker.feed(text)
|
||||||
walker.close()
|
walker.close()
|
||||||
|
|
||||||
#self.em.rclog("Contents size %d" % len(self.contents))
|
# Eliminate duplicates but keep order (can't directly use set)
|
||||||
uniq = set(self.contents)
|
u = set()
|
||||||
self.contents = list(uniq)
|
ct = []
|
||||||
|
for t in self.contents:
|
||||||
|
if t not in u:
|
||||||
|
ct.append(t)
|
||||||
|
u.add(t)
|
||||||
|
self.contents = ct
|
||||||
|
#self.em.rclog("Contents size %d contents %s" % (len(self.contents), self.contents))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def getipath(self, params):
|
def getipath(self, params):
|
||||||
@ -340,7 +348,6 @@ class rclCHM:
|
|||||||
|
|
||||||
def getnext(self, params):
|
def getnext(self, params):
|
||||||
if self.catenate:
|
if self.catenate:
|
||||||
self.em.setmimetype("text/plain")
|
|
||||||
alltxt = self.dumpall()
|
alltxt = self.dumpall()
|
||||||
self.closefile()
|
self.closefile()
|
||||||
if alltxt:
|
if alltxt:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user