#!/usr/bin/env python2 """Extract Html files from a Microsoft Compiled Html Help file (.chm) Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" from __future__ import print_function # Note: this is not converted to python3, libchm does not have a # python3 wrapper at this point (2015-11) # Do we return individual chapters as html pages or concatenate everything? rclchm_catenate = 0 # Use special html type to allow for mimeconf/mimeview Open magic, # Or go the regular html way with text/html #rclchm_html_mtype = "text/x-chm-html" rclchm_html_mtype = "text/html" import sys import os import re import posixpath import urlparse import urllib if rclchm_catenate: import subprocess import rclexecm try: from chm import chm,chmlib except: print("RECFILTERROR HELPERNOTFOUND python:chm") sys.exit(1); try: from HTMLParser import HTMLParser except: print("RECFILTERROR HELPERNOTFOUND python:HTMLParser") sys.exit(1); # Small helper routines def getfile(chmfile, path): """Extract internal file text from chm object, given path""" res, ui = chmfile.ResolveObject(path) if res != chmlib.CHM_RESOLVE_SUCCESS: #print("ResolveObject failed: %s" % path, file=sys.stderr) return "" res, doc = chmfile.RetrieveObject(ui) if not res: print("RetrieveObject failed: %s" % path, file=sys.stderr) return "" return doc def peekfile(chmfile, path): """Check that path resolves in chm object""" res, ui = chmfile.ResolveObject(path) if res != chmlib.CHM_RESOLVE_SUCCESS: return False return True # CHM Topics tree handler class ChmTopicsParser(HTMLParser): """Parse the chm's Topic file which is basically a listing of internal nodes (html files mostly). Build a list of all nodes (parent.contents), which will then be used to walk and index the chm. Most nodes in the Topic file look like the following:
  • Maybe we should filter out non "text/sitemap" Objects, and maybe there are things of interest whose name is not Local, but for now, we just take all values for parameters named "Local" (with some filtering/massaging), until proven wrong """ def __init__(self, rclchm): HTMLParser.__init__(self) self.em = rclchm.em self.rclchm = rclchm def handle_starttag(self, tag, attrs): #self.em.rclog("Beginning of a %s tag" % tag) # If this is a param tag with name Local, we're interested in # the value which lists a file ref. Discard those with # # in them (references inside files) # Sometimes it seems that refs are like Vendor:filename::path, # we only keep the path, and only if the file matches if tag != 'param': return name = '' value = '' for (nm,val) in attrs: if nm == 'name': name = val if nm == 'value': value = val #self.em.rclog("Name [%s] value [%s]" %(name, value)) if name != 'Local' or value == '': return # value may be url-encoded. Decode it. If there are no % in there, will # do nothing value = urllib.unquote(value) localpath = "" ll = value.split(":") if len(ll) == 1: localpath = value elif len(ll) == 4 and ll[-1] and ll[-3]: #self.em.rclog("File: [%s] sfn [%s]" % ((ll[-3]), self.rclchm.sfn)) # We used to test against the simple file name, but this does # not work if the file is renamed. Just check that the internal # path resolves. Old: if ll[-3] == self.rclchm.sfn: localpath = ll[-1] if not peekfile(self.rclchm.chm, localpath): #self.em.rclog("SKIPPING %s" % ll[-3]) localpath = "" if len(localpath) != 0 and localpath.find("#") == -1: if localpath[0] != '/': localpath = "/" + localpath self.rclchm.contents.append(localpath) # Used when there is no Topics node. Walk the links tree class ChmWalker(HTMLParser): """Links tree walker. This recursively follows all internal links found in the tree from the top node given as input, and augments the contents list.""" def __init__(self, rclchm, path, contents): HTMLParser.__init__(self) self.rclchm = rclchm self.chm = rclchm.chm self.contents = contents self.path = posixpath.normpath(path) self.dir = posixpath.dirname(self.path) contents.append(self.path) def handle_starttag(self, tag, attrs): if tag != 'a': return href = '' for (nm,val) in attrs: if nm == 'href': href = val path = "" res = urlparse.urlparse(href) if (not res.scheme or res.scheme.lower == "ms-its"): path = res.path lpath = path.split(':') if len(lpath) == 3: # MS-ITS::somefile.chm:/some/path/file.htm ? As far as I # know this never happens because there was a runtime error # in this path path = lpath[2] if not peekfile(self.chm, path): path = "" elif len(lpath) == 1: path = lpath[0] else: path = "" if path: #print "got path", path, "me", self.path, "dir", self.dir if path[0] == "/": npath = posixpath.normpath(path) else: npath = posixpath.normpath(posixpath.join(self.dir, path)) if not npath in self.contents: #print("Going into [%s] paths [%s]\n" % #(npath,str(self.contents))) text = getfile(self.chm, npath) if text: try: newwalker = ChmWalker(self.rclchm, npath, self.contents) newwalker.feed(self.rclchm.fixencoding(text)) except: pass class rclCHM: """RclExecM slave worker for extracting all files from an Msoft chm file. We first extract the list of internal nodes, and them return them one by one. The ipath is the node path""" def __init__(self, em): self.contents = [] self.chm = chm.CHMFile() self.em = em if rclchm_catenate: self.em.setmimetype("text/plain") else: self.em.setmimetype(rclchm_html_mtype) expr = r'()' self.asciito1252re = re.compile(expr, re.IGNORECASE) expr = r'' self.findcharsetre = re.compile(expr, re.IGNORECASE) def extractone(self, path): """Extract one path-named internal file from the chm file""" #self.em.rclog("extractone: [%s]" % (path,)) iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.contents) -1: iseof = rclexecm.RclExecM.eofnext res, ui = self.chm.ResolveObject(path) #self.em.rclog("extract: ResolveO: %d [%s]" % (res, ui)) if res != chmlib.CHM_RESOLVE_SUCCESS: return (False, "", path, iseof) # RetrieveObject() returns len,value res, doc = self.chm.RetrieveObject(ui) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) if res > 0: doc = re.sub('''''', doc) self.em.setmimetype(rclchm_html_mtype) return (True, doc, path, iseof) return (False, "", path, iseof) def dumpall(self): alltxt="" for pth in self.contents: ret,doc,path,iseof = self.extractone(pth) if not ret: continue # Feed doc to lynx process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist", "-display_charset=utf8", "-force_html"], stdin=subprocess.PIPE, stdout=subprocess.PIPE ) txt,err = process.communicate(doc) alltxt += txt return alltxt def fixencoding(self, text): """Fix encoding for supposedly html document. We do 2 things here: - Change any 'ASCII' charset decl to windows-1252 because windows people can't learn and we have to cope. - Decode the string to unicode if it's originally an str because that's what Python HTMLParser actually expects even if it does not really say so. See http://bugs.python.org/issue3932. """ # Memo. Charset decl example. Maybe we should also process the # HTML5 charset tag ? # if isinstance(text, str): # Fix an ascii charset decl to windows-1252 text = self.asciito1252re.sub(r"\1windows-1252\4", text, 1) # Convert to unicode according to charset decl m = self.findcharsetre.search(text) if m: text = text.decode(m.group(1)) return text def openfile(self, params): """Open the chm file and build the contents list by extracting and parsing the Topics object""" self.currentindex = -1 self.contents = [] filename = params["filename:"] if not self.chm.LoadCHM(filename): self.em.rclog("LoadCHM failed") return False #self.em.rclog("home [%s] topics [%s] title [%s]" % # (self.chm.home, self.chm.topics, self.chm.title)) self.topics = self.chm.GetTopicsTree() if self.topics: # Parse Topics file and extract list of internal nodes #self.em.rclog("Got topics"); tp = ChmTopicsParser(self) tp.feed(self.fixencoding(self.topics)) tp.close() else: # No topics. If there is a home, let's try to walk the tree #self.em.rclog("GetTopicsTree failed") if not self.chm.home: self.em.rclog("No topics and no home") return False home = self.chm.home if home[0] != '/': home = "/" + home text = getfile(self.chm, home) if not text: self.em.rclog("No topics and no home content") return False walker = ChmWalker(self, self.chm.home, self.contents) walker.feed(self.fixencoding(text)) walker.close() #self.em.rclog("Contents size %d" % len(self.contents)) uniq = set(self.contents) self.contents = list(uniq) return True def getipath(self, params): return self.extractone(params["ipath:"]) def getnext(self, params): if rclchm_catenate: alltxt = self.dumpall() if alltxt: return (True, alltxt, "", rclexecm.RclExecM.eofnext) else: return (False, "", "", rclexecm.RclExecM.eofnow) if self.currentindex == -1: # Return "self" doc self.currentindex = 0 self.em.setmimetype('text/plain') if len(self.contents) == 0: eof = rclexecm.RclExecM.eofnext else: eof = rclexecm.RclExecM.noteof return (True, "", "", eof) if self.currentindex >= len(self.contents): return (False, "", "", rclexecm.RclExecM.eofnow) else: ret= self.extractone(self.contents[self.currentindex]) self.currentindex += 1 return ret proto = rclexecm.RclExecM() extract = rclCHM(proto) rclexecm.main(proto, extract)