From 7fcb7c9bf7ef5671d7692acf6724e5859e1c652d Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 12 Oct 2012 13:34:56 +0200 Subject: [PATCH] ensure chm file can be renamed --- src/filters/rclchm | 124 +++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 55 deletions(-) diff --git a/src/filters/rclchm b/src/filters/rclchm index 2bb78352..dff3b853 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -32,10 +32,33 @@ except: print "RECFILTERROR HELPERNOTFOUND python:HTMLParser" sys.exit(1); +# Small helper routines +def getfile(chmfile, path): + """Extract internal file text from chm object, given path""" + res, ui = chmfile.ResolveObject(path) + if res != chmlib.CHM_RESOLVE_SUCCESS: + #print "ResolveObject failed", path + return "" + res, doc = chmfile.RetrieveObject(ui) + if not res: + print "RetrieveObject failed", path + return "" + return doc + +def peekfile(chmfile, path): + """Check that path resolves in chm object""" + res, ui = chmfile.ResolveObject(path) + if res != chmlib.CHM_RESOLVE_SUCCESS: + return False + return True + + +# CHM Topics tree handler + class ChmTopicsParser(HTMLParser): """Parse the chm's Topic file which is basically a listing of internal nodes (html files mostly). Build a list of - all nodes (self.contents), which will then be used to walk and index + all nodes (parent.contents), which will then be used to walk and index the chm. Most nodes in the Topic file look like the following: @@ -49,10 +72,10 @@ class ChmTopicsParser(HTMLParser): all values for parameters named "Local" (with some filtering/massaging), until proven wrong """ - def __init__(self, em): + def __init__(self, rclchm): HTMLParser.__init__(self) - self.contents = [] - self.em = em + self.em = rclchm.em + self.rclchm = rclchm def handle_starttag(self, tag, attrs): #self.em.rclog("Beginning of a %s tag" % tag) @@ -86,47 +109,31 @@ class ChmTopicsParser(HTMLParser): if len(ll) == 1: localpath = value elif len(ll) == 4 and ll[-1] and ll[-3]: - #self.em.rclog("File: %s" % ll[-3]) - if ll[-3] == self.fname: - localpath = ll[-1] - else: + #self.em.rclog("File: [%s] sfn [%s]" % ((ll[-3]), self.rclchm.sfn)) + # We used to test against the simple file name, but this does + # not work if the file is renamed. Just check that the internal + # path resolves. Old: if ll[-3] == self.rclchm.sfn: + localpath = ll[-1] + if not peekfile(self.rclchm.chm, localpath): #self.em.rclog("SKIPPING %s" % ll[-3]) - pass + localpath = "" if len(localpath) != 0 and localpath.find("#") == -1: if localpath[0] != '/': localpath = "/" + localpath - self.contents.append(localpath) - - def reset(self): - self.contents = [] - self.fname = "" - HTMLParser.reset(self) - - def setname(self, name): - self.fname = name - -def getfile(chmfile, path): - """Extract internal file text from chm object, given path""" - res, ui = chmfile.ResolveObject(path) - if res != chmlib.CHM_RESOLVE_SUCCESS: - #print "ResolveObject failed", path - return "" - res, doc = chmfile.RetrieveObject(ui) - if not res: - print "RetrieveObject failed", path - return "" - return doc + self.rclchm.contents.append(localpath) +# Used when there is no Topics node. Walk the links tree class ChmWalker(HTMLParser): - """Links tree walker. This recursivelyfollows all internal links - found in the from the top node given as input, and augments the contents - list.""" + """Links tree walker. This recursively follows all internal links + found in the tree from the top node given as input, and augments + the contents list.""" - def __init__(self, chm, path, contents): + def __init__(self, rclchm, path, contents): HTMLParser.__init__(self) - self.chm = chm + self.rclchm = rclchm + self.chm = rclchm.chm self.contents = contents self.path = posixpath.normpath(path) self.dir = posixpath.dirname(self.path) @@ -146,9 +153,13 @@ class ChmWalker(HTMLParser): if (not res.scheme or res.scheme.lower == "ms-its"): path = res.path lpath = path.split(':') - if len(lpath) == 3 and lpath[1] == cefilename: - # MS-ITS::somefile.chm:/some/path/file.htm ? + if len(lpath) == 3: + # MS-ITS::somefile.chm:/some/path/file.htm ? As far as I + # know this never happens because there was a runtime error + # in this path path = lpath[2] + if not peekfile(self.chm, path): + path = "" elif len(lpath) == 1: path = lpath[0] else: @@ -166,7 +177,7 @@ class ChmWalker(HTMLParser): text = getfile(self.chm, npath) if text: try: - newwalker = ChmWalker(self.chm, npath, self.contents) + newwalker = ChmWalker(self.rclchm, npath, self.contents) newwalker.feed(text) except: pass @@ -177,8 +188,8 @@ class rclCHM: one by one. The ipath is the node path""" def __init__(self, em): + self.contents = [] self.chm = chm.CHMFile() - self.tp = ChmTopicsParser(em) self.currentindex = 0 self.em = em if rclchm_catenate: @@ -189,9 +200,9 @@ class rclCHM: def extractone(self, path): """Extract one path-named internal file from the chm file""" - #self.em.rclog("extractone: [%s]"%(path)) + #self.em.rclog("extractone: [%s]" % (path,)) iseof = rclexecm.RclExecM.noteof - if self.currentindex >= len(self.tp.contents) -1: + if self.currentindex >= len(self.contents) -1: iseof = rclexecm.RclExecM.eofnext res, ui = self.chm.ResolveObject(path) @@ -210,7 +221,7 @@ class rclCHM: def dumpall(self): alltxt="" - for pth in self.tp.contents: + for pth in self.contents: ret,doc,path,iseof = self.extractone(pth) if not ret: continue @@ -230,22 +241,25 @@ class rclCHM: parsing the Topics object""" self.currentindex = 0 - self.tp.reset() + self.contents = [] + filename = params["filename:"] if not self.chm.LoadCHM(filename): self.em.rclog("LoadCHM failed") return False - if not self.chm.GetArchiveInfo(): - self.em.rclog("GetArchiveInfo failed") - return False + + self.sfn = os.path.basename(filename) + + #self.em.rclog("home [%s] topics [%s] title [%s]" % + # (self.chm.home, self.chm.topics, self.chm.title)) self.topics = self.chm.GetTopicsTree() if self.topics: # Parse Topics file and extract list of internal nodes #self.em.rclog("Got topics"); - self.tp.setname(os.path.basename(filename)) - self.tp.feed(self.topics) - self.tp.close() + tp = ChmTopicsParser(self) + tp.feed(self.topics) + tp.close() else: # No topics. If there is a home, let's try to walk the tree #self.em.rclog("GetTopicsTree failed") @@ -259,13 +273,13 @@ class rclCHM: if not text: self.em.rclog("No topics and no home content") return False - walker = ChmWalker(self.chm, self.chm.home, self.tp.contents) + walker = ChmWalker(self, self.chm.home, self.contents) walker.feed(text) walker.close() - #self.em.rclog("Contents size %d" % len(self.tp.contents)) - uniq = set(self.tp.contents) - self.tp.contents = list(uniq) + #self.em.rclog("Contents size %d" % len(self.contents)) + uniq = set(self.contents) + self.contents = list(uniq) return True def getipath(self, params): @@ -279,10 +293,10 @@ class rclCHM: else: return (False, "", "", rclexecm.RclExecM.eofnow) - if self.currentindex >= len(self.tp.contents): + if self.currentindex >= len(self.contents): return (False, "", "", rclexecm.RclExecM.eofnow) else: - ret= self.extractone(self.tp.contents[self.currentindex]) + ret= self.extractone(self.contents[self.currentindex]) self.currentindex += 1 return ret