ensure chm file can be renamed

This commit is contained in:
Jean-Francois Dockes 2012-10-12 13:34:56 +02:00
parent d4edbbaedb
commit 7fcb7c9bf7

View File

@ -32,10 +32,33 @@ except:
print "RECFILTERROR HELPERNOTFOUND python:HTMLParser" print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
sys.exit(1); sys.exit(1);
# Small helper routines
def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
#print "ResolveObject failed", path
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
print "RetrieveObject failed", path
return ""
return doc
def peekfile(chmfile, path):
    """Tell whether path names an object which resolves inside chmfile.

    Only ResolveObject() is called: the object content is not retrieved.
    Returns True if the resolution status is chmlib.CHM_RESOLVE_SUCCESS,
    else False.
    """
    status, _unused = chmfile.ResolveObject(path)
    return status == chmlib.CHM_RESOLVE_SUCCESS
# CHM Topics tree handler
class ChmTopicsParser(HTMLParser): class ChmTopicsParser(HTMLParser):
"""Parse the chm's Topic file which is basically """Parse the chm's Topic file which is basically
a listing of internal nodes (html files mostly). Build a list of a listing of internal nodes (html files mostly). Build a list of
all nodes (self.contents), which will then be used to walk and index all nodes (parent.contents), which will then be used to walk and index
the chm. the chm.
Most nodes in the Topic file look like the following: Most nodes in the Topic file look like the following:
@ -49,10 +72,10 @@ class ChmTopicsParser(HTMLParser):
all values for parameters named "Local" (with some filtering/massaging), all values for parameters named "Local" (with some filtering/massaging),
until proven wrong until proven wrong
""" """
def __init__(self, em): def __init__(self, rclchm):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.contents = [] self.em = rclchm.em
self.em = em self.rclchm = rclchm
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
#self.em.rclog("Beginning of a %s tag" % tag) #self.em.rclog("Beginning of a %s tag" % tag)
@ -86,47 +109,31 @@ class ChmTopicsParser(HTMLParser):
if len(ll) == 1: if len(ll) == 1:
localpath = value localpath = value
elif len(ll) == 4 and ll[-1] and ll[-3]: elif len(ll) == 4 and ll[-1] and ll[-3]:
#self.em.rclog("File: %s" % ll[-3]) #self.em.rclog("File: [%s] sfn [%s]" % ((ll[-3]), self.rclchm.sfn))
if ll[-3] == self.fname: # We used to test against the simple file name, but this does
localpath = ll[-1] # not work if the file is renamed. Just check that the internal
else: # path resolves. Old: if ll[-3] == self.rclchm.sfn:
localpath = ll[-1]
if not peekfile(self.rclchm.chm, localpath):
#self.em.rclog("SKIPPING %s" % ll[-3]) #self.em.rclog("SKIPPING %s" % ll[-3])
pass localpath = ""
if len(localpath) != 0 and localpath.find("#") == -1: if len(localpath) != 0 and localpath.find("#") == -1:
if localpath[0] != '/': if localpath[0] != '/':
localpath = "/" + localpath localpath = "/" + localpath
self.contents.append(localpath) self.rclchm.contents.append(localpath)
def reset(self):
self.contents = []
self.fname = ""
HTMLParser.reset(self)
def setname(self, name):
self.fname = name
def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
#print "ResolveObject failed", path
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
print "RetrieveObject failed", path
return ""
return doc
# Used when there is no Topics node. Walk the links tree
class ChmWalker(HTMLParser): class ChmWalker(HTMLParser):
"""Links tree walker. This recursivelyfollows all internal links """Links tree walker. This recursively follows all internal links
found in the from the top node given as input, and augments the contents found in the tree from the top node given as input, and augments
list.""" the contents list."""
def __init__(self, chm, path, contents): def __init__(self, rclchm, path, contents):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.chm = chm self.rclchm = rclchm
self.chm = rclchm.chm
self.contents = contents self.contents = contents
self.path = posixpath.normpath(path) self.path = posixpath.normpath(path)
self.dir = posixpath.dirname(self.path) self.dir = posixpath.dirname(self.path)
@ -146,9 +153,13 @@ class ChmWalker(HTMLParser):
if (not res.scheme or res.scheme.lower == "ms-its"): if (not res.scheme or res.scheme.lower == "ms-its"):
path = res.path path = res.path
lpath = path.split(':') lpath = path.split(':')
if len(lpath) == 3 and lpath[1] == cefilename: if len(lpath) == 3:
# MS-ITS::somefile.chm:/some/path/file.htm ? # MS-ITS::somefile.chm:/some/path/file.htm ? As far as I
# know this never happens because there was a runtime error
# in this path
path = lpath[2] path = lpath[2]
if not peekfile(self.chm, path):
path = ""
elif len(lpath) == 1: elif len(lpath) == 1:
path = lpath[0] path = lpath[0]
else: else:
@ -166,7 +177,7 @@ class ChmWalker(HTMLParser):
text = getfile(self.chm, npath) text = getfile(self.chm, npath)
if text: if text:
try: try:
newwalker = ChmWalker(self.chm, npath, self.contents) newwalker = ChmWalker(self.rclchm, npath, self.contents)
newwalker.feed(text) newwalker.feed(text)
except: except:
pass pass
@ -177,8 +188,8 @@ class rclCHM:
one by one. The ipath is the node path""" one by one. The ipath is the node path"""
def __init__(self, em): def __init__(self, em):
self.contents = []
self.chm = chm.CHMFile() self.chm = chm.CHMFile()
self.tp = ChmTopicsParser(em)
self.currentindex = 0 self.currentindex = 0
self.em = em self.em = em
if rclchm_catenate: if rclchm_catenate:
@ -189,9 +200,9 @@ class rclCHM:
def extractone(self, path): def extractone(self, path):
"""Extract one path-named internal file from the chm file""" """Extract one path-named internal file from the chm file"""
#self.em.rclog("extractone: [%s]"%(path)) #self.em.rclog("extractone: [%s]" % (path,))
iseof = rclexecm.RclExecM.noteof iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.tp.contents) -1: if self.currentindex >= len(self.contents) -1:
iseof = rclexecm.RclExecM.eofnext iseof = rclexecm.RclExecM.eofnext
res, ui = self.chm.ResolveObject(path) res, ui = self.chm.ResolveObject(path)
@ -210,7 +221,7 @@ class rclCHM:
def dumpall(self): def dumpall(self):
alltxt="" alltxt=""
for pth in self.tp.contents: for pth in self.contents:
ret,doc,path,iseof = self.extractone(pth) ret,doc,path,iseof = self.extractone(pth)
if not ret: if not ret:
continue continue
@ -230,22 +241,25 @@ class rclCHM:
parsing the Topics object""" parsing the Topics object"""
self.currentindex = 0 self.currentindex = 0
self.tp.reset() self.contents = []
filename = params["filename:"] filename = params["filename:"]
if not self.chm.LoadCHM(filename): if not self.chm.LoadCHM(filename):
self.em.rclog("LoadCHM failed") self.em.rclog("LoadCHM failed")
return False return False
if not self.chm.GetArchiveInfo():
self.em.rclog("GetArchiveInfo failed") self.sfn = os.path.basename(filename)
return False
#self.em.rclog("home [%s] topics [%s] title [%s]" %
# (self.chm.home, self.chm.topics, self.chm.title))
self.topics = self.chm.GetTopicsTree() self.topics = self.chm.GetTopicsTree()
if self.topics: if self.topics:
# Parse Topics file and extract list of internal nodes # Parse Topics file and extract list of internal nodes
#self.em.rclog("Got topics"); #self.em.rclog("Got topics");
self.tp.setname(os.path.basename(filename)) tp = ChmTopicsParser(self)
self.tp.feed(self.topics) tp.feed(self.topics)
self.tp.close() tp.close()
else: else:
# No topics. If there is a home, let's try to walk the tree # No topics. If there is a home, let's try to walk the tree
#self.em.rclog("GetTopicsTree failed") #self.em.rclog("GetTopicsTree failed")
@ -259,13 +273,13 @@ class rclCHM:
if not text: if not text:
self.em.rclog("No topics and no home content") self.em.rclog("No topics and no home content")
return False return False
walker = ChmWalker(self.chm, self.chm.home, self.tp.contents) walker = ChmWalker(self, self.chm.home, self.contents)
walker.feed(text) walker.feed(text)
walker.close() walker.close()
#self.em.rclog("Contents size %d" % len(self.tp.contents)) #self.em.rclog("Contents size %d" % len(self.contents))
uniq = set(self.tp.contents) uniq = set(self.contents)
self.tp.contents = list(uniq) self.contents = list(uniq)
return True return True
def getipath(self, params): def getipath(self, params):
@ -279,10 +293,10 @@ class rclCHM:
else: else:
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
if self.currentindex >= len(self.tp.contents): if self.currentindex >= len(self.contents):
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
else: else:
ret= self.extractone(self.tp.contents[self.currentindex]) ret= self.extractone(self.contents[self.currentindex])
self.currentindex += 1 self.currentindex += 1
return ret return ret