ensure chm file can be renamed

This commit is contained in:
Jean-Francois Dockes 2012-10-12 13:34:56 +02:00
parent d4edbbaedb
commit 7fcb7c9bf7

View File

@ -32,10 +32,33 @@ except:
print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
sys.exit(1);
# Small helper routines
def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
#print "ResolveObject failed", path
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
print "RetrieveObject failed", path
return ""
return doc
def peekfile(chmfile, path):
    """Tell whether path designates an existing object inside the chm.

    Only ResolveObject() is called; the object content is not retrieved.
    Returns True if the resolve status is chmlib.CHM_RESOLVE_SUCCESS,
    False otherwise.
    """
    status, _ = chmfile.ResolveObject(path)
    return status == chmlib.CHM_RESOLVE_SUCCESS
# CHM Topics tree handler
class ChmTopicsParser(HTMLParser):
"""Parse the chm's Topic file which is basically
a listing of internal nodes (html files mostly). Build a list of
all nodes (self.contents), which will then be used to walk and index
all nodes (parent.contents), which will then be used to walk and index
the chm.
Most nodes in the Topic file look like the following:
@ -49,10 +72,10 @@ class ChmTopicsParser(HTMLParser):
all values for parameters named "Local" (with some filtering/massaging),
until proven wrong
"""
def __init__(self, em):
def __init__(self, rclchm):
HTMLParser.__init__(self)
self.contents = []
self.em = em
self.em = rclchm.em
self.rclchm = rclchm
def handle_starttag(self, tag, attrs):
#self.em.rclog("Beginning of a %s tag" % tag)
@ -86,47 +109,31 @@ class ChmTopicsParser(HTMLParser):
if len(ll) == 1:
localpath = value
elif len(ll) == 4 and ll[-1] and ll[-3]:
#self.em.rclog("File: %s" % ll[-3])
if ll[-3] == self.fname:
localpath = ll[-1]
else:
#self.em.rclog("File: [%s] sfn [%s]" % ((ll[-3]), self.rclchm.sfn))
# We used to test against the simple file name, but this does
# not work if the file is renamed. Just check that the internal
# path resolves. Old: if ll[-3] == self.rclchm.sfn:
localpath = ll[-1]
if not peekfile(self.rclchm.chm, localpath):
#self.em.rclog("SKIPPING %s" % ll[-3])
pass
localpath = ""
if len(localpath) != 0 and localpath.find("#") == -1:
if localpath[0] != '/':
localpath = "/" + localpath
self.contents.append(localpath)
def reset(self):
self.contents = []
self.fname = ""
HTMLParser.reset(self)
def setname(self, name):
self.fname = name
def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
#print "ResolveObject failed", path
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
print "RetrieveObject failed", path
return ""
return doc
self.rclchm.contents.append(localpath)
# Used when there is no Topics node. Walk the links tree
class ChmWalker(HTMLParser):
"""Links tree walker. This recursivelyfollows all internal links
found in the from the top node given as input, and augments the contents
list."""
"""Links tree walker. This recursively follows all internal links
found in the tree from the top node given as input, and augments
the contents list."""
def __init__(self, chm, path, contents):
def __init__(self, rclchm, path, contents):
HTMLParser.__init__(self)
self.chm = chm
self.rclchm = rclchm
self.chm = rclchm.chm
self.contents = contents
self.path = posixpath.normpath(path)
self.dir = posixpath.dirname(self.path)
@ -146,9 +153,13 @@ class ChmWalker(HTMLParser):
if (not res.scheme or res.scheme.lower == "ms-its"):
path = res.path
lpath = path.split(':')
if len(lpath) == 3 and lpath[1] == cefilename:
# MS-ITS::somefile.chm:/some/path/file.htm ?
if len(lpath) == 3:
# MS-ITS::somefile.chm:/some/path/file.htm ? As far as I
# know this never happens because there was a runtime error
# in this path
path = lpath[2]
if not peekfile(self.chm, path):
path = ""
elif len(lpath) == 1:
path = lpath[0]
else:
@ -166,7 +177,7 @@ class ChmWalker(HTMLParser):
text = getfile(self.chm, npath)
if text:
try:
newwalker = ChmWalker(self.chm, npath, self.contents)
newwalker = ChmWalker(self.rclchm, npath, self.contents)
newwalker.feed(text)
except:
pass
@ -177,8 +188,8 @@ class rclCHM:
one by one. The ipath is the node path"""
def __init__(self, em):
self.contents = []
self.chm = chm.CHMFile()
self.tp = ChmTopicsParser(em)
self.currentindex = 0
self.em = em
if rclchm_catenate:
@ -189,9 +200,9 @@ class rclCHM:
def extractone(self, path):
"""Extract one path-named internal file from the chm file"""
#self.em.rclog("extractone: [%s]"%(path))
#self.em.rclog("extractone: [%s]" % (path,))
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.tp.contents) -1:
if self.currentindex >= len(self.contents) -1:
iseof = rclexecm.RclExecM.eofnext
res, ui = self.chm.ResolveObject(path)
@ -210,7 +221,7 @@ class rclCHM:
def dumpall(self):
alltxt=""
for pth in self.tp.contents:
for pth in self.contents:
ret,doc,path,iseof = self.extractone(pth)
if not ret:
continue
@ -230,22 +241,25 @@ class rclCHM:
parsing the Topics object"""
self.currentindex = 0
self.tp.reset()
self.contents = []
filename = params["filename:"]
if not self.chm.LoadCHM(filename):
self.em.rclog("LoadCHM failed")
return False
if not self.chm.GetArchiveInfo():
self.em.rclog("GetArchiveInfo failed")
return False
self.sfn = os.path.basename(filename)
#self.em.rclog("home [%s] topics [%s] title [%s]" %
# (self.chm.home, self.chm.topics, self.chm.title))
self.topics = self.chm.GetTopicsTree()
if self.topics:
# Parse Topics file and extract list of internal nodes
#self.em.rclog("Got topics");
self.tp.setname(os.path.basename(filename))
self.tp.feed(self.topics)
self.tp.close()
tp = ChmTopicsParser(self)
tp.feed(self.topics)
tp.close()
else:
# No topics. If there is a home, let's try to walk the tree
#self.em.rclog("GetTopicsTree failed")
@ -259,13 +273,13 @@ class rclCHM:
if not text:
self.em.rclog("No topics and no home content")
return False
walker = ChmWalker(self.chm, self.chm.home, self.tp.contents)
walker = ChmWalker(self, self.chm.home, self.contents)
walker.feed(text)
walker.close()
#self.em.rclog("Contents size %d" % len(self.tp.contents))
uniq = set(self.tp.contents)
self.tp.contents = list(uniq)
#self.em.rclog("Contents size %d" % len(self.contents))
uniq = set(self.contents)
self.contents = list(uniq)
return True
def getipath(self, params):
@ -279,10 +293,10 @@ class rclCHM:
else:
return (False, "", "", rclexecm.RclExecM.eofnow)
if self.currentindex >= len(self.tp.contents):
if self.currentindex >= len(self.contents):
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(self.tp.contents[self.currentindex])
ret= self.extractone(self.contents[self.currentindex])
self.currentindex += 1
return ret