ensure chm file can be renamed

2012-10-12 13:34:56 +02:00 · 2012-10-12 13:34:56 +02:00 · 7fcb7c9bf7
commit 7fcb7c9bf7
parent d4edbbaedb
1 changed file with 69 additions and 55 deletions
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@@ -32,10 +32,33 @@ except:
    print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
    sys.exit(1);

+# Small helper routines
+def getfile(chmfile, path):
+    """Extract internal file text from chm object, given path"""
+    res, ui = chmfile.ResolveObject(path)
+    if res != chmlib.CHM_RESOLVE_SUCCESS:
+        #print "ResolveObject failed", path
+        return ""
+    res, doc = chmfile.RetrieveObject(ui)
+    if not res:
+        print "RetrieveObject failed", path
+        return ""
+    return doc
+
+def peekfile(chmfile, path):
+    """Check that path resolves in chm object"""
+    res, ui = chmfile.ResolveObject(path)
+    if res != chmlib.CHM_RESOLVE_SUCCESS:
+        return False
+    return True
+
+
+# CHM Topics tree handler
+
 class ChmTopicsParser(HTMLParser):
    """Parse the chm's Topic file which is basically
    a listing of internal nodes (html files mostly). Build a list of
-    all nodes (self.contents), which will then be used to walk and index
+    all nodes (parent.contents), which will then be used to walk and index
    the chm.

    Most nodes in the Topic file look like the following:
@@ -49,10 +72,10 @@ class ChmTopicsParser(HTMLParser):
    all values for parameters named "Local" (with some filtering/massaging),
    until proven wrong
    """
-    def __init__(self, em):
+    def __init__(self, rclchm):
        HTMLParser.__init__(self)
-        self.contents = []
-        self.em = em
+        self.em = rclchm.em
+        self.rclchm = rclchm
        
    def handle_starttag(self, tag, attrs):
        #self.em.rclog("Beginning of a %s tag" % tag)
@@ -86,47 +109,31 @@ class ChmTopicsParser(HTMLParser):
        if len(ll) == 1:
            localpath = value
        elif len(ll) == 4 and ll[-1] and ll[-3]:
-            #self.em.rclog("File: %s" % ll[-3])
-            if ll[-3] == self.fname:
-                localpath = ll[-1]
-            else:
+            #self.em.rclog("File: [%s] sfn [%s]" % ((ll[-3]), self.rclchm.sfn))
+            # We used to test against the simple file name, but this does
+            # not work if the file is renamed. Just check that the internal
+            # path resolves. Old: if ll[-3] == self.rclchm.sfn:
+            localpath = ll[-1]
+            if not peekfile(self.rclchm.chm, localpath):
                #self.em.rclog("SKIPPING %s" % ll[-3])
-                pass
+                localpath = ""

        if len(localpath) != 0 and  localpath.find("#") == -1:
            if localpath[0] != '/':
                localpath = "/" + localpath
-            self.contents.append(localpath)
-
-    def reset(self):
-        self.contents = []
-        self.fname = ""
-        HTMLParser.reset(self)
-
-    def setname(self, name):
-        self.fname = name
-
-def getfile(chmfile, path):
-    """Extract internal file text from chm object, given path"""
-    res, ui = chmfile.ResolveObject(path)
-    if res != chmlib.CHM_RESOLVE_SUCCESS:
-        #print "ResolveObject failed", path
-        return ""
-    res, doc = chmfile.RetrieveObject(ui)
-    if not res:
-        print "RetrieveObject failed", path
-        return ""
-    return doc
+            self.rclchm.contents.append(localpath)


+# Used when there is no Topics node. Walk the links tree
 class ChmWalker(HTMLParser):
-    """Links tree walker. This recursivelyfollows all internal links
-    found in the from the top node given as input, and augments the contents
-    list."""
+    """Links tree walker. This recursively follows all internal links
+    found in the tree from the top node given as input, and augments
+    the contents list."""

-    def __init__(self, chm, path, contents):
+    def __init__(self, rclchm, path, contents):
        HTMLParser.__init__(self)
-        self.chm = chm
+        self.rclchm = rclchm
+        self.chm = rclchm.chm
        self.contents = contents
        self.path = posixpath.normpath(path)
        self.dir = posixpath.dirname(self.path)
@@ -146,9 +153,13 @@ class ChmWalker(HTMLParser):
        if (not res.scheme or res.scheme.lower == "ms-its"):
            path = res.path
            lpath = path.split(':')
-            if len(lpath) == 3 and lpath[1] == cefilename:
-                # MS-ITS::somefile.chm:/some/path/file.htm ?
+            if len(lpath) == 3:
+                # MS-ITS::somefile.chm:/some/path/file.htm ? As far as I
+                # know this never happens because there was a runtime error
+                # in this path
                path = lpath[2]
+                if not peekfile(self.chm, path):
+                    path = ""
            elif len(lpath) == 1:
                path = lpath[0]
            else:
@@ -166,7 +177,7 @@ class ChmWalker(HTMLParser):
                text = getfile(self.chm, npath)
                if text:
                    try:
-                        newwalker = ChmWalker(self.chm, npath, self.contents)
+                        newwalker = ChmWalker(self.rclchm, npath, self.contents)
                        newwalker.feed(text)
                    except:
                        pass
@@ -177,8 +188,8 @@ class rclCHM:
    one by one. The ipath is the node path"""

    def __init__(self, em):
+        self.contents = []
        self.chm = chm.CHMFile()
-        self.tp = ChmTopicsParser(em)
        self.currentindex = 0
        self.em = em
        if rclchm_catenate:
@@ -189,9 +200,9 @@ class rclCHM:
    def extractone(self, path):
        """Extract one path-named internal file from the chm file"""

-        #self.em.rclog("extractone: [%s]"%(path))
+        #self.em.rclog("extractone: [%s]" % (path,))
        iseof = rclexecm.RclExecM.noteof
-        if self.currentindex >= len(self.tp.contents) -1:
+        if self.currentindex >= len(self.contents) -1:
            iseof = rclexecm.RclExecM.eofnext

        res, ui = self.chm.ResolveObject(path)
@@ -210,7 +221,7 @@ class rclCHM:

    def dumpall(self):
        alltxt=""
-        for pth in self.tp.contents:
+        for pth in self.contents:
            ret,doc,path,iseof = self.extractone(pth)
            if not ret:
                continue
@@ -230,22 +241,25 @@ class rclCHM:
        parsing the Topics object"""

        self.currentindex = 0
-        self.tp.reset()
+        self.contents = []
+        
        filename = params["filename:"]
        if not self.chm.LoadCHM(filename):
            self.em.rclog("LoadCHM failed")
            return False
-        if not self.chm.GetArchiveInfo():
-            self.em.rclog("GetArchiveInfo failed")
-            return False
+
+        self.sfn = os.path.basename(filename)
+
+        #self.em.rclog("home [%s] topics [%s] title [%s]" %
+        #              (self.chm.home, self.chm.topics, self.chm.title))

        self.topics = self.chm.GetTopicsTree()
        if self.topics:
            # Parse Topics file and extract list of internal nodes
            #self.em.rclog("Got topics");
-            self.tp.setname(os.path.basename(filename))
-            self.tp.feed(self.topics)
-            self.tp.close()
+            tp = ChmTopicsParser(self)
+            tp.feed(self.topics)
+            tp.close()
        else:
            # No topics. If there is a home, let's try to walk the tree
            #self.em.rclog("GetTopicsTree failed")
@@ -259,13 +273,13 @@ class rclCHM:
            if not text:
                self.em.rclog("No topics and no home content")
                return False
-            walker = ChmWalker(self.chm, self.chm.home, self.tp.contents)
+            walker = ChmWalker(self, self.chm.home, self.contents)
            walker.feed(text)
            walker.close()

-        #self.em.rclog("Contents size %d" % len(self.tp.contents))
-        uniq = set(self.tp.contents)
-        self.tp.contents = list(uniq)
+        #self.em.rclog("Contents size %d" % len(self.contents))
+        uniq = set(self.contents)
+        self.contents = list(uniq)
        return True
    
    def getipath(self, params):
@@ -279,10 +293,10 @@ class rclCHM:
            else:
                return (False, "", "", rclexecm.RclExecM.eofnow)

-        if self.currentindex >= len(self.tp.contents):
+        if self.currentindex >= len(self.contents):
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
-            ret= self.extractone(self.tp.contents[self.currentindex])
+            ret= self.extractone(self.contents[self.currentindex])
            self.currentindex += 1
            return ret