From 7fcb7c9bf7ef5671d7692acf6724e5859e1c652d Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Fri, 12 Oct 2012 13:34:56 +0200
Subject: [PATCH] ensure chm file can be renamed

---
 src/filters/rclchm | 124 +++++++++++++++++++++++++--------------------
 1 file changed, 69 insertions(+), 55 deletions(-)

diff --git a/src/filters/rclchm b/src/filters/rclchm
index 2bb78352..dff3b853 100755
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@@ -32,10 +32,33 @@ except:
     print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
     sys.exit(1);
 
+# Small helper routines
+def getfile(chmfile, path):
+    """Extract internal file text from chm object, given path"""
+    res, ui = chmfile.ResolveObject(path)
+    if res != chmlib.CHM_RESOLVE_SUCCESS:
+        #print "ResolveObject failed", path
+        return ""
+    res, doc = chmfile.RetrieveObject(ui)
+    if not res:
+        print "RetrieveObject failed", path
+        return ""
+    return doc
+
+def peekfile(chmfile, path):
+    """Check that path resolves in chm object"""
+    res, ui = chmfile.ResolveObject(path)
+    if res != chmlib.CHM_RESOLVE_SUCCESS:
+        return False
+    return True
+
+
+# CHM Topics tree handler
+
 class ChmTopicsParser(HTMLParser):
     """Parse the chm's Topic file which is basically
     a listing of internal nodes (html files mostly). Build a list of
-    all nodes (self.contents), which will then be used to walk and index
+    all nodes (parent.contents), which will then be used to walk and index
     the chm.
 
     Most nodes in the Topic file look like the following:
@@ -49,10 +72,10 @@ class ChmTopicsParser(HTMLParser):
     all values for parameters named "Local" (with some filtering/massaging),
     until proven wrong
     """
-    def __init__(self, em):
+    def __init__(self, rclchm):
         HTMLParser.__init__(self)
-        self.contents = []
-        self.em = em
+        self.em = rclchm.em
+        self.rclchm = rclchm
         
     def handle_starttag(self, tag, attrs):
         #self.em.rclog("Beginning of a %s tag" % tag)
@@ -86,47 +109,31 @@ class ChmTopicsParser(HTMLParser):
         if len(ll) == 1:
             localpath = value
         elif len(ll) == 4 and ll[-1] and ll[-3]:
-            #self.em.rclog("File: %s" % ll[-3])
-            if ll[-3] == self.fname:
-                localpath = ll[-1]
-            else:
+            #self.em.rclog("File: [%s] sfn [%s]" % ((ll[-3]), self.rclchm.sfn))
+            # We used to test against the simple file name, but this does
+            # not work if the file is renamed. Just check that the internal
+            # path resolves. Old: if ll[-3] == self.rclchm.sfn:
+            localpath = ll[-1]
+            if not peekfile(self.rclchm.chm, localpath):
                 #self.em.rclog("SKIPPING %s" % ll[-3])
-                pass
+                localpath = ""
 
         if len(localpath) != 0 and  localpath.find("#") == -1:
             if localpath[0] != '/':
                 localpath = "/" + localpath
-            self.contents.append(localpath)
-
-    def reset(self):
-        self.contents = []
-        self.fname = ""
-        HTMLParser.reset(self)
-
-    def setname(self, name):
-        self.fname = name
-
-def getfile(chmfile, path):
-    """Extract internal file text from chm object, given path"""
-    res, ui = chmfile.ResolveObject(path)
-    if res != chmlib.CHM_RESOLVE_SUCCESS:
-        #print "ResolveObject failed", path
-        return ""
-    res, doc = chmfile.RetrieveObject(ui)
-    if not res:
-        print "RetrieveObject failed", path
-        return ""
-    return doc
+            self.rclchm.contents.append(localpath)
 
 
+# Used when there is no Topics node. Walk the links tree
 class ChmWalker(HTMLParser):
-    """Links tree walker. This recursivelyfollows all internal links
-    found in the from the top node given as input, and augments the contents
-    list."""
+    """Links tree walker. This recursively follows all internal links
+    found in the tree from the top node given as input, and augments
+    the contents list."""
 
-    def __init__(self, chm, path, contents):
+    def __init__(self, rclchm, path, contents):
         HTMLParser.__init__(self)
-        self.chm = chm
+        self.rclchm = rclchm
+        self.chm = rclchm.chm
         self.contents = contents
         self.path = posixpath.normpath(path)
         self.dir = posixpath.dirname(self.path)
@@ -146,9 +153,13 @@ class ChmWalker(HTMLParser):
         if (not res.scheme or res.scheme.lower == "ms-its"):
             path = res.path
             lpath = path.split(':')
-            if len(lpath) == 3 and lpath[1] == cefilename:
-                # MS-ITS::somefile.chm:/some/path/file.htm ?
+            if len(lpath) == 3:
+                # MS-ITS::somefile.chm:/some/path/file.htm ? As far as I
+                # know this never happens, because the previous version had
+                # a runtime error in this code path
                 path = lpath[2]
+                if not peekfile(self.chm, path):
+                    path = ""
             elif len(lpath) == 1:
                 path = lpath[0]
             else:
@@ -166,7 +177,7 @@ class ChmWalker(HTMLParser):
                 text = getfile(self.chm, npath)
                 if text:
                     try:
-                        newwalker = ChmWalker(self.chm, npath, self.contents)
+                        newwalker = ChmWalker(self.rclchm, npath, self.contents)
                         newwalker.feed(text)
                     except:
                         pass
@@ -177,8 +188,8 @@ class rclCHM:
     one by one. The ipath is the node path"""
 
     def __init__(self, em):
+        self.contents = []
         self.chm = chm.CHMFile()
-        self.tp = ChmTopicsParser(em)
         self.currentindex = 0
         self.em = em
         if rclchm_catenate:
@@ -189,9 +200,9 @@ class rclCHM:
     def extractone(self, path):
         """Extract one path-named internal file from the chm file"""
 
-        #self.em.rclog("extractone: [%s]"%(path))
+        #self.em.rclog("extractone: [%s]" % (path,))
         iseof = rclexecm.RclExecM.noteof
-        if self.currentindex >= len(self.tp.contents) -1:
+        if self.currentindex >= len(self.contents) -1:
             iseof = rclexecm.RclExecM.eofnext
 
         res, ui = self.chm.ResolveObject(path)
@@ -210,7 +221,7 @@ class rclCHM:
 
     def dumpall(self):
         alltxt=""
-        for pth in self.tp.contents:
+        for pth in self.contents:
             ret,doc,path,iseof = self.extractone(pth)
             if not ret:
                 continue
@@ -230,22 +241,25 @@ class rclCHM:
         parsing the Topics object"""
 
         self.currentindex = 0
-        self.tp.reset()
+        self.contents = []
+        
         filename = params["filename:"]
         if not self.chm.LoadCHM(filename):
             self.em.rclog("LoadCHM failed")
             return False
-        if not self.chm.GetArchiveInfo():
-            self.em.rclog("GetArchiveInfo failed")
-            return False
+
+        self.sfn = os.path.basename(filename)
+
+        #self.em.rclog("home [%s] topics [%s] title [%s]" %
+        #              (self.chm.home, self.chm.topics, self.chm.title))
 
         self.topics = self.chm.GetTopicsTree()
         if self.topics:
             # Parse Topics file and extract list of internal nodes
             #self.em.rclog("Got topics");
-            self.tp.setname(os.path.basename(filename))
-            self.tp.feed(self.topics)
-            self.tp.close()
+            tp = ChmTopicsParser(self)
+            tp.feed(self.topics)
+            tp.close()
         else:
             # No topics. If there is a home, let's try to walk the tree
             #self.em.rclog("GetTopicsTree failed")
@@ -259,13 +273,13 @@ class rclCHM:
             if not text:
                 self.em.rclog("No topics and no home content")
                 return False
-            walker = ChmWalker(self.chm, self.chm.home, self.tp.contents)
+            walker = ChmWalker(self, self.chm.home, self.contents)
             walker.feed(text)
             walker.close()
 
-        #self.em.rclog("Contents size %d" % len(self.tp.contents))
-        uniq = set(self.tp.contents)
-        self.tp.contents = list(uniq)
+        #self.em.rclog("Contents size %d" % len(self.contents))
+        uniq = set(self.contents)
+        self.contents = list(uniq)
         return True
     
     def getipath(self, params):
@@ -279,10 +293,10 @@ class rclCHM:
             else:
                 return (False, "", "", rclexecm.RclExecM.eofnow)
 
-        if self.currentindex >= len(self.tp.contents):
+        if self.currentindex >= len(self.contents):
             return (False, "", "", rclexecm.RclExecM.eofnow)
         else:
-            ret= self.extractone(self.tp.contents[self.currentindex])
+            ret= self.extractone(self.contents[self.currentindex])
             self.currentindex += 1
             return ret