From 63ac7f6458778cdbbae4386ee4cd5b97f20c8508 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Sat, 24 Oct 2009 06:37:00 +0000
Subject: [PATCH] cleanup

---
 src/filters/rclchm | 78 ++++++++++++++++++++++++++++++++++++----------
 src/filters/rclics |  2 +-
 src/filters/rclzip |  1 -
 3 files changed, 62 insertions(+), 19 deletions(-)
diff --git a/src/filters/rclchm b/src/filters/rclchm
index 7dc3a7df..41dfe13f 100755
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@@ -1,60 +1,98 @@
 #!/usr/bin/env python
+"""Extract HTML files from a Microsoft Compiled HTML Help file (.chm).
+Needs at least Python 2.2 for HTMLParser (chmlib needs 2.2 too)."""
 
 import sys
+import os
 import rclexecm
 from chm import chm,chmlib
 from HTMLParser import HTMLParser
 
 class ChmTopicsParser(HTMLParser):
-    """Use HTMLParser to parse the chm's Topic file which is basically
+    """Parse the chm's Topic file which is basically
     a listing of internal nodes (html files mostly). Build a list of
     all nodes (self.contents), which will then be used to walk and index
     the chm.
+
     Most nodes in the Topic file look like the following:
     <LI> <OBJECT type="text/sitemap">
            <param name="Name" value="Global Module Index">
            <param name="Local" value="modindex.html">
           </OBJECT>
+
     Maybe we should filter out non "text/sitemap" Objects, and maybe there are
     things of interest whose name is not Local, but for now, we just take
-    all values for parameters named "Local", and this seems to work ok.
+    all values for parameters named "Local" (with some filtering/massaging),
+    until this approach is proven wrong.
     """
     def __init__(self):
         HTMLParser.__init__(self)
         self.contents = []
 
     def handle_starttag(self, tag, attrs):
+        #print >> sys.stderr, "Encountered the beginning of a %s tag" % tag
         # If this is a param tag with name Local, we're interested in
-        # the value which lists an internal file. Discard those with #
+        # the value which lists a file ref. Discard those with #
         # in them (references inside files)
-        if tag == 'param':
-            name = ''
-            for (nm,val) in attrs:
-                if nm == 'name':
-                    name = val
-                if nm == 'value':
-                    value = val.encode('utf-8')
-            if name == 'Local':
-                if value.find("#") == -1:
-                    self.contents.append(value)
+        # Some refs look like Vendor:filename::path. For those we keep only
+        # the path, and only if filename matches the chm file being processed
 
+        if tag != 'param':
+            return
 
+        name = ''
+        value = ''
+        for (nm,val) in attrs:
+            if nm == 'name':
+                name = val
+            if nm == 'value':
+                value = val
+
+        if name != 'Local' or value == '':
+            return
+
+        localpath = ""
+        ll = value.split(":")
+        if len(ll) == 1:
+            localpath = value
+        elif len(ll) == 4 and ll[-1] and ll[-3]:
+            #print >>sys.stderr, "File: %s" % ll[-3]
+            if ll[-3] == self.fname:
+                localpath = ll[-1]
+            else:
+                #print >> sys.stderr, "SKIPPING %s" % ll[-3]
+                pass
+
+        if len(localpath) != 0 and  localpath.find("#") == -1:
+            if localpath[0] != '/':
+                localpath = "/" + localpath
+            self.contents.append(localpath)
+
+    def reset(self):
+        self.contents = []
+        self.fname = ""
+        HTMLParser.reset(self)
+
+    def setname(self, name):
+        self.fname = name
+        
 class rclCHM:
-    """RclExecM slave worker for extracting all files from an Msoft chm (.ics)
+    """RclExecM slave worker for extracting all files from an Msoft chm
     file. We first extract the list of internal nodes, and them return them
     one by one. The ipath is the node path"""
+
     def __init__(self, em):
         self.chm = chm.CHMFile()
         self.tp = ChmTopicsParser()
         self.currentindex = 0
         self.em = em
-
         
     def extractone(self, path):
         """Extract one path-named internal file from the chm file"""
+
         #self.em.rclog("extractone: [%s]"%(path))
         eof = (self.currentindex >= len(self.tp.contents) -1)
-        res, ui = self.chm.ResolveObject("/" + path)
+        res, ui = self.chm.ResolveObject(path)
         #self.em.rclog("extract: ResolveO: %d [%s]" % (res, ui))
         if res != chmlib.CHM_RESOLVE_SUCCESS:
             return (False, "", path, eof)
@@ -68,15 +106,21 @@ class rclCHM:
     def openfile(self, params):
         """Open the chm file and build the contents list by extracting and
         parsing the Topics object"""
-        self.chm.LoadCHM(params["filename:"])
+
+        self.currentindex = 0
+        self.tp.reset()
+        filename = params["filename:"]
+        self.chm.LoadCHM(filename)
         self.chm.GetArchiveInfo()
         self.topics = self.chm.GetTopicsTree()
         if self.topics == None:
             return False
         #self.em.rclog(self.topics)
         # Parse Topics file and extract list of internal nodes
+        self.tp.setname(os.path.basename(filename))
         self.tp.feed(self.topics)
         self.tp.close()
+        #self.em.rclog("Contents size %d" % len(self.tp.contents))
         return True
 
     def getipath(self, params):
diff --git a/src/filters/rclics b/src/filters/rclics
index bf2f9013..116f0928 100755
--- a/src/filters/rclics
+++ b/src/filters/rclics
@@ -3,10 +3,10 @@
 import rclexecm
 from icalendar import Calendar, Event
 
-
 class IcalExtractor:
     def __init__(self, em):
         self.file = ""
+        self.contents = []
         self.em = em
         em.setmimetype("text/plain")
 
diff --git a/src/filters/rclzip b/src/filters/rclzip
index c7966107..9562f9df 100755
--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@@ -2,7 +2,6 @@
 
 # Zip file filter for Recoll
 
-import os
 import rclexecm
 from zipfile import ZipFile, error