From 544e687afe1a22261fa0d31096cae7ba9b88e1d7 Mon Sep 17 00:00:00 2001
From: "\"Jean-Francois Dockes ext:(%22)" <jfd@recoll.org>
Date: Tue, 3 Apr 2012 17:29:01 +0200
Subject: [PATCH] rclchm: add concatenating mode

---
 src/filters/rclchm | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/src/filters/rclchm b/src/filters/rclchm
index 6be74113..b78a1cd4 100755
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@@ -2,11 +2,20 @@
 """Extract Html files from a Microsoft Compiled Html Help file (.chm)
 Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
 
+# Do we return individual chapters as html pages or concatenate everything?
+rclchm_catenate = 0
+# Use a special html mime type to allow for mimeconf/mimeview Open magic,
+# or go the regular html way with text/html.
+#rclchm_html_mtype = "text/x-chm-html"
+rclchm_html_mtype = "text/html"
+
 import sys
 import os
 import posixpath
 import urlparse
 import urllib
+if rclchm_catenate:
+    import subprocess
 
 import rclexecm
 
@@ -171,7 +180,11 @@ class rclCHM:
         self.tp = ChmTopicsParser(em)
         self.currentindex = 0
         self.em = em
-        
+        if rclchm_catenate:
+            self.em.setmimetype("text/plain")
+        else:
+            self.em.setmimetype(rclchm_html_mtype)
+
     def extractone(self, path):
         """Extract one path-named internal file from the chm file"""
 
@@ -188,10 +201,27 @@ class rclCHM:
         res, doc = self.chm.RetrieveObject(ui)
         #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc))
         if res > 0:
-            self.em.setmimetype("text/html")
+            self.em.setmimetype(rclchm_html_mtype)
             return (True, doc, path, iseof)
         return (False, "", path, iseof)
-    
+
+    def dumpall(self):
+        """Extract every topic page, convert each one to plain text with
+        lynx, and return the concatenation ("" if nothing was extracted)."""
+        # One lynx process per page: it reads html on stdin, dumps text.
+        cmd = ["lynx", "-stdin", "-dump", "-nolist",
+               "-display_charset=utf8", "-force_html"]
+        chunks = []
+        for pth in self.tp.contents:
+            ret, doc, path, iseof = self.extractone(pth)
+            if not ret:
+                continue
+            process = subprocess.Popen(cmd, stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE)
+            txt, err = process.communicate(doc)
+            chunks.append(txt)
+        return "".join(chunks)
+
     def openfile(self, params):
         """Open the chm file and build the contents list by extracting and
         parsing the Topics object"""
@@ -239,6 +269,13 @@ class rclCHM:
         return self.extractone(params["ipath:"])
         
     def getnext(self, params):
+        if rclchm_catenate:
+            alltxt = self.dumpall()
+            if alltxt:
+                return (True, alltxt, "", rclexecm.RclExecM.eofnext)
+            else:
+                return (False, "", "", rclexecm.RclExecM.eofnow)
+
         if self.currentindex >= len(self.tp.contents):
             return (False, "", "", rclexecm.RclExecM.eofnow)
         else: