From 544e687afe1a22261fa0d31096cae7ba9b88e1d7 Mon Sep 17 00:00:00 2001 From: "\"Jean-Francois Dockes ext:(%22)" Date: Tue, 3 Apr 2012 17:29:01 +0200 Subject: [PATCH] rclchm: add concatenating mode --- src/filters/rclchm | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/filters/rclchm b/src/filters/rclchm index 6be74113..b78a1cd4 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -2,11 +2,20 @@ """Extract Html files from a Microsoft Compiled Html Help file (.chm) Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" +# Do we return individual chapters as html pages or concatenate everything? +rclchm_catenate = 0 +# Use special html type to allow for mimeconf/mimeview Open magic, +# Or go the regular html way with text/html +#rclchm_html_mtype = "text/x-chm-html" +rclchm_html_mtype = "text/html" + import sys import os import posixpath import urlparse import urllib +if rclchm_catenate: + import subprocess import rclexecm @@ -171,7 +180,11 @@ class rclCHM: self.tp = ChmTopicsParser(em) self.currentindex = 0 self.em = em - + if rclchm_catenate: + self.em.setmimetype("text/plain") + else: + self.em.setmimetype(rclchm_html_mtype) + def extractone(self, path): """Extract one path-named internal file from the chm file""" @@ -188,10 +201,27 @@ class rclCHM: res, doc = self.chm.RetrieveObject(ui) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) if res > 0: - self.em.setmimetype("text/html") + self.em.setmimetype(rclchm_html_mtype) return (True, doc, path, iseof) return (False, "", path, iseof) - + + def dumpall(self): + alltxt="" + for pth in self.tp.contents: + ret,doc,path,iseof = self.extractone(pth) + if not ret: + continue + # Feed doc to lynx + process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist", + "-display_charset=utf8", + "-force_html"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE + ) + txt,err = process.communicate(doc) + alltxt += txt + return alltxt + def openfile(self, params): """Open the chm file and build the contents list by extracting and parsing the Topics object""" @@ -239,6 +269,13 @@ class rclCHM: return self.extractone(params["ipath:"]) def getnext(self, params): + if rclchm_catenate: + alltxt = self.dumpall() + if alltxt: + return (True, alltxt, "", rclexecm.RclExecM.eofnext) + else: + return (False, "", "", rclexecm.RclExecM.eofnow) + if self.currentindex >= len(self.tp.contents): return (False, "", "", rclexecm.RclExecM.eofnow) else: