rclchm: add concatenating mode

This commit is contained in:
"Jean-Francois Dockes ext:(%22) 2012-04-03 17:29:01 +02:00
parent 5f9095b472
commit 544e687afe

View File

@ -2,11 +2,20 @@
"""Extract Html files from a Microsoft Compiled Html Help file (.chm) """Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
# Do we return individual chapters as html pages or concatenate everything?
rclchm_catenate = 0
# Use special html type to allow for mimeconf/mimeview Open magic,
# Or go the regular html way with text/html
#rclchm_html_mtype = "text/x-chm-html"
rclchm_html_mtype = "text/html"
import sys import sys
import os import os
import posixpath import posixpath
import urlparse import urlparse
import urllib import urllib
if rclchm_catenate:
import subprocess
import rclexecm import rclexecm
@ -171,7 +180,11 @@ class rclCHM:
self.tp = ChmTopicsParser(em) self.tp = ChmTopicsParser(em)
self.currentindex = 0 self.currentindex = 0
self.em = em self.em = em
if rclchm_catenate:
self.em.setmimetype("text/plain")
else:
self.em.setmimetype(rclchm_html_mtype)
def extractone(self, path): def extractone(self, path):
"""Extract one path-named internal file from the chm file""" """Extract one path-named internal file from the chm file"""
@ -188,10 +201,27 @@ class rclCHM:
res, doc = self.chm.RetrieveObject(ui) res, doc = self.chm.RetrieveObject(ui)
#self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc))
if res > 0: if res > 0:
self.em.setmimetype("text/html") self.em.setmimetype(rclchm_html_mtype)
return (True, doc, path, iseof) return (True, doc, path, iseof)
return (False, "", path, iseof) return (False, "", path, iseof)
def dumpall(self):
    """Extract every topic in the contents list and return it all as text.

    Each path in self.tp.contents is extracted with extractone(). Every
    document that is successfully extracted is fed to lynx on its standard
    input (-stdin -dump -force_html), and the lynx outputs are concatenated
    in contents order. Paths for which extractone() reports failure are
    skipped.

    Returns the concatenated lynx output, or an empty string when no
    topic could be extracted.
    """
    chunks = []
    for pth in self.tp.contents:
        # extractone() returns a 4-tuple; only the success flag and the
        # document data are needed here.
        ret, doc, _path, _iseof = self.extractone(pth)
        if not ret:
            continue
        # Feed doc to lynx
        process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
                                    "-display_charset=utf8",
                                    "-force_html"],
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half of the
        # communicate() result carries data.
        chunks.append(process.communicate(doc)[0])
    # Join once at the end instead of growing a string with += per topic.
    return "".join(chunks)
def openfile(self, params): def openfile(self, params):
"""Open the chm file and build the contents list by extracting and """Open the chm file and build the contents list by extracting and
parsing the Topics object""" parsing the Topics object"""
@ -239,6 +269,13 @@ class rclCHM:
return self.extractone(params["ipath:"]) return self.extractone(params["ipath:"])
def getnext(self, params): def getnext(self, params):
if rclchm_catenate:
alltxt = self.dumpall()
if alltxt:
return (True, alltxt, "", rclexecm.RclExecM.eofnext)
else:
return (False, "", "", rclexecm.RclExecM.eofnow)
if self.currentindex >= len(self.tp.contents): if self.currentindex >= len(self.tp.contents):
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
else: else: