diff --git a/src/filters/rclchm b/src/filters/rclchm index b78a1cd4..2bb78352 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -11,6 +11,7 @@ rclchm_html_mtype = "text/html" import sys import os +import re import posixpath import urlparse import urllib @@ -201,6 +202,8 @@ class rclCHM: res, doc = self.chm.RetrieveObject(ui) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) if res > 0: + doc = re.sub('''[hH][eE][aA][dD]''', + '''''', doc) self.em.setmimetype(rclchm_html_mtype) return (True, doc, path, iseof) return (False, "", path, iseof) diff --git a/src/filters/rclepub b/src/filters/rclepub new file mode 100755 index 00000000..d1b9b39c --- /dev/null +++ b/src/filters/rclepub @@ -0,0 +1,74 @@ +#!/usr/bin/env python +"""Extract Html content from an EPUB file (.epub)""" + +rclepub_html_mtype = "text/html" + +import sys +import os +import re + +import rclexecm + +try: + import epub +except: + print "RECFILTERROR HELPERNOTFOUND python:epub" + sys.exit(1); + +class rclEPUB: + """RclExecM slave worker for extracting all text from an EPUB + file. We first extract the list of internal nodes, and then return them + one by one. 
The ipath is the internal href""" + + def __init__(self, em): + self.currentindex = 0 + self.em = em + self.em.setmimetype(rclepub_html_mtype) + + def extractone(self, path): + """Extract one path-named internal file from the EPUB file""" + + #self.em.rclog("extractone: [%s]"%(path)) + iseof = rclexecm.RclExecM.noteof + if self.currentindex >= len(self.contents) -1: + iseof = rclexecm.RclExecM.eofnext + + try: + doc = self.book.read(path) + doc = re.sub('''[hH][eE][aA][dD]''', + '''''', doc) + return (True, doc, path, iseof) + except Exception, err: + self.em.rclog("openfile: failed: [%s]" % err) + return (False, "", path, iseof) + + def openfile(self, params): + """Open the EPUB file""" + self.currentindex = 0 + self.contents = [] + try: + self.book = epub.open(params["filename:"]) + except Exception, err: + self.em.rclog("openfile: failed: [%s]" % err) + return False + for id, item in self.book.opf.manifest.iteritems(): +# print item.__dict__ + if item.media_type == u'application/xhtml+xml': + self.contents.append(item.href) + return True + + + def getipath(self, params): + return self.extractone(params["ipath:"]) + + def getnext(self, params): + if self.currentindex >= len(self.contents): + return (False, "", "", rclexecm.RclExecM.eofnow) + else: + ret= self.extractone(self.contents[self.currentindex]) + self.currentindex += 1 + return ret + +proto = rclexecm.RclExecM() +extract = rclEPUB(proto) +rclexecm.main(proto, extract) diff --git a/src/filters/rclinfo b/src/filters/rclinfo index 1ccc2a72..01ef47d9 100755 --- a/src/filters/rclinfo +++ b/src/filters/rclinfo @@ -20,8 +20,10 @@ htmltemplate = '''
+
+
%s
diff --git a/src/filters/rclshowchm b/src/filters/rclshowchm
new file mode 100755
index 00000000..a1ef8e06
--- /dev/null
+++ b/src/filters/rclshowchm
@@ -0,0 +1,3 @@
+#!/bin/sh
+# Open the file given as $1 in kchmviewer at the URL given as $2; both are quoted so spaces and glob characters survive.
+kchmviewer --url "$2" "$1"