#!/usr/bin/env python3 """Extract Html content from an EPUB file (.chm), concatenating all sections""" from __future__ import print_function import sys import os import re import rclexecm from rclbasehandler import RclBaseHandler sys.path.append(sys.path[0]+"/recollepub.zip") try: import epub except: print("RECFILTERROR HELPERNOTFOUND python3:epub") sys.exit(1); class EPUBConcatExtractor(RclBaseHandler): """RclExecM slave worker for extracting all text from an EPUB file. This version concatenates all nodes.""" def __init__(self, em): super(EPUBConcatExtractor, self).__init__(em) def _header(self): meta = self.book.opf.metadata title = "" for tt, lang in meta.titles: title += tt + " " author = "" for name, role, fileas in meta.creators: author += name + " " data = "\n\n" if title: data += "" + self.em.htmlescape(title) + "\n" if author: data += '\n' if meta.description: data += '\n' data += "" data = data.encode('UTF-8') return data def html_text(self, fn): """Extract EPUB data as concatenated HTML""" f = open(fn, 'rb') self.book = epub.open_epub(f) data = self._header() ids = [] if self.book.opf.spine: for id, linear in self.book.opf.spine.itemrefs: ids.append(id) else: for id, item in self.book.opf.manifest.items(): ids.append(id) for id in ids: item = self.book.get_item(id) if item is None or item.media_type != 'application/xhtml+xml': continue doc = self.book.read_item(item) doc = re.sub(b'''<\?.*\?>''', b'', doc) doc = re.sub(b'''<[hH][tT][mM][lL].*<[bB][oO][dD][yY][^>]*>''', b'', doc, 1, re.DOTALL) doc = re.sub(b'''''', b'', doc) doc = re.sub(b'''''', b'', doc) data += doc data += b'' return data proto = rclexecm.RclExecM() extract = EPUBConcatExtractor(proto) rclexecm.main(proto, extract)