#!/usr/bin/env python3 """Extract Html content from an EPUB file (.epub)""" rclepub_html_mtype = "text/html" import sys import os import re import subprocess import rclexecm import rclconfig sys.path.insert(0, sys.path[0]+"/recollepub.zip") try: import epub except: print("RECFILTERROR HELPERNOTFOUND python3:epub") sys.exit(1); class rclEPUB: """RclExecM slave worker for extracting all text from an EPUB file. We first extract the list of internal nodes, and them return them one by one. The ipath is the internal href""" def __init__(self, em): self.currentindex = 0 self.em = em self.em.setmimetype(rclepub_html_mtype) cf = rclconfig.RclConfig() self.catenate = cf.getConfParam("epubcatenate") self.catenate = int(self.catenate) if self.catenate else False def _docheader(self): meta = self.book.opf.metadata title = "" for tt, lang in meta.titles: title += tt + " " author = "" for name, role, fileas in meta.creators: author += name + " " data = "\n
\n" if title: data += "