81 lines
2.4 KiB
Python
Executable File
81 lines
2.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Extract Html content from an EPUB file (.chm), concatenating all sections"""
|
|
from __future__ import print_function
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
|
|
import rclexecm
|
|
from rclbasehandler import RclBaseHandler
|
|
|
|
sys.path.append(sys.path[0]+"/recollepub.zip")
|
|
try:
|
|
import epub
|
|
except:
|
|
print("RECFILTERROR HELPERNOTFOUND python3:epub")
|
|
sys.exit(1);
|
|
|
|
class EPUBConcatExtractor(RclBaseHandler):
    """RclExecM slave worker for extracting all text from an EPUB file.

    This version concatenates all nodes: the XHTML items of the book are
    stripped of their own <html>/<body> wrappers and appended, in order,
    inside a single HTML document.
    """

    # Patterns removing the per-item document wrapper. They are compiled once
    # and operate on bytes, because items are handled without decoding.
    # The processing-instruction match is non-greedy so that two "<?...?>"
    # on the same line do not swallow the content between them.
    _XML_PI_RE = re.compile(br'<\?.*?\?>')
    # Everything from the <html> tag up to and including the <body> tag.
    _HTML_OPEN_RE = re.compile(br'<html.*<body[^>]*>',
                               re.IGNORECASE | re.DOTALL)
    _BODY_CLOSE_RE = re.compile(br'</body>', re.IGNORECASE)
    _HTML_CLOSE_RE = re.compile(br'</html>', re.IGNORECASE)

    def __init__(self, em):
        """Initialise the handler; `em` is passed on to RclBaseHandler."""
        super(EPUBConcatExtractor, self).__init__(em)

    def _header(self):
        """Build the opening part of the output document.

        Reads titles, creators and description from
        `self.book.opf.metadata` and returns the `<html><head>...</head>`
        section followed by the opening `<body>` tag, encoded as UTF-8 bytes.
        Title, author and description are only emitted when non-empty.
        """
        meta = self.book.opf.metadata
        # Entries are tuples; only the first field (the text) is used.
        title = " ".join(tt for tt, _lang in meta.titles).strip()
        author = " ".join(
            name for name, _role, _fileas in meta.creators).strip()

        # NOTE(review): self.em is presumably set by RclBaseHandler.__init__
        # and htmlescape() is assumed to return text -- confirm.
        parts = ["<html>\n<head>\n"]
        if title:
            parts.append("<title>" + self.em.htmlescape(title) + "</title>\n")
        if author:
            parts.append('<meta name="author" content="' +
                         self.em.htmlescape(author).strip() + '">\n')
        if meta.description:
            parts.append('<meta name="description" content="' +
                         self.em.htmlescape(meta.description) + '">\n')
        parts.append("</head><body>")
        return "".join(parts).encode('UTF-8')

    def _strip_wrapper(self, doc):
        """Return the bytes of one XHTML item without its document wrapper.

        Removes XML processing instructions, the first span running from
        the <html> tag to the <body> tag, and the closing </body> and
        </html> tags.
        """
        doc = self._XML_PI_RE.sub(b'', doc)
        doc = self._HTML_OPEN_RE.sub(b'', doc, count=1)
        doc = self._BODY_CLOSE_RE.sub(b'', doc)
        return self._HTML_CLOSE_RE.sub(b'', doc)

    def html_text(self, fn):
        """Extract EPUB data as concatenated HTML.

        Opens the file named `fn`, stores the opened book in `self.book`,
        and returns a single HTML document as bytes: the metadata header
        followed by the stripped content of every 'application/xhtml+xml'
        item. Items are taken in spine order when the book has a spine,
        else in manifest order.
        """
        # The book reads its items from the open file object, so everything
        # is read inside the `with` block; the file is closed on exit,
        # including when an exception is raised.
        with open(fn, 'rb') as f:
            self.book = epub.open_epub(f)
            opf = self.book.opf

            chunks = [self._header()]
            if opf.spine:
                item_ids = [ref for ref, _linear in opf.spine.itemrefs]
            else:
                item_ids = [key for key, _item in opf.manifest.items()]

            for item_id in item_ids:
                item = self.book.get_item(item_id)
                if item is None or item.media_type != 'application/xhtml+xml':
                    continue
                chunks.append(self._strip_wrapper(self.book.read_item(item)))

        chunks.append(b'</body></html>')
        return b''.join(chunks)
|
|
|
|
if __name__ == "__main__":
    # Start the protocol loop only when run as a script, so that importing
    # this module (e.g. for testing) has no side effect.
    proto = rclexecm.RclExecM()
    extract = EPUBConcatExtractor(proto)
    rclexecm.main(proto, extract)
|