149 lines
4.9 KiB
Python
Executable File
149 lines
4.9 KiB
Python
Executable File
#!/usr/bin/python3
|
|
"""Extract Html content from an EPUB file (.epub)"""
|
|
from __future__ import print_function
|
|
|
|
rclepub_html_mtype = "text/html"
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import subprocess
|
|
|
|
import rclexecm
|
|
import rclconfig
|
|
|
|
# The epub module is looked up first in a "recollepub.zip" archive located
# under sys.path[0] (normally the directory holding this script).
sys.path.insert(0, sys.path[0]+"/recollepub.zip")
try:
    import epub
except Exception:
    # Any failure to load the helper module is reported on stdout with the
    # message below, and the handler exits with an error status. We do not
    # use a bare "except:" so that SystemExit and KeyboardInterrupt are not
    # mistaken for a missing helper.
    print("RECFILTERROR HELPERNOTFOUND python3:epub")
    sys.exit(1)
|
|
|
|
class rclEPUB:
    """RclExecM slave worker for extracting all text from an EPUB file.

    openfile() builds the list of the XHTML manifest item ids. The
    corresponding documents are then returned one by one by getnext(), or
    individually by getipath(). The ipath of a sub-document is its manifest
    item id. When the "epubcatenate" configuration parameter is set,
    getnext() returns the whole book as a single plain text document
    instead.

    All the data-returning methods return an (ok, data, ipath, eof) tuple,
    where eof is one of the rclexecm.RclExecM end-of-file constants.
    """

    def __init__(self, em):
        """Store the protocol object and read the catenation setting.

        em: object used for setting the output MIME type (setmimetype()),
            escaping HTML (htmlescape()) and logging (rclog()).
        """
        self.currentindex = 0
        self.em = em
        self.em.setmimetype(rclepub_html_mtype)
        cf = rclconfig.RclConfig()
        catenate = cf.getConfParam("epubcatenate")
        # An unset or empty parameter means "do not catenate".
        self.catenate = int(catenate) if catenate else False

    def _selfdoc(self):
        """Build the HTML document describing the book itself.

        The document has an empty body; its head holds the title, author
        and description found in the OPF metadata. The returned ipath is
        empty. If the book has no content documents, the file is closed and
        the eof flag is eofnext, else it is noteof.
        """
        meta = self.book.opf.metadata
        title = "".join(tt + " " for tt, _lang in meta.titles)
        author = "".join(name + " " for name, _role, _fileas in meta.creators)

        parts = ["<html>\n<head>\n"]
        if title:
            parts.append("<title>" + self.em.htmlescape(title) + "</title>\n")
        if author:
            parts.append('<meta name="author" content="' +
                         self.em.htmlescape(author).strip() + '">\n')
        if meta.description:
            parts.append('<meta name="description" content="' +
                         self.em.htmlescape(meta.description) + '">\n')
        # Close the head and the document: the output used to be left
        # unterminated.
        parts.append("</head><body></body></html>")
        data = "".join(parts).encode('UTF-8')

        self.em.setmimetype(rclepub_html_mtype)
        if not self.contents:
            self.closefile()
            eof = rclexecm.RclExecM.eofnext
        else:
            eof = rclexecm.RclExecM.noteof
        return (True, data, "", eof)

    def extractone(self, id):
        """Extract one internal document, designated by its manifest id.

        On success, returns (True, data, id, eof) where data is the item
        content (bytes), with an rclaptg "epub" meta tag inserted before
        each closing head tag, and sets the output MIME type to HTML. On
        any failure, logs the error and returns (False, "", id, eof).

        The eof flag is eofnext if self.currentindex designates the last
        entry of the contents list (or is beyond it), else noteof.
        """
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.contents) - 1:
            iseof = rclexecm.RclExecM.eofnext

        try:
            item = self.book.get_item(id)
            if item is None:
                raise Exception("Item not found for id %s" % (id,))
            doc = self.book.read_item(item)
            doc = re.sub(b'''</[hH][eE][aA][dD]>''',
                         b'''<meta name="rclaptg" content="epub"></head>''',
                         doc)
            self.em.setmimetype(rclepub_html_mtype)
            return (True, doc, id, iseof)
        except Exception as err:
            self.em.rclog("extractone: failed: [%s]" % err)
            return (False, "", id, iseof)

    def dumpall(self):
        """Return the text of all the content documents, concatenated.

        Each document is converted from HTML to text by running it through
        the external "lynx" command. Documents which can't be extracted
        are skipped. The output MIME type is set to text/plain. Returns a
        str, empty if nothing could be extracted.
        """
        texts = []
        for itemid in self.contents:
            ret, doc, _path, _iseof = self.extractone(itemid)
            if not ret:
                continue
            # Feed doc to lynx
            process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
                                        "-display_charset=utf8",
                                        "-force_html"],
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE)
            txt, _err = process.communicate(doc)
            # Don't let a stray invalid byte in the lynx output abort the
            # processing of the whole book.
            texts.append(txt.decode('utf-8', errors="replace"))

        # This must be done after the loop: every successful extractone()
        # call sets the MIME type to HTML, but what we return is plain text.
        self.em.setmimetype('text/plain')
        return "".join(texts)

    def closefile(self):
        """Close the currently open book."""
        self.book.close()

    def openfile(self, params):
        """Open the EPUB file, create a contents array.

        params: dictionary in which the "filename" entry holds the UTF-8
            encoded (bytes) path of the file.

        The contents array holds the ids of the manifest items having an
        'application/xhtml+xml' media type. Returns True on success. If
        the file can't be opened, logs the error and returns False.
        """
        self.currentindex = -1
        self.contents = []
        try:
            self.book = epub.open_epub(params["filename"].decode('UTF-8'))
        except Exception as err:
            self.em.rclog("openfile: epub.open failed: [%s]" % err)
            return False
        self.contents = [id for id, item in self.book.opf.manifest.items()
                         if item.media_type == 'application/xhtml+xml']
        return True

    def getipath(self, params):
        """Return the document designated by the "ipath" entry of params.

        The ipath value is UTF-8 encoded bytes, holding a manifest item
        id. See extractone() for the return value.
        """
        return self.extractone(params["ipath"].decode('UTF-8'))

    def getnext(self, params):
        """Return the next document for the currently open file.

        In catenation mode, a single call returns the text of the whole
        book, flagged eofnext, or an error flagged eofnow if there is no
        text. The file is closed in both cases.

        Otherwise, the first call after openfile() returns the self
        document (see _selfdoc()), and each following call returns the next
        entry of the contents array (see extractone()). The file is closed
        when the returned eof flag is eofnext or eofnow. A call made after
        the last entry closes the file and returns an error flagged eofnow.
        """
        if self.catenate:
            alltxt = self.dumpall()
            self.closefile()
            if alltxt:
                return (True, alltxt, "", rclexecm.RclExecM.eofnext)
            return (False, "", "", rclexecm.RclExecM.eofnow)

        if self.currentindex == -1:
            # First call for this file: return the self document.
            self.currentindex = 0
            return self._selfdoc()

        if self.currentindex >= len(self.contents):
            self.closefile()
            return (False, "", "", rclexecm.RclExecM.eofnow)

        ret = self.extractone(self.contents[self.currentindex])
        if ret[3] in (rclexecm.RclExecM.eofnext, rclexecm.RclExecM.eofnow):
            self.closefile()
        self.currentindex += 1
        return ret
|
|
|
|
# Script entry: create the rclexecm protocol object, build the EPUB
# extractor on top of it, then pass both to rclexecm.main().
proto = rclexecm.RclExecM()
extract = rclEPUB(proto)
rclexecm.main(proto, extract)
|