recoll/src/filters/rclepub
Jean-Francois Dockes c7a35a176c none
2012-10-12 13:35:21 +02:00

78 lines
2.4 KiB
Python
Executable File

#!/usr/bin/env python
"""Extract Html content from an EPUB file (.epub)"""
# MIME type handed to the execm handler's setmimetype() for the documents
# extracted by this filter.
rclepub_html_mtype = "text/html"
import sys
import os
import re
import rclexecm
# The epub module is an optional third-party helper. If it cannot be loaded
# for any reason, report it with a RECFILTERROR HELPERNOTFOUND line on stdout
# (presumably parsed by the process which started this filter) and give up.
try:
    import epub
except Exception:
    # 'except Exception' rather than a bare 'except' so that SystemExit and
    # KeyboardInterrupt are not mistaken for a missing helper.
    print("RECFILTERROR HELPERNOTFOUND python:epub")
    sys.exit(1)
class rclEPUB:
    """RclExecM slave worker for extracting all text from an EPUB file.

    We first extract the list of internal documents, and then return them
    one by one. The ipath of a document is its key in the OPF manifest.
    """

    def __init__(self, em):
        """Remember the execm handler and reset the extraction state.

        em: the rclexecm.RclExecM instance, used for logging and for
            setting the MIME type of the returned documents.
        """
        self.em = em
        self.currentindex = 0
        # Defined here too, so that getnext()/getipath() do not fail with
        # an AttributeError when called before a successful openfile().
        self.contents = []
        self.book = None
        self.em.setmimetype(rclepub_html_mtype)

    @staticmethod
    def _add_apptag(doc):
        """Return doc with a rclaptg=epub <meta> inserted before </head>.

        doc may be a byte string or a text string, the result has the
        same type. A document without a </head> tag is returned unchanged.
        """
        # The '>' is part of the match: without it the substitution leaves
        # a stray '>' after the new </head>, and HTML5 </header> tags get
        # mangled too.
        pattern = r'</[hH][eE][aA][dD]\s*>'
        repl = '<meta name="rclaptg" content="epub"></head>'
        if not isinstance(doc, str):
            # Python 3 bytes: re wants pattern and replacement to have the
            # same type as the subject (harmless no-op for Python 2).
            pattern = pattern.encode('ascii')
            repl = repl.encode('ascii')
        return re.sub(pattern, repl, doc)

    def extractone(self, id):
        """Extract one internal document from the EPUB file.

        id: manifest key of the document, also returned as its ipath.

        Returns an (ok, data, ipath, eof) tuple. On failure the error is
        logged through the execm handler, ok is False and data is empty.
        """
        #self.em.rclog("extractone: [%s]"%(id))
        # Flag the document as the last one when the iteration index is on
        # (or past) the final entry of the contents list.
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.contents) - 1:
            iseof = rclexecm.RclExecM.eofnext

        try:
            if self.book is None:
                raise Exception("No EPUB file open")
            item = self.book.get_item(id)
            if item is None:
                raise Exception("Item not found for id %s" % (id,))
            doc = self._add_apptag(self.book.read_item(item))
            self.em.setmimetype(rclepub_html_mtype)
            return (True, doc, id, iseof)
        except Exception as err:
            self.em.rclog("extractone: failed: [%s]" % err)
            return (False, "", id, iseof)

    def openfile(self, params):
        """Open the EPUB file and build the list of documents to extract.

        params: request dictionary, the file path is its "filename:" entry.

        Returns True on success, False (after logging the error) if the
        file could not be opened.
        """
        self.currentindex = 0
        self.contents = []
        # Forget any previously opened book, so that a failed open cannot
        # leave us serving documents from the wrong file.
        self.book = None
        try:
            self.book = epub.open(params["filename:"])
        except Exception as err:
            self.em.rclog("openfile: epub.open failed: [%s]" % err)
            return False

        # Keep the manifest keys of the XHTML items only.
        self.contents = [key
                         for key, item in self.book.opf.manifest.items()
                         if item.media_type == 'application/xhtml+xml']
        return True

    def getipath(self, params):
        """Extract the document named by the "ipath:" entry of params.

        Returns the extractone() tuple for that document.
        """
        return self.extractone(params["ipath:"])

    def getnext(self, params):
        """Extract the next document of the contents list.

        Returns the extractone() tuple for the current entry, or
        (False, "", "", eofnow) once the list is exhausted.
        """
        if self.currentindex >= len(self.contents):
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(self.contents[self.currentindex])
        self.currentindex += 1
        return ret
# Script entry point: create the execm protocol handler and the EPUB
# extractor, then hand both over to rclexecm.main(). Guarded so that merely
# importing this file does not start the protocol loop.
if __name__ == "__main__":
    proto = rclexecm.RclExecM()
    extract = rclEPUB(proto)
    rclexecm.main(proto, extract)