epub handler: extract the opf metadata subjects fields as dc:subject tags. Share more code between rclepub and the now redundant rclepub1 (no more lynx usage in rclepub)

This commit is contained in:
Jean-Francois Dockes 2020-08-09 09:49:08 +02:00
parent 9f818ebe70
commit d932d19562
2 changed files with 58 additions and 35 deletions

View File

@ -32,7 +32,7 @@ class rclEPUB:
self.catenate = cf.getConfParam("epubcatenate")
self.catenate = int(self.catenate) if self.catenate else False
def _selfdoc(self):
def _docheader(self):
meta = self.book.opf.metadata
title = ""
for tt, lang in meta.titles:
@ -49,7 +49,39 @@ class rclEPUB:
if meta.description:
data += '<meta name="description" content="' + \
rclexecm.htmlescape(meta.description) + '">\n'
data = data.encode('UTF-8')
for value in meta.subjects:
data += '<meta name="dc:subject" content="' + \
rclexecm.htmlescape(value) + '">\n'
data += "</head>"
return data.encode('UTF-8')
def _catbodies(self):
    """Return the concatenated bodies of the book's XHTML items as bytes.

    Item ids are taken from the OPF spine when the book has one, and
    from the manifest otherwise. Ids that cannot be resolved, and items
    whose media type is not 'application/xhtml+xml', are skipped. Each
    remaining document has its processing instructions, its leading
    '<html ...> ... <body ...>' part and its closing body/html tags
    removed, so that all the bodies fit inside the single
    '<body>...</body></html>' wrapper which is returned.
    """
    opf = self.book.opf
    if opf.spine:
        item_ids = [item_id for item_id, _linear in opf.spine.itemrefs]
    else:
        item_ids = [item_id for item_id, _item in opf.manifest.items()]

    parts = [b'<body>']
    for item_id in item_ids:
        item = self.book.get_item(item_id)
        if item is None or item.media_type != 'application/xhtml+xml':
            continue
        doc = self.book.read_item(item)
        # Non-greedy: a greedy '.*' would also delete all the markup found
        # between two processing instructions sharing a line. Like before,
        # a processing instruction spanning several lines is left alone.
        doc = re.sub(br'<\?.*?\?>', b'', doc)
        # Non-greedy again, so that we stop at the first body tag instead
        # of the last '<body' text occurring anywhere in the document.
        doc = re.sub(br'<html.*?<body[^>]*>', b'', doc, count=1,
                     flags=re.DOTALL | re.I)
        doc = re.sub(br'</body>', b'', doc, flags=re.I)
        doc = re.sub(br'</html>', b'', doc, flags=re.I)
        parts.append(doc)
    parts.append(b'</body></html>')
    # Single join instead of repeated bytes concatenation.
    return b''.join(parts)
def _selfdoc(self):
data = self._docheader()
self.em.setmimetype('text/html')
if len(self.contents) == 0:
self.closefile()
@ -80,23 +112,9 @@ class rclEPUB:
return (False, "", id, iseof)
def dumpall(self):
self.em.setmimetype('text/plain')
alltxt=""
for idx in range(len(self.contents)):
ret,doc,path,iseof = self.extractone(self.contents[idx])
if not ret:
continue
# Feed doc to lynx
process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
"-display_charset=utf8",
"-force_html"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE
)
txt,err = process.communicate(doc)
alltxt += txt.decode('utf-8')
return alltxt
data = self._docheader()
data += self._catbodies()
return data
def closefile(self):
self.book.close()

View File

@ -23,7 +23,7 @@ class EPUBConcatExtractor(RclBaseHandler):
def __init__(self, em):
super(EPUBConcatExtractor, self).__init__(em)
def _header(self):
def _docheader(self):
meta = self.book.opf.metadata
title = ""
for tt, lang in meta.titles:
@ -40,18 +40,14 @@ class EPUBConcatExtractor(RclBaseHandler):
if meta.description:
data += '<meta name="description" content="' + \
rclexecm.htmlescape(meta.description) + '">\n'
data += "</head><body>"
data = data.encode('UTF-8')
for value in meta.subjects:
data += '<meta name="dc:subject" content="' + \
rclexecm.htmlescape(value) + '">\n'
data += "</head>"
return data.encode('UTF-8')
return data
def html_text(self, fn):
"""Extract EPUB data as concatenated HTML"""
f = open(fn, 'rb')
self.book = epub.open_epub(f)
data = self._header()
def _catbodies(self):
data = b'<body>'
ids = []
if self.book.opf.spine:
for id, linear in self.book.opf.spine.itemrefs:
@ -66,13 +62,22 @@ class EPUBConcatExtractor(RclBaseHandler):
continue
doc = self.book.read_item(item)
doc = re.sub(b'''<\?.*\?>''', b'', doc)
doc = re.sub(b'''<[hH][tT][mM][lL].*<[bB][oO][dD][yY][^>]*>''',
b'', doc, 1, re.DOTALL)
doc = re.sub(b'''</[bB][oO][dD][yY]>''', b'', doc)
doc = re.sub(b'''</[hH][tT][mM][lL]>''', b'', doc)
doc = re.sub(b'''<html.*<body[^>]*>''',
b'', doc, 1, flags=re.DOTALL|re.I)
doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
data += doc
data += b'</body></html>'
return data
def html_text(self, fn):
    """Extract EPUB data as concatenated HTML.

    Opens the EPUB file named by fn, stores the opened book in self.book
    and returns the document header followed by the concatenated item
    bodies, as bytes. The book and the underlying file are both closed
    before returning, including when the extraction raises.
    """
    # The file must stay open while the book is read, so the extraction
    # happens inside the 'with' block, which then closes the file for us.
    with open(fn, 'rb') as f:
        self.book = epub.open_epub(f)
        try:
            return self._docheader() + self._catbodies()
        finally:
            self.book.close()