epub handler: extract the OPF metadata subject fields as dc:subject tags. Share more code between rclepub and the now-redundant rclepub1 (rclepub no longer uses lynx)
This commit is contained in:
parent
9f818ebe70
commit
d932d19562
@ -32,7 +32,7 @@ class rclEPUB:
|
|||||||
self.catenate = cf.getConfParam("epubcatenate")
|
self.catenate = cf.getConfParam("epubcatenate")
|
||||||
self.catenate = int(self.catenate) if self.catenate else False
|
self.catenate = int(self.catenate) if self.catenate else False
|
||||||
|
|
||||||
def _selfdoc(self):
|
def _docheader(self):
|
||||||
meta = self.book.opf.metadata
|
meta = self.book.opf.metadata
|
||||||
title = ""
|
title = ""
|
||||||
for tt, lang in meta.titles:
|
for tt, lang in meta.titles:
|
||||||
@ -49,7 +49,39 @@ class rclEPUB:
|
|||||||
if meta.description:
|
if meta.description:
|
||||||
data += '<meta name="description" content="' + \
|
data += '<meta name="description" content="' + \
|
||||||
rclexecm.htmlescape(meta.description) + '">\n'
|
rclexecm.htmlescape(meta.description) + '">\n'
|
||||||
data = data.encode('UTF-8')
|
for value in meta.subjects:
|
||||||
|
data += '<meta name="dc:subject" content="' + \
|
||||||
|
rclexecm.htmlescape(value) + '">\n'
|
||||||
|
data += "</head>"
|
||||||
|
return data.encode('UTF-8')
|
||||||
|
|
||||||
|
def _catbodies(self):
|
||||||
|
data = b'<body>'
|
||||||
|
ids = []
|
||||||
|
if self.book.opf.spine:
|
||||||
|
for id, linear in self.book.opf.spine.itemrefs:
|
||||||
|
ids.append(id)
|
||||||
|
else:
|
||||||
|
for id, item in self.book.opf.manifest.items():
|
||||||
|
ids.append(id)
|
||||||
|
|
||||||
|
for id in ids:
|
||||||
|
item = self.book.get_item(id)
|
||||||
|
if item is None or item.media_type != 'application/xhtml+xml':
|
||||||
|
continue
|
||||||
|
doc = self.book.read_item(item)
|
||||||
|
doc = re.sub(b'''<\?.*\?>''', b'', doc)
|
||||||
|
doc = re.sub(b'''<html.*<body[^>]*>''',
|
||||||
|
b'', doc, 1, flags=re.DOTALL|re.I)
|
||||||
|
doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
|
||||||
|
doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
|
||||||
|
data += doc
|
||||||
|
|
||||||
|
data += b'</body></html>'
|
||||||
|
return data
|
||||||
|
|
||||||
|
def _selfdoc(self):
|
||||||
|
data = self._docheader()
|
||||||
self.em.setmimetype('text/html')
|
self.em.setmimetype('text/html')
|
||||||
if len(self.contents) == 0:
|
if len(self.contents) == 0:
|
||||||
self.closefile()
|
self.closefile()
|
||||||
@ -80,23 +112,9 @@ class rclEPUB:
|
|||||||
return (False, "", id, iseof)
|
return (False, "", id, iseof)
|
||||||
|
|
||||||
def dumpall(self):
|
def dumpall(self):
|
||||||
self.em.setmimetype('text/plain')
|
data = self._docheader()
|
||||||
alltxt=""
|
data += self._catbodies()
|
||||||
|
return data
|
||||||
for idx in range(len(self.contents)):
|
|
||||||
ret,doc,path,iseof = self.extractone(self.contents[idx])
|
|
||||||
if not ret:
|
|
||||||
continue
|
|
||||||
# Feed doc to lynx
|
|
||||||
process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
|
|
||||||
"-display_charset=utf8",
|
|
||||||
"-force_html"],
|
|
||||||
stdin=subprocess.PIPE,
|
|
||||||
stdout=subprocess.PIPE
|
|
||||||
)
|
|
||||||
txt,err = process.communicate(doc)
|
|
||||||
alltxt += txt.decode('utf-8')
|
|
||||||
return alltxt
|
|
||||||
|
|
||||||
def closefile(self):
|
def closefile(self):
|
||||||
self.book.close()
|
self.book.close()
|
||||||
|
|||||||
@ -23,7 +23,7 @@ class EPUBConcatExtractor(RclBaseHandler):
|
|||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
super(EPUBConcatExtractor, self).__init__(em)
|
super(EPUBConcatExtractor, self).__init__(em)
|
||||||
|
|
||||||
def _header(self):
|
def _docheader(self):
|
||||||
meta = self.book.opf.metadata
|
meta = self.book.opf.metadata
|
||||||
title = ""
|
title = ""
|
||||||
for tt, lang in meta.titles:
|
for tt, lang in meta.titles:
|
||||||
@ -40,18 +40,14 @@ class EPUBConcatExtractor(RclBaseHandler):
|
|||||||
if meta.description:
|
if meta.description:
|
||||||
data += '<meta name="description" content="' + \
|
data += '<meta name="description" content="' + \
|
||||||
rclexecm.htmlescape(meta.description) + '">\n'
|
rclexecm.htmlescape(meta.description) + '">\n'
|
||||||
data += "</head><body>"
|
for value in meta.subjects:
|
||||||
data = data.encode('UTF-8')
|
data += '<meta name="dc:subject" content="' + \
|
||||||
|
rclexecm.htmlescape(value) + '">\n'
|
||||||
|
data += "</head>"
|
||||||
|
return data.encode('UTF-8')
|
||||||
|
|
||||||
return data
|
def _catbodies(self):
|
||||||
|
data = b'<body>'
|
||||||
def html_text(self, fn):
|
|
||||||
"""Extract EPUB data as concatenated HTML"""
|
|
||||||
|
|
||||||
f = open(fn, 'rb')
|
|
||||||
self.book = epub.open_epub(f)
|
|
||||||
|
|
||||||
data = self._header()
|
|
||||||
ids = []
|
ids = []
|
||||||
if self.book.opf.spine:
|
if self.book.opf.spine:
|
||||||
for id, linear in self.book.opf.spine.itemrefs:
|
for id, linear in self.book.opf.spine.itemrefs:
|
||||||
@ -66,13 +62,22 @@ class EPUBConcatExtractor(RclBaseHandler):
|
|||||||
continue
|
continue
|
||||||
doc = self.book.read_item(item)
|
doc = self.book.read_item(item)
|
||||||
doc = re.sub(b'''<\?.*\?>''', b'', doc)
|
doc = re.sub(b'''<\?.*\?>''', b'', doc)
|
||||||
doc = re.sub(b'''<[hH][tT][mM][lL].*<[bB][oO][dD][yY][^>]*>''',
|
doc = re.sub(b'''<html.*<body[^>]*>''',
|
||||||
b'', doc, 1, re.DOTALL)
|
b'', doc, 1, flags=re.DOTALL|re.I)
|
||||||
doc = re.sub(b'''</[bB][oO][dD][yY]>''', b'', doc)
|
doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
|
||||||
doc = re.sub(b'''</[hH][tT][mM][lL]>''', b'', doc)
|
doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
|
||||||
data += doc
|
data += doc
|
||||||
|
|
||||||
data += b'</body></html>'
|
data += b'</body></html>'
|
||||||
|
return data
|
||||||
|
|
||||||
|
def html_text(self, fn):
    """Extract EPUB data as concatenated HTML.

    Opens the file named by fn, stores the opened book in self.book, and
    returns the header produced by _docheader() followed by the body
    produced by _catbodies().

    The book is closed, and the file handle handed to open_epub() is
    released, even when building the header or the body raises. Any such
    exception still propagates to the caller.
    """
    with open(fn, 'rb') as f:
        self.book = epub.open_epub(f)
        try:
            return self._docheader() + self._catbodies()
        finally:
            self.book.close()
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user