diff --git a/src/filters/rclepub b/src/filters/rclepub index 32c55315..e0919a1a 100755 --- a/src/filters/rclepub +++ b/src/filters/rclepub @@ -32,7 +32,7 @@ class rclEPUB: self.catenate = cf.getConfParam("epubcatenate") self.catenate = int(self.catenate) if self.catenate else False - def _selfdoc(self): + def _docheader(self): meta = self.book.opf.metadata title = "" for tt, lang in meta.titles: @@ -49,7 +49,39 @@ class rclEPUB: if meta.description: data += '\n' - data = data.encode('UTF-8') + for value in meta.subjects: + data += '\n' + data += "" + return data.encode('UTF-8') + + def _catbodies(self): + data = b'' + ids = [] + if self.book.opf.spine: + for id, linear in self.book.opf.spine.itemrefs: + ids.append(id) + else: + for id, item in self.book.opf.manifest.items(): + ids.append(id) + + for id in ids: + item = self.book.get_item(id) + if item is None or item.media_type != 'application/xhtml+xml': + continue + doc = self.book.read_item(item) + doc = re.sub(b'''<\?.*\?>''', b'', doc) + doc = re.sub(b''']*>''', + b'', doc, 1, flags=re.DOTALL|re.I) + doc = re.sub(b'''''', b'', doc, flags=re.I) + doc = re.sub(b'''''', b'', doc, flags=re.I) + data += doc + + data += b'' + return data + + def _selfdoc(self): + data = self._docheader() self.em.setmimetype('text/html') if len(self.contents) == 0: self.closefile() @@ -80,23 +112,9 @@ class rclEPUB: return (False, "", id, iseof) def dumpall(self): - self.em.setmimetype('text/plain') - alltxt="" - - for idx in range(len(self.contents)): - ret,doc,path,iseof = self.extractone(self.contents[idx]) - if not ret: - continue - # Feed doc to lynx - process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist", - "-display_charset=utf8", - "-force_html"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE - ) - txt,err = process.communicate(doc) - alltxt += txt.decode('utf-8') - return alltxt + data = self._docheader() + data += self._catbodies() + return data def closefile(self): self.book.close() diff --git a/src/filters/rclepub1 b/src/filters/rclepub1 index e9574727..2d087a63 100755 --- a/src/filters/rclepub1 +++ b/src/filters/rclepub1 @@ -23,7 +23,7 @@ class EPUBConcatExtractor(RclBaseHandler): def __init__(self, em): super(EPUBConcatExtractor, self).__init__(em) - def _header(self): + def _docheader(self): meta = self.book.opf.metadata title = "" for tt, lang in meta.titles: @@ -40,18 +40,14 @@ class EPUBConcatExtractor(RclBaseHandler): if meta.description: data += '\n' - data += "" - data = data.encode('UTF-8') + for value in meta.subjects: + data += '\n' + data += "" + return data.encode('UTF-8') - return data - - def html_text(self, fn): - """Extract EPUB data as concatenated HTML""" - - f = open(fn, 'rb') - self.book = epub.open_epub(f) - - data = self._header() + def _catbodies(self): + data = b'' ids = [] if self.book.opf.spine: for id, linear in self.book.opf.spine.itemrefs: @@ -66,13 +62,22 @@ class EPUBConcatExtractor(RclBaseHandler): continue doc = self.book.read_item(item) doc = re.sub(b'''<\?.*\?>''', b'', doc) - doc = re.sub(b'''<[hH][tT][mM][lL].*<[bB][oO][dD][yY][^>]*>''', - b'', doc, 1, re.DOTALL) - doc = re.sub(b'''''', b'', doc) - doc = re.sub(b'''''', b'', doc) + doc = re.sub(b''']*>''', + b'', doc, 1, flags=re.DOTALL|re.I) + doc = re.sub(b'''''', b'', doc, flags=re.I) + doc = re.sub(b'''''', b'', doc, flags=re.I) data += doc data += b'' + return data + + def html_text(self, fn): + """Extract EPUB data as concatenated HTML""" + + f = open(fn, 'rb') + self.book = epub.open_epub(f) + data = self._docheader() + data += self._catbodies() self.book.close() return data