epub handler: extract the OPF metadata subject fields as dc:subject tags. Share more code between rclepub and the now-redundant rclepub1 (rclepub no longer uses lynx)

2020-08-09 09:49:08 +02:00 · 2020-08-09 09:49:08 +02:00 · d932d19562
commit d932d19562
parent 9f818ebe70
2 changed files with 58 additions and 35 deletions
--- a/src/filters/rclepub
+++ b/src/filters/rclepub
@ -32,7 +32,7 @@ class rclEPUB:
        self.catenate = cf.getConfParam("epubcatenate")
        self.catenate = int(self.catenate) if self.catenate else False
-    def _selfdoc(self):
+    def _docheader(self):
        meta = self.book.opf.metadata
        title = ""
        for tt, lang in meta.titles:
@ -49,7 +49,39 @@ class rclEPUB:
        if meta.description:
            data += '<meta name="description" content="' + \
                rclexecm.htmlescape(meta.description) + '">\n'
-        data = data.encode('UTF-8')
+        for value in meta.subjects:
            data += '<meta name="dc:subject" content="' + \
                rclexecm.htmlescape(value) + '">\n' 
        data += "</head>"
        return data.encode('UTF-8')
    def _catbodies(self):
        data = b'<body>'
        ids = []
        if self.book.opf.spine:
            for id, linear in self.book.opf.spine.itemrefs:
                ids.append(id)
        else:
            for id, item in self.book.opf.manifest.items():
                ids.append(id)
        for id in ids:
            item = self.book.get_item(id)
            if item is None or item.media_type != 'application/xhtml+xml':
                continue
            doc = self.book.read_item(item)
            doc = re.sub(b'''<\?.*\?>''', b'', doc)
            doc = re.sub(b'''<html.*<body[^>]*>''',
                         b'', doc, 1, flags=re.DOTALL|re.I)
            doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
            doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
            data += doc
        data += b'</body></html>'
        return data
    def _selfdoc(self):
        data = self._docheader()
        self.em.setmimetype('text/html')
        if len(self.contents) == 0:
            self.closefile()
@ -80,23 +112,9 @@ class rclEPUB:
            return (False, "", id, iseof)
    def dumpall(self):
-        self.em.setmimetype('text/plain')
+        data = self._docheader()
-        alltxt=""
+        data += self._catbodies()
-
+        return data
        for idx in range(len(self.contents)):
            ret,doc,path,iseof = self.extractone(self.contents[idx])
            if not ret:
                continue
            # Feed doc to lynx
            process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
                                        "-display_charset=utf8",
                                        "-force_html"], 
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE
                                       )
            txt,err = process.communicate(doc)
            alltxt += txt.decode('utf-8')
        return alltxt
    def closefile(self):
        self.book.close()
--- a/src/filters/rclepub1
+++ b/src/filters/rclepub1
@ -23,7 +23,7 @@ class EPUBConcatExtractor(RclBaseHandler):
    def __init__(self, em):
        super(EPUBConcatExtractor, self).__init__(em)
-    def _header(self):
+    def _docheader(self):
        meta = self.book.opf.metadata
        title = ""
        for tt, lang in meta.titles:
@ -40,18 +40,14 @@ class EPUBConcatExtractor(RclBaseHandler):
        if meta.description:
            data += '<meta name="description" content="' + \
                rclexecm.htmlescape(meta.description) + '">\n'
-        data += "</head><body>"
+        for value in meta.subjects:
-        data = data.encode('UTF-8')
+            data += '<meta name="dc:subject" content="' + \
                rclexecm.htmlescape(value) + '">\n' 
        data += "</head>"
        return data.encode('UTF-8')
-        return data
+    def _catbodies(self):
-
+        data = b'<body>'
    def html_text(self, fn):
        """Extract EPUB data as concatenated HTML"""
        f = open(fn, 'rb')
        self.book = epub.open_epub(f)
        data = self._header()
        ids = []
        if self.book.opf.spine:
            for id, linear in self.book.opf.spine.itemrefs:
@ -66,13 +62,22 @@ class EPUBConcatExtractor(RclBaseHandler):
                continue
            doc = self.book.read_item(item)
            doc = re.sub(b'''<\?.*\?>''', b'', doc)
-            doc = re.sub(b'''<[hH][tT][mM][lL].*<[bB][oO][dD][yY][^>]*>''',
+            doc = re.sub(b'''<html.*<body[^>]*>''',
-                         b'', doc, 1, re.DOTALL)
+                         b'', doc, 1, flags=re.DOTALL|re.I)
-            doc = re.sub(b'''</[bB][oO][dD][yY]>''', b'', doc)
+            doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
-            doc = re.sub(b'''</[hH][tT][mM][lL]>''', b'', doc)
+            doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
            data += doc
        data += b'</body></html>'
        return data
    def html_text(self, fn):
        """Extract EPUB data as concatenated HTML.

        Opens the file named by fn, builds the document header with
        _docheader() and appends the concatenated bodies from _catbodies().
        Returns the resulting bytes.
        """
        # The with block closes the file object even if extraction fails;
        # the original never closed it explicitly.
        with open(fn, 'rb') as f:
            self.book = epub.open_epub(f)
            try:
                data = self._docheader()
                data += self._catbodies()
                return data
            finally:
                # Always release the book, also on errors.
                self.book.close()