epub handler: extract the OPF metadata subject fields as dc:subject meta tags. Share more code between rclepub and the now-redundant rclepub1 (rclepub no longer uses lynx)

2020-08-09 09:49:08 +02:00 · 2020-08-09 09:49:08 +02:00 · d932d19562
commit d932d19562
parent 9f818ebe70
2 changed files with 58 additions and 35 deletions
--- a/src/filters/rclepub
+++ b/src/filters/rclepub
@@ -32,7 +32,7 @@ class rclEPUB:
        self.catenate = cf.getConfParam("epubcatenate")
        self.catenate = int(self.catenate) if self.catenate else False

-    def _selfdoc(self):
+    def _docheader(self):
        meta = self.book.opf.metadata
        title = ""
        for tt, lang in meta.titles:
@@ -49,7 +49,39 @@ class rclEPUB:
        if meta.description:
            data += '<meta name="description" content="' + \
                rclexecm.htmlescape(meta.description) + '">\n'
-        data = data.encode('UTF-8')
+        for value in meta.subjects:
+            data += '<meta name="dc:subject" content="' + \
+                rclexecm.htmlescape(value) + '">\n' 
+        data += "</head>"
+        return data.encode('UTF-8')
+
+    def _catbodies(self):
+        data = b'<body>'
+        ids = []
+        if self.book.opf.spine:
+            for id, linear in self.book.opf.spine.itemrefs:
+                ids.append(id)
+        else:
+            for id, item in self.book.opf.manifest.items():
+                ids.append(id)
+
+        for id in ids:
+            item = self.book.get_item(id)
+            if item is None or item.media_type != 'application/xhtml+xml':
+                continue
+            doc = self.book.read_item(item)
+            doc = re.sub(b'''<\?.*\?>''', b'', doc)
+            doc = re.sub(b'''<html.*<body[^>]*>''',
+                         b'', doc, 1, flags=re.DOTALL|re.I)
+            doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
+            doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
+            data += doc
+
+        data += b'</body></html>'
+        return data
+
+    def _selfdoc(self):
+        data = self._docheader()
        self.em.setmimetype('text/html')
        if len(self.contents) == 0:
            self.closefile()
@@ -80,23 +112,9 @@ class rclEPUB:
            return (False, "", id, iseof)

    def dumpall(self):
-        self.em.setmimetype('text/plain')
-        alltxt=""
-
-        for idx in range(len(self.contents)):
-            ret,doc,path,iseof = self.extractone(self.contents[idx])
-            if not ret:
-                continue
-            # Feed doc to lynx
-            process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
-                                        "-display_charset=utf8",
-                                        "-force_html"], 
-                                       stdin=subprocess.PIPE,
-                                       stdout=subprocess.PIPE
-                                       )
-            txt,err = process.communicate(doc)
-            alltxt += txt.decode('utf-8')
-        return alltxt
+        data = self._docheader()
+        data += self._catbodies()
+        return data

    def closefile(self):
        self.book.close()
--- a/src/filters/rclepub1
+++ b/src/filters/rclepub1
@@ -23,7 +23,7 @@ class EPUBConcatExtractor(RclBaseHandler):
    def __init__(self, em):
        super(EPUBConcatExtractor, self).__init__(em)

-    def _header(self):
+    def _docheader(self):
        meta = self.book.opf.metadata
        title = ""
        for tt, lang in meta.titles:
@@ -40,18 +40,14 @@ class EPUBConcatExtractor(RclBaseHandler):
        if meta.description:
            data += '<meta name="description" content="' + \
                rclexecm.htmlescape(meta.description) + '">\n'
-        data += "</head><body>"
-        data = data.encode('UTF-8')
+        for value in meta.subjects:
+            data += '<meta name="dc:subject" content="' + \
+                rclexecm.htmlescape(value) + '">\n' 
+        data += "</head>"
+        return data.encode('UTF-8')

-        return data
-
-    def html_text(self, fn):
-        """Extract EPUB data as concatenated HTML"""
-
-        f = open(fn, 'rb')
-        self.book = epub.open_epub(f)
-
-        data = self._header()
+    def _catbodies(self):
+        data = b'<body>'
        ids = []
        if self.book.opf.spine:
            for id, linear in self.book.opf.spine.itemrefs:
@@ -66,13 +62,22 @@ class EPUBConcatExtractor(RclBaseHandler):
                continue
            doc = self.book.read_item(item)
            doc = re.sub(b'''<\?.*\?>''', b'', doc)
-            doc = re.sub(b'''<[hH][tT][mM][lL].*<[bB][oO][dD][yY][^>]*>''',
-                         b'', doc, 1, re.DOTALL)
-            doc = re.sub(b'''</[bB][oO][dD][yY]>''', b'', doc)
-            doc = re.sub(b'''</[hH][tT][mM][lL]>''', b'', doc)
+            doc = re.sub(b'''<html.*<body[^>]*>''',
+                         b'', doc, 1, flags=re.DOTALL|re.I)
+            doc = re.sub(b'''</body>''', b'', doc, flags=re.I)
+            doc = re.sub(b'''</html>''', b'', doc, flags=re.I)
            data += doc

        data += b'</body></html>'
+        return data
+        
+    def html_text(self, fn):
+        """Extract EPUB data as concatenated HTML"""
+
+        f = open(fn, 'rb')
+        self.book = epub.open_epub(f)
+        data = self._docheader()
+        data += self._catbodies()
        self.book.close()
        return data