rclchm, rclepub: add the chmcatenate and epubcatenate configuration variables, which make the filters index each file as a single concatenated document instead of as individual chapters

2018-01-05 17:56:19 +01:00 · 2018-01-05 17:56:19 +01:00 · 413c710f34
commit 413c710f34
parent 453a3bb5c6
2 changed files with 39 additions and 10 deletions
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@ -7,11 +7,6 @@ from __future__ import print_function
 # Note: this is not converted to python3, libchm does not have a
 # python3 wrapper at this point (2015-11)

-# Do we return individual chapters as html pages or concatenate everything?
-rclchm_catenate = 0
-# Use special html type to allow for mimeconf/mimeview Open magic,
-# Or go the regular html way with text/html
-#rclchm_html_mtype = "text/x-chm-html"
 rclchm_html_mtype = "text/html"

 import sys
@ -20,9 +15,9 @@ import re
 import posixpath
 import urlparse
 import urllib
-if rclchm_catenate:
-    import subprocess
+import subprocess

+import rclconfig
 import rclexecm

 try:
@ -196,7 +191,10 @@ class rclCHM:
        self.contents = []
        self.chm = chm.CHMFile()
        self.em = em
-        if rclchm_catenate:
+        cf = rclconfig.RclConfig()
+        self.catenate = cf.getConfParam("chmcatenate")
+        self.catenate = int(self.catenate) if self.catenate else False
+        if self.catenate:
            self.em.setmimetype("text/plain")
        else:
            self.em.setmimetype(rclchm_html_mtype)
@ -314,7 +312,8 @@ class rclCHM:
        return self.extractone(params["ipath:"])
        
    def getnext(self, params):
-        if rclchm_catenate:
+        if self.catenate:
+            self.em.setmimetype("text/plain")
            alltxt = self.dumpall()
            if alltxt:
                return (True, alltxt, "", rclexecm.RclExecM.eofnext)
--- a/src/filters/rclepub
+++ b/src/filters/rclepub
@ -1,5 +1,5 @@
 #!/usr/bin/env python2
-"""Extract Html content from an EPUB file (.chm)"""
+"""Extract Html content from an EPUB file (.epub)"""
 from __future__ import print_function

 rclepub_html_mtype = "text/html"
@ -7,8 +7,10 @@ rclepub_html_mtype = "text/html"
 import sys
 import os
 import re
+import subprocess

 import rclexecm
+import rclconfig

 try:
    import epub
@ -25,6 +27,9 @@ class rclEPUB:
        self.currentindex = 0
        self.em = em
        self.em.setmimetype(rclepub_html_mtype)
+        cf = rclconfig.RclConfig()
+        self.catenate = cf.getConfParam("epubcatenate")
+        self.catenate = int(self.catenate) if self.catenate else False

    def _selfdoc(self):
        meta = self.book.opf.metadata
@ -72,6 +77,25 @@ class rclEPUB:
            self.em.rclog("extractone: failed: [%s]" % err)
            return (False, "", id, iseof)

+    def dumpall(self):
+        self.em.setmimetype('text/plain')
+        alltxt=""
+
+        for idx in range(len(self.contents)):
+            ret,doc,path,iseof = self.extractone(self.contents[idx])
+            if not ret:
+                continue
+            # Feed doc to lynx
+            process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
+                                        "-display_charset=utf8",
+                                        "-force_html"], 
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE
+                                       )
+            txt,err = process.communicate(doc)
+            alltxt += txt
+        return alltxt
+
    def openfile(self, params):
        """Open the EPUB file, create a contents array"""
        self.currentindex = -1
@ -91,6 +115,12 @@ class rclEPUB:
        return self.extractone(params["ipath:"])

    def getnext(self, params):
+        if self.catenate:
+            alltxt = self.dumpall()
+            if alltxt:
+                return (True, alltxt, "", rclexecm.RclExecM.eofnext)
+            else:
+                return (False, "", "", rclexecm.RclExecM.eofnow)

        if self.currentindex == -1:
            self.currentindex = 0