From 413c710f34c5777199e182bc10844559b7ab1ff4 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 5 Jan 2018 17:56:19 +0100 Subject: [PATCH] rclchm, rclepub: define config variables chmcatenate and epubcatenate to specify that the files should be indexed as a whole instead of as individual chapters --- src/filters/rclchm | 17 ++++++++--------- src/filters/rclepub | 32 +++++++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/filters/rclchm b/src/filters/rclchm index e3046d39..617f13f4 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -7,11 +7,6 @@ from __future__ import print_function # Note: this is not converted to python3, libchm does not have a # python3 wrapper at this point (2015-11) -# Do we return individual chapters as html pages or concatenate everything? -rclchm_catenate = 0 -# Use special html type to allow for mimeconf/mimeview Open magic, -# Or go the regular html way with text/html -#rclchm_html_mtype = "text/x-chm-html" rclchm_html_mtype = "text/html" import sys @@ -20,9 +15,9 @@ import re import posixpath import urlparse import urllib -if rclchm_catenate: - import subprocess +import subprocess +import rclconfig import rclexecm try: @@ -196,7 +191,10 @@ class rclCHM: self.contents = [] self.chm = chm.CHMFile() self.em = em - if rclchm_catenate: + cf = rclconfig.RclConfig() + self.catenate = cf.getConfParam("chmcatenate") + self.catenate = int(self.catenate) if self.catenate else False + if self.catenate: self.em.setmimetype("text/plain") else: self.em.setmimetype(rclchm_html_mtype) @@ -314,7 +312,8 @@ class rclCHM: return self.extractone(params["ipath:"]) def getnext(self, params): - if rclchm_catenate: + if self.catenate: + self.em.setmimetype("text/plain") alltxt = self.dumpall() if alltxt: return (True, alltxt, "", rclexecm.RclExecM.eofnext) diff --git a/src/filters/rclepub b/src/filters/rclepub index c06c7cb3..ae28defd 100755 --- a/src/filters/rclepub +++ b/src/filters/rclepub @@ -1,5 +1,5 @@ #!/usr/bin/env python2 -"""Extract Html content from an EPUB file (.chm)""" +"""Extract Html content from an EPUB file (.epub)""" from __future__ import print_function rclepub_html_mtype = "text/html" @@ -7,8 +7,10 @@ rclepub_html_mtype = "text/html" import sys import os import re +import subprocess import rclexecm +import rclconfig try: import epub @@ -25,6 +27,9 @@ class rclEPUB: self.currentindex = 0 self.em = em self.em.setmimetype(rclepub_html_mtype) + cf = rclconfig.RclConfig() + self.catenate = cf.getConfParam("epubcatenate") + self.catenate = int(self.catenate) if self.catenate else False def _selfdoc(self): meta = self.book.opf.metadata @@ -72,6 +77,25 @@ class rclEPUB: self.em.rclog("extractone: failed: [%s]" % err) return (False, "", id, iseof) + def dumpall(self): + self.em.setmimetype('text/plain') + alltxt="" + + for idx in range(len(self.contents)): + ret,doc,path,iseof = self.extractone(self.contents[idx]) + if not ret: + continue + # Feed doc to lynx + process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist", + "-display_charset=utf8", + "-force_html"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE + ) + txt,err = process.communicate(doc) + alltxt += txt + return alltxt + def openfile(self, params): """Open the EPUB file, create a contents array""" self.currentindex = -1 @@ -91,6 +115,12 @@ class rclEPUB: return self.extractone(params["ipath:"]) def getnext(self, params): + if self.catenate: + alltxt = self.dumpall() + if alltxt: + return (True, alltxt, "", rclexecm.RclExecM.eofnext) + else: + return (False, "", "", rclexecm.RclExecM.eofnow) if self.currentindex == -1: self.currentindex = 0