rclchm, rclepub: define config variables chmcatenate and epubcatenate to specify that the files should be indexed as a whole instead of as individual chapters

This commit is contained in:
Jean-Francois Dockes 2018-01-05 17:56:19 +01:00
parent 453a3bb5c6
commit 413c710f34
2 changed files with 39 additions and 10 deletions

View File

@ -7,11 +7,6 @@ from __future__ import print_function
# Note: this is not converted to python3, libchm does not have a
# python3 wrapper at this point (2015-11)
# Do we return individual chapters as html pages or concatenate everything?
rclchm_catenate = 0
# Use special html type to allow for mimeconf/mimeview Open magic,
# Or go the regular html way with text/html
#rclchm_html_mtype = "text/x-chm-html"
rclchm_html_mtype = "text/html"
import sys
@ -20,9 +15,9 @@ import re
import posixpath
import urlparse
import urllib
if rclchm_catenate:
import subprocess
import subprocess
import rclconfig
import rclexecm
try:
@ -196,7 +191,10 @@ class rclCHM:
self.contents = []
self.chm = chm.CHMFile()
self.em = em
if rclchm_catenate:
cf = rclconfig.RclConfig()
self.catenate = cf.getConfParam("chmcatenate")
self.catenate = int(self.catenate) if self.catenate else False
if self.catenate:
self.em.setmimetype("text/plain")
else:
self.em.setmimetype(rclchm_html_mtype)
@ -314,7 +312,8 @@ class rclCHM:
return self.extractone(params["ipath:"])
def getnext(self, params):
if rclchm_catenate:
if self.catenate:
self.em.setmimetype("text/plain")
alltxt = self.dumpall()
if alltxt:
return (True, alltxt, "", rclexecm.RclExecM.eofnext)

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python2
"""Extract Html content from an EPUB file (.chm)"""
"""Extract Html content from an EPUB file (.epub)"""
from __future__ import print_function
rclepub_html_mtype = "text/html"
@ -7,8 +7,10 @@ rclepub_html_mtype = "text/html"
import sys
import os
import re
import subprocess
import rclexecm
import rclconfig
try:
import epub
@ -25,6 +27,9 @@ class rclEPUB:
self.currentindex = 0
self.em = em
self.em.setmimetype(rclepub_html_mtype)
cf = rclconfig.RclConfig()
self.catenate = cf.getConfParam("epubcatenate")
self.catenate = int(self.catenate) if self.catenate else False
def _selfdoc(self):
meta = self.book.opf.metadata
@ -72,6 +77,25 @@ class rclEPUB:
self.em.rclog("extractone: failed: [%s]" % err)
return (False, "", id, iseof)
def dumpall(self):
    """Return the text of all entries in self.contents as one string.

    Each entry is extracted with extractone(); entries whose extraction
    reports failure are skipped. The extracted document is piped through
    an external "lynx -dump" process and the text it prints on stdout is
    collected. As a side effect the output MIME type is set to
    text/plain.

    Returns the concatenation of the lynx outputs, in self.contents
    order, or an empty string when nothing could be extracted.
    """
    self.em.setmimetype('text/plain')
    # The command line is the same for every entry: build it once.
    lynx_cmd = ["lynx", "-stdin", "-dump", "-nolist",
                "-display_charset=utf8",
                "-force_html"]
    chunks = []
    for item in self.contents:
        ret, doc, path, iseof = self.extractone(item)
        if not ret:
            continue
        # Feed doc to lynx
        process = subprocess.Popen(lynx_cmd,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half carries data.
        txt, _ = process.communicate(doc)
        chunks.append(txt)
    # Single join instead of repeated += to avoid quadratic copying.
    return "".join(chunks)
def openfile(self, params):
"""Open the EPUB file, create a contents array"""
self.currentindex = -1
@ -91,6 +115,12 @@ class rclEPUB:
return self.extractone(params["ipath:"])
def getnext(self, params):
if self.catenate:
alltxt = self.dumpall()
if alltxt:
return (True, alltxt, "", rclexecm.RclExecM.eofnext)
else:
return (False, "", "", rclexecm.RclExecM.eofnow)
if self.currentindex == -1:
self.currentindex = 0