recoll/src/filters/rclchm

#!/usr/bin/env python3
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""

from __future__ import print_function

rclchm_html_mtype = "text/html"

import sys
import os
import re
import posixpath
PY3 = sys.version > '3'
if PY3:
    from urllib.parse import unquote as urllib_unquote
    from urllib.parse import urlparse as urlparse_urlparse
    from html.parser import HTMLParser
    chmpackname = 'pychm3.egg'
else:
    from urlparse import urlparse as urlparse_urlparse
    from urllib import unquote as urllib_unquote
    from HTMLParser import HTMLParser
    chmpackname = 'pychm2.egg'

import subprocess

import rclconfig
import rclexecm

# pychm has no official port to Python3, hence no package in the
# standard place.  Recoll bundles a python3 port which we install out
# of the standard python places. Look for it:
#  sys.path[0] is for MSW, where we install the egg in the filters
#  directory? TBD for now
# Import the pychm bindings. Two locations are tried in turn; if neither
# works the filter cannot run at all.
try:
    # First try the system version if any
    from chm import chm,chmlib
except:
    try:
        # Fall back to the copy shipped under the 'recollchm' package name
        from recollchm import chm,chmlib
    except:
        # Report the missing helper on stdout and exit with status 1.
        # NOTE(review): the message is presumably parsed by the recoll
        # indexer to flag the missing 'python:chm' helper -- do not reword.
        print("RECFILTERROR HELPERNOTFOUND python:chm")
        sys.exit(1);

# Small helper routines
def getfile(chmfile, path):
    """Return the content of the internal file named by path.

    chmfile is the opened chm object and path the internal path, which
    must be a bytes object (an Exception is raised otherwise).

    An empty string is returned when the path does not resolve, or when
    the object can't be retrieved. Only the latter case is reported on
    stderr.
    """
    if type(path) is not bytes:
        raise Exception("Chm:getfile: must be called with path as bytes")

    status, unitinfo = chmfile.ResolveObject(path)
    if status == chmlib.CHM_RESOLVE_SUCCESS:
        size, content = chmfile.RetrieveObject(unitinfo)
        if size:
            return content
        print("RetrieveObject failed: %s" % path, file=sys.stderr)
    # Unresolved paths are expected (dangling links), so they stay silent
    return ""

def peekfile(chmfile, path, charset):
    """Tell if path names an object which exists inside the chm.

    A unicode path is first encoded with charset, a bytes one is used
    unchanged. Returns True if the chm object resolves it, else False.
    """
    bpath = path.encode(charset) if type(path) is type(u'') else path
    status, _unitinfo = chmfile.ResolveObject(bpath)
    return status == chmlib.CHM_RESOLVE_SUCCESS


# CHM Topics tree handler

class ChmTopicsParser(HTMLParser):
    """Parse the chm's Topic file which is basically
    a listing of internal nodes (html files mostly). Build a list of
    all nodes (parent.contents), which will then be used to walk and index
    the chm.

    Most nodes in the Topic file look like the following:
    <LI> <OBJECT type="text/sitemap">
           <param name="Name" value="Global Module Index">
           <param name="Local" value="modindex.html">
          </OBJECT>

    Maybe we should filter out non "text/sitemap" Objects, and maybe there are
    things of interest whose name is not Local, but for now, we just take
    all values for parameters named "Local" (with some filtering/massaging),
    until proven wrong
    """
    def __init__(self, rclchm):
        """rclchm is the owning worker object. We use its 'em', 'chm' and
        'charset' attributes and append the node paths (as bytes) to its
        'contents' list."""
        HTMLParser.__init__(self)
        self.em = rclchm.em
        self.rclchm = rclchm

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback. Record the file referenced by a
        <param name="Local" value="..."> tag, ignore everything else."""
        # If this is a param tag with name Local, we're interested in
        # the value which lists a file ref. Discard those with #
        # in them (references inside files)
        # Sometimes it seems that refs are like Vendor:filename::path,
        # we only keep the path, and only if the file matches

        if tag != 'param':
            return

        name = ''
        value = ''
        for (nm, val) in attrs:
            if nm == 'name':
                name = val
            if nm == 'value':
                value = val

        # HTMLParser reports an attribute without a value (<param value>)
        # as None: test for falsiness, not just for the empty string, or
        # the unquote call below would raise.
        if name != 'Local' or not value:
            return
        # value may be url-encoded. Decode it. If there are no % in there, will
        # do nothing
        value = urllib_unquote(value)

        localpath = ""
        ll = value.split(":")
        if len(ll) == 1:
            localpath = value
        elif len(ll) == 4 and ll[-1] and ll[-3]:
            # Vendor:filename::path form.
            # We used to test against the simple file name, but this does
            # not work if the file is renamed. Just check that the internal
            # path resolves. Old: if ll[-3] == self.rclchm.sfn:
            localpath = ll[-1]
            if not peekfile(self.rclchm.chm, localpath, self.rclchm.charset):
                localpath = ""

        if not localpath or "#" in localpath:
            return
        if localpath[0] != '/':
            localpath = "/" + localpath
        try:
            bpath = localpath.encode(self.rclchm.charset)
        except UnicodeError:
            # A name which can't be expressed in the chm charset: skip
            # this node instead of aborting the whole topics parse.
            self.em.rclog("ChmTopicsParser: can't encode path, skipped")
            return
        self.rclchm.contents.append(bpath)


# Used when there is no Topics node. Walk the links tree
class ChmWalker(HTMLParser):
    """Links tree walker. This recursively follows all internal links
    found in the tree from the top node given as input, and augments
    the contents list."""

    def __init__(self, rclchm, path, contents):
        """rclchm is the owning worker object (we use its 'chm', 'charset',
        'em' and fixencoding()). path is the internal path of the node
        which will be fed to this walker, as bytes or unicode. contents is
        the shared list of visited paths (bytes): the normalized path is
        appended to it at once so that the node is not visited twice."""
        HTMLParser.__init__(self)
        self.rclchm = rclchm
        self.chm = rclchm.chm
        self.contents = contents
        if type(path) == type(u''):
            path = path.encode(self.rclchm.charset)
        self.path = posixpath.normpath(path)
        self.dir = posixpath.dirname(self.path)
        contents.append(self.path)

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback. For each <a href=...> which designates an
        internal file not seen yet, extract the file and walk it."""
        if tag != 'a':
            return

        href = ''
        for (nm, val) in attrs:
            if nm == 'href':
                href = val
        # HTMLParser yields None for a valueless attribute (<a href>),
        # which urlparse would turn into bytes fields. Nothing to follow.
        if not href:
            return

        path = ""
        res = urlparse_urlparse(href)
        # Note the call: comparing the bound method 'lower' to a string
        # is always False and made the ms-its case unreachable.
        if not res.scheme or res.scheme.lower() == "ms-its":
            path = res.path
            lpath = path.split(':')
            if len(lpath) == 3:
                # ms-its:somefile.chm::/some/path/file.htm. Only keep the
                # internal path, and only if it resolves in this file.
                path = lpath[2]
                if not peekfile(self.chm, path, self.rclchm.charset):
                    path = ""
            elif len(lpath) == 1:
                path = lpath[0]
            else:
                path = ""

        if not path:
            return

        try:
            bpath = path.encode(self.rclchm.charset)
        except UnicodeError:
            # Can't be expressed in the chm charset: skip this link but
            # go on with the rest of the page.
            return
        if path[0] == "/":
            npath = posixpath.normpath(bpath)
        else:
            # Relative link: interpret it from the current node directory
            npath = posixpath.normpath(posixpath.join(self.dir, bpath))
        if npath in self.contents:
            return

        text = getfile(self.chm, npath)
        if not text:
            return
        try:
            # The new walker registers npath in contents when created
            newwalker = ChmWalker(self.rclchm, npath, self.contents)
            t, c = self.rclchm.fixencoding(text)
            newwalker.feed(t)
        except Exception:
            # Best effort: a bad page must not stop the walk. Don't catch
            # KeyboardInterrupt/SystemExit though.
            self.rclchm.em.rclog("ChmWalker: error while walking a node")

class rclCHM:
    """RclExecM slave worker for extracting all files from an Msoft chm
    file. We first extract the list of internal nodes, and then return them
    one by one. The ipath is the node path"""

    def __init__(self, em):
        """em is the RclExecM protocol object. We use it for logging and
        for setting the MIME type of the returned documents."""
        self.contents = []
        # Also set by openfile(). Initialized here so that the other
        # methods don't fail on a missing attribute if called early.
        self.currentindex = -1
        self.charset = 'cp1252'
        self.chm = chm.CHMFile()
        self.em = em
        cf = rclconfig.RclConfig()
        # If chmcatenate is set, the whole file is returned as a single
        # text document instead of one html document per node.
        self.catenate = cf.getConfParam("chmcatenate")
        self.catenate = int(self.catenate) if self.catenate else False
        if self.catenate:
            self.em.setmimetype("text/plain")
        else:
            self.em.setmimetype(rclchm_html_mtype)
        expr = b'''(<meta *http-equiv *= *"content-type".*charset *= *)((us-)?ascii)( *" *>)'''
        self.asciito1252re = re.compile(expr, re.IGNORECASE)
        expr = b'''<meta *http-equiv *= *"content-type".*charset *= *([a-z0-9-]+) *" *>'''
        self.findcharsetre = re.compile(expr, re.IGNORECASE)

    def extractone(self, path):
        """Extract one path-named internal file from the chm file.

        Returns an (ok, data, ipath, eofstatus) tuple. On success data is
        the html document (bytes) with an added "rclaptg" meta tag."""

        if type(path) == type(u''):
            path = path.encode(self.charset)
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.contents) -1:
            iseof = rclexecm.RclExecM.eofnext

        res, ui = self.chm.ResolveObject(path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            return (False, "", path, iseof)
        # RetrieveObject() returns len,value
        res, doc = self.chm.RetrieveObject(ui)
        if res > 0:
            # Match the complete closing tag: without the '>' in the
            # pattern, the output had a stray '>' after </head> and
            # </header> tags were also altered.
            doc = re.sub(br'''</[hH][eE][aA][dD]\s*>''',
                         b'''<meta name="rclaptg" content="chm"></head>''', doc)
            self.em.setmimetype(rclchm_html_mtype)
            return (True, doc, path, iseof)
        return (False, "", path, iseof)

    def dumpall(self):
        """Return the text of all the nodes, catenated (bytes). Each node
        is converted from html to text by running it through lynx."""
        # communicate() returns bytes: gather the pieces in a list and
        # join them as bytes (adding them to a str fails with Python 3).
        chunks = []
        for pth in self.contents:
            ret, doc, path, iseof = self.extractone(pth)
            if not ret:
                continue
            # Feed doc to lynx
            process = subprocess.Popen(["lynx", "-stdin", "-dump", "-nolist",
                                        "-display_charset=utf8",
                                        "-force_html"],
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE
                                       )
            txt, err = process.communicate(doc)
            chunks.append(txt)
        return b"".join(chunks)

    def fixencoding(self, text):
        """Fix encoding for supposedly html document. We do 2 things here:
            - Change any 'ASCII' charset decl to windows-1252, because
              documents declared as ASCII often contain 8 bit characters.
            - Decode the string to unicode if it's originally an str because
              that's what Python HTMLParser actually expects even if it does not
              really say so. See http://bugs.python.org/issue3932.

        Returns (text, charset). charset is the declared charset name, or
        'cp1252' if there is no declaration, if the declared name is
        unknown to Python, or if the input was already unicode (in which
        case the text is returned unchanged).
        """

        # Memo. Charset decl example. Maybe we should also process the
        # HTML5 charset tag ?
        #<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=US-ASCII">

        charset = 'cp1252'
        if type(text) == type(b''):
            # Fix an ascii charset decl to windows-1252. The replacement
            # must be a raw literal: in a plain one, \1 and \4 are octal
            # escapes (bytes 0x01 and 0x04), not group references.
            text = self.asciito1252re.sub(br'''\1windows-1252\4''', text, 1)
            # Convert to unicode according to charset decl
            m = self.findcharsetre.search(text)
            if m:
                charset = m.group(1).decode('cp1252')
            try:
                text = text.decode(charset, errors='replace')
            except LookupError:
                # Charset name unknown to Python: use the default
                charset = 'cp1252'
                text = text.decode(charset, errors='replace')
        return text, charset

    def openfile(self, params):
        """Open the chm file and build the contents list by extracting and
        parsing the Topics object, or, if there is none, by walking the
        links from the home page. Returns True for success."""

        self.currentindex = -1
        self.contents = []

        filename = params["filename:"]
        if not self.chm.LoadCHM(filename):
            self.em.rclog("LoadCHM failed")
            return False

        self.topics = self.chm.GetTopicsTree()
        self.charset = 'cp1252'
        if self.topics:
            # Parse Topics file and extract list of internal nodes
            tp = ChmTopicsParser(self)
            text,self.charset = self.fixencoding(self.topics)
            tp.feed(text)
            tp.close()
        else:
            # No topics. If there is a home, let's try to walk the tree
            if not self.chm.home:
                self.em.rclog("No topics and no home")
                return False
            home = self.chm.home
            if type(home) == type(u''):
                home = home.encode(self.charset)
            if not home.startswith(b'/'):
                home = b"/" + home
            text = getfile(self.chm, home)
            if not text:
                self.em.rclog("No topics and no home content")
                return False
            # Use the slash-prefixed path, the same one which was just
            # extracted: the walker derives its base directory from it.
            walker = ChmWalker(self, home, self.contents)
            text,self.charset = self.fixencoding(text)
            walker.feed(text)
            walker.close()

        # Remove duplicates, keeping the order of first appearance
        seen = set()
        ordered = []
        for pth in self.contents:
            if pth not in seen:
                seen.add(pth)
                ordered.append(pth)
        self.contents = ordered
        return True

    def getipath(self, params):
        """Return the document for the node named by params["ipath:"]"""
        return self.extractone(params["ipath:"])

    def getnext(self, params):
        """Return the next document as an (ok, data, ipath, eofstatus)
        tuple. In catenate mode, there is a single text document. Else
        the first call returns an empty document standing for the file
        itself, and the next ones return the nodes in contents order."""
        if self.catenate:
            alltxt = self.dumpall()
            # Set after dumpall(): extractone() switches the mime type
            # to html for each extracted node.
            self.em.setmimetype("text/plain")
            if alltxt:
                return (True, alltxt, "", rclexecm.RclExecM.eofnext)
            else:
                return (False, "", "", rclexecm.RclExecM.eofnow)

        if self.currentindex == -1:
            # Return "self" doc
            self.currentindex = 0
            self.em.setmimetype('text/plain')
            if len(self.contents) == 0:
                eof = rclexecm.RclExecM.eofnext
            else:
                eof = rclexecm.RclExecM.noteof
            return (True, "", "", eof)

        if self.currentindex >= len(self.contents):
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret= self.extractone(self.contents[self.currentindex])
            self.currentindex += 1
            return ret

# Script entry point: create the RclExecM protocol object, bind a rclCHM
# worker to it, and hand both to rclexecm.main().
# NOTE(review): main() presumably runs the request loop with the indexer --
# confirm in rclexecm.
proto = rclexecm.RclExecM()
extract = rclCHM(proto)
rclexecm.main(proto, extract)