recoll/src/filters/rclinfo

#!/usr/bin/env python

# Read a file in GNU info format and output its nodes as subdocs,
# interfacing with recoll execm

from __future__ import print_function

import rclexecm
import sys
import os.path
import subprocess

# Notes about the html documents we return: info files are normally
# ASCII, so we set no charset and let it be provided by the
# environment if necessary.
#
# Some info source docs contain charset info like:
# @documentencoding ISO-2022-JP
# but this seems to be absent from the info command output.

# RclExecm interface
class InfoExtractor:
    """Handler object for rclexecm: serves the nodes of a GNU info file.

    openfile() runs the ``info`` command on the file and splits its output
    into a list of (title, text) pairs, stored in ``self.contents``. Each
    entry is returned as a small HTML document; the decimal index of the
    entry in the list is used as the ipath value.
    """

    def __init__(self, em):
        """Remember the execm protocol object *em* and reset the state."""
        self.file = ""
        self.contents = []
        self.em = em
        # -1 means that the "self" document was not returned yet. Set here
        # too so that extractone() can't hit a missing attribute if it is
        # called before openfile().
        self.currentindex = -1

    @staticmethod
    def _makehtml(title, body):
        """Return the HTML document (bytes) for an escaped title and body.

        Both arguments must be bytes and already HTML-escaped.
        """
        # strange whitespace to avoid changing the module tests (same as old)
        return b''.join((
            b'\n<html>\n  <head>\n      <title>', title, b'</title>\n',
            b'      <meta name="rclaptg" content="gnuinfo">\n',
            b'   </head>\n   <body>\n',
            b'   <pre style="white-space: pre-wrap">\n   ',
            body, b'\n   </pre></body>\n</html>\n'))

    def extractone(self, index):
        """Return the document for entry *index* of ``self.contents``.

        The result is a tuple (ok, data, ipath, eof). For an out of range
        index (negative values included), it is (False, "", "", True).
        """
        if index < 0 or index >= len(self.contents):
            return (False, "", "", True)

        nodename, docdata = self.contents[index]
        docdata = self._makehtml(self.em.htmlescape(nodename),
                                 self.em.htmlescape(docdata))

        # The eof indication is computed from the sequential read position,
        # not from the requested index.
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.contents) - 1:
            iseof = rclexecm.RclExecM.eofnext
        self.em.setmimetype("text/html")
        return (True, docdata, str(index), iseof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Run ``info`` on params["filename:"] and split its output.

        Returns False if the path is not a regular file, True otherwise.
        If the info command can't be started, a HELPERNOTFOUND error line
        is printed and the process exits with status 1.
        """
        self.file = params["filename:"]

        if not os.path.isfile(self.file):
            self.em.rclog("Openfile: %s is not a file" % self.file)
            return False

        # Argument list, no shell: the file name is passed verbatim
        # whatever characters it contains.
        cmd = [b'info', b'--subnodes', b'-o', b'-', b'-f', self.file]
        with open(os.devnull, 'w') as nullstream:
            try:
                proc = subprocess.Popen(cmd, stderr=nullstream,
                                        stdout=subprocess.PIPE)
            except Exception as e:
                # Consider this as permanently fatal.
                self.em.rclog("Openfile: exec info: %s" % str(e))
                print("RECFILTERROR HELPERNOTFOUND info")
                sys.exit(1)

        self.currentindex = -1

        try:
            self.contents = InfoSimpleSplitter().splitinfo(self.file,
                                                           proc.stdout)
        finally:
            # Release the pipe and reap the child in all cases
            proc.stdout.close()
            proc.wait()

        #self.em.rclog("openfile: Entry count: %d"%(len(self.contents)))
        return True

    # Extract specific node
    def getipath(self, params):
        """Return the document designated by params["ipath:"].

        A missing or non-numeric ipath yields (False, "", "", True).
        """
        try:
            index = int(params["ipath:"])
        except (KeyError, TypeError, ValueError):
            return (False, "", "", True)
        return self.extractone(index)

    # Extract next in list
    def getnext(self, params):
        """Return the next document in sequence.

        The first call after openfile() returns the "self" document (empty
        data, text/plain), the following ones return the successive
        entries of ``self.contents``.
        """
        if self.currentindex == -1:
            # Return "self" doc
            self.currentindex = 0
            self.em.setmimetype('text/plain')
            if not self.contents:
                eof = rclexecm.RclExecM.eofnext
            else:
                eof = rclexecm.RclExecM.noteof
            return (True, "", "", eof)

        if self.currentindex >= len(self.contents):
            self.em.rclog("getnext: EOF hit")
            return (False, "", "", rclexecm.RclExecM.eofnow)

        ret = self.extractone(self.currentindex)
        self.currentindex += 1
        return ret

# Info file splitter
class InfoSimpleSplitter:
    """Split the output of ``info --subnodes`` into titled nodes."""

    def _parseheader(self, header):
        """Parse a 'File: f,  Node: n,  Up: u' line into a dict.

        *header* is the line (bytes) without its terminator. Keys and
        values are stripped of surrounding spaces. Raises ValueError if a
        comma-separated element does not have exactly one colon.
        """
        fields = {}
        for pair in header.split(b','):
            name, value = pair.split(b':')
            fields[name.strip(b' ')] = value.strip(b' ')
        return fields

    def splitinfo(self, filename, fin):
        """Return a list of (title, text) pairs, one per node.

        filename -- path of the info file. Its base name is used in titles
                    and messages until a header line supplies a File value.
        fin      -- iterable yielding the lines (bytes) of the info output.

        A node title is the file name followed by the chain of "Up" nodes
        leading to the node, separated by ' / '. Nodes for which the chain
        can't be computed (unknown Up node, or more than 50 levels) are
        reported on stderr and left out of the result. Data found before
        the first node header is discarded.
        """
        gotblankline = True
        index = 0
        listout = []
        node_dict = {}
        # Lines of the current node, joined when the node is complete
        chunks = []
        infofile = os.path.basename(filename)
        if not isinstance(infofile, bytes):
            # Everything else is bytes, avoid mixing types in the titles
            infofile = infofile.encode('utf-8', 'replace')
        nodename = b'Unknown'

        for line in fin:

            # Top of node ?
            # It sometimes happens that info --subnodes produces a Node line
            # beginning with spaces (it's a bug probably, only seen it once)
            # Maybe we'd actually be better off directly interpreting the
            # info files
            if gotblankline and line.lstrip(b' ').startswith(b'File: '):
                header = line.rstrip(b'\n\r')
                try:
                    fields = self._parseheader(header)
                except ValueError as err:
                    # Not a real header: keep it as ordinary text of the
                    # current node (unchanged, with its line terminator).
                    print("rclinfo: bad line in %s: [%s] %s\n" % \
                          (infofile, header, err), file=sys.stderr)
                    fields = None

                if fields is not None:
                    prevnodename = nodename
                    # Nodes without a name are designated by their rank
                    nodename = fields.get(b'Node', str(index).encode('ascii'))
                    up = fields.get(b'Up', b'Top')
                    infofile = fields.get(b'File', infofile)

                    if nodename in node_dict:
                        print("Info file %s Dup node: %s" % \
                              (filename, nodename), file=sys.stderr)
                    node_dict[nodename] = up

                    if index != 0:
                        listout.append((prevnodename, b''.join(chunks)))
                    chunks = []
                    index += 1
                    # The header is stored without its line terminator
                    # (same output as the previous versions).
                    line = header

            gotblankline = line.rstrip(b'\n\r') == b''
            chunks.append(line)

        # File done, add last dangling node
        node = b''.join(chunks)
        if node != b'':
            listout.append((nodename, node))

        # Compute node paths (concatenate "Up" values), to be used
        # as page titles. Nodes with a bad tree are skipped.
        listout1 = []
        for nodename, node in listout:
            title = b''
            loop = 0
            error = False
            while nodename != b'Top':
                title = nodename + b' / ' + title
                if nodename not in node_dict:
                    # Must go to stderr: stdout carries the execm protocol
                    print(
           "Infofile: node's Up does not exist: file %s, path %s, up [%s]" % \
                        (infofile, title, nodename), file=sys.stderr)
                    error = True
                    break
                nodename = node_dict[nodename]
                loop += 1
                if loop > 50:
                    print("Infofile: bad tree (looping) %s" % infofile, \
                          file=sys.stderr)
                    error = True
                    break

            if error:
                continue

            if title == b'':
                title = infofile
            else:
                # A non-empty path always ends with the 3 bytes separator:
                # remove exactly that, not any trailing ' ' or '/'.
                title = infofile + b' / ' + title[:-3]
            listout1.append((title, node))

        return listout1


##### Main program: hand the protocol object and the info handler over
##### to rclexecm.main()
proto = rclexecm.RclExecM()
extract = InfoExtractor(em=proto)
rclexecm.main(proto, extract)