#!/usr/bin/env python # Read a file in GNU info format and output its nodes as subdocs, # interfacing with recoll execm from __future__ import print_function import rclexecm import sys import os.path import subprocess # Prototype for the html document we're returning. Info files are # normally ascii. Set no charset, and let it be provided by the # environment if necessary # # Some info source docs contain charset info like: # @documentencoding ISO-2022-JP # But this seems to be absent from outputs. # RclExecm interface class InfoExtractor: def __init__(self, em): self.file = "" self.contents = [] self.em = em def extractone(self, index): if index >= len(self.contents): return(False, "", "", True) nodename, docdata = self.contents[index] nodename = self.em.htmlescape(nodename) docdata = self.em.htmlescape(docdata) # strange whitespace to avoid changing the module tests (same as old) docdata = b'\n\n \n ' + nodename + \ b'\n' + \ ' \n' + \ b' \n \n' + \ b'
\n   ' + \
                  docdata + b'\n   
\n\n' iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.contents) -1: iseof = rclexecm.RclExecM.eofnext self.em.setmimetype("text/html") return (True, docdata, str(index), iseof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): self.file = params["filename:"] if not os.path.isfile(self.file): self.em.rclog("Openfile: %s is not a file" % self.file) return False cmd = b'info --subnodes -o - -f ' + self.file nullstream = open("/dev/null", 'w') try: infostream = subprocess.Popen(cmd, shell=True, bufsize=1, stderr=nullstream, stdout=subprocess.PIPE).stdout except Exception as e: # Consider this as permanently fatal. self.em.rclog("Openfile: exec info: %s" % str(e)) print("RECFILTERROR HELPERNOTFOUND info") sys.exit(1); self.currentindex = -1 self.contents = InfoSimpleSplitter().splitinfo(self.file, infostream) #self.em.rclog("openfile: Entry count: %d"%(len(self.contents))) return True # Extract specific node def getipath(self, params): try: index = int(params["ipath:"]) except: return (False, "", "", True) return self.extractone(index) # Extract next in list def getnext(self, params): if self.currentindex == -1: # Return "self" doc self.currentindex = 0 self.em.setmimetype('text/plain') if len(self.contents) == 0: eof = rclexecm.RclExecM.eofnext else: eof = rclexecm.RclExecM.noteof return (True, "", "", eof) if self.currentindex >= len(self.contents): self.em.rclog("getnext: EOF hit") return (False, "", "", rclexecm.RclExecM.eofnow) else: ret= self.extractone(self.currentindex) self.currentindex += 1 return ret # Info file splitter class InfoSimpleSplitter: def splitinfo(self, filename, fin): gotblankline = 1 index = 0 listout = [] node_dict = {} node = b'' infofile = os.path.basename(filename) nodename = b'Unknown' for line in fin: # Top of node ? # It sometimes happens that info --subnodes produces a Node line # beginning with spaces (it's a bug probably, only seen it once) # Maybe we'd actually be better off directly interpreting the # info files if gotblankline and line.lstrip(b' ').startswith(b'File: '): prevnodename = nodename line = line.rstrip(b'\n\r') pairs = line.split(b',') up = b'Top' nodename = str(index) try: for pair in pairs: name, value = pair.split(b':') name = name.strip(b' ') value = value.strip(b' ') if name == b'Node': nodename = value if name == b'Up': up = value if name == b'File': infofile = value except Exception as err: print("rclinfo: bad line in %s: [%s] %s\n" % \ (infofile, line, err), file = sys.stderr) nodename = prevnodename node += line continue if nodename in node_dict: print("Info file %s Dup node: %s" % (filename, nodename), \ file=sys.stderr) node_dict[nodename] = up if index != 0: listout.append((prevnodename, node)) node = b'' index += 1 if line.rstrip(b'\n\r') == b'': gotblankline = 1 else: gotblankline = 0 node += line # File done, add last dangling node if node != b'': listout.append((nodename, node)) # Compute node paths (concatenate "Up" values), to be used # as page titles. It's unfortunate that this will crash if # the info file tree is bad listout1 = [] for nodename, node in listout: title = b'' loop = 0 error = 0 while nodename != b'Top': title = nodename + b' / ' + title if nodename in node_dict: nodename = node_dict[nodename] else: print( "Infofile: node's Up does not exist: file %s, path %s, up [%s]" % \ (infofile, title, nodename), sys.stderr) error = 1 break loop += 1 if loop > 50: print("Infofile: bad tree (looping) %s" % infofile, \ file = sys.stderr) error = 1 break if error: continue if title == b'': title = infofile else: title = infofile + b' / ' + title title = title.rstrip(b' / ') listout1.append((title, node)) return listout1 ##### Main program: either talk to the parent or execute test loop proto = rclexecm.RclExecM() extract = InfoExtractor(proto) rclexecm.main(proto, extract)