diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py index 13970048..ffa68c53 100644 --- a/src/filters/rclexec1.py +++ b/src/filters/rclexec1.py @@ -34,12 +34,14 @@ import rclexecm # the object which we receive as a parameter, which in turn is defined # in the actual executable filter (e.g. rcldoc.py) class Executor: + opt_ignxval = 1 + def __init__(self, em, flt): self.em = em self.flt = flt self.currentindex = 0 - def runCmd(self, cmd, filename, postproc): + def runCmd(self, cmd, filename, postproc, opt): ''' Substitute parameters and execute command, process output with the specific postprocessor and return the complete text. We expect cmd as a list of command name + arguments''' @@ -60,7 +62,7 @@ class Executor: postproc.takeLine(line.strip()) proc.wait() - if proc.returncode: + if (opt & self.opt_ignxval) == 0 and proc.returncode: self.em.rclog("extractone: [%s] returncode %d" % \ (filename, proc.returncode)) return False, postproc.wrapData() @@ -78,9 +80,12 @@ class Executor: fn = params["filename:"] while True: - cmd, postproc = self.flt.getCmd(fn) + cmdseq = self.flt.getCmd(fn) + cmd = cmdseq[0] + postproc = cmdseq[1] + opt = cmdseq[2] if len(cmdseq) == 3 else 0 if cmd: - ok, data = self.runCmd(cmd, fn, postproc) + ok, data = self.runCmd(cmd, fn, postproc, opt) if ok: break else: diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py new file mode 100755 index 00000000..0d2bf661 --- /dev/null +++ b/src/filters/rclppt.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +import rclexecm +import rclexec1 +import re +import sys +import os + +# Processing the output from ppt-dump.py +class PPTProcessData: + def __init__(self, em): + self.em = em + self.out = "" + self.gotdata = 0 + + # The first call prepends a fixed preamble to the output. + # Every line is then appended after HTML-escaping. + def takeLine(self, line): + if not self.gotdata: + self.out += '''
''' + \ + '''''' + \ + '''''' + self.gotdata = True + self.out += self.em.htmlescape(line) + + def wrapData(self): + return self.out + '''''' + +class PPTFilter: + def __init__(self, em): + self.em = em + self.ntry = 0 + + def reset(self): + self.ntry = 0 + pass + + def getCmd(self, fn): + if self.ntry: + return ([], None) + self.ntry = 1 + cmd = rclexecm.which("ppt-dump.py") + if cmd: + # ppt-dump.py often exits 1 with valid data. Ignore exit value + return ([cmd, "--no-struct-output", "--dump-text"], + PPTProcessData(self.em), rclexec1.Executor.opt_ignxval) + else: + return ([], None) + +if __name__ == '__main__': + if not rclexecm.which("ppt-dump.py"): + print("RECFILTERROR HELPERNOTFOUND ppt-dump.py") + sys.exit(1) + proto = rclexecm.RclExecM() + filter = PPTFilter(proto) + extract = rclexec1.Executor(proto, filter) + rclexecm.main(proto, extract) diff --git a/src/filters/rclrtf.py b/src/filters/rclrtf.py index 81af1eab..c7031030 100755 --- a/src/filters/rclrtf.py +++ b/src/filters/rclrtf.py @@ -35,16 +35,20 @@ class RTFProcessData: class RTFFilter: def __init__(self, em): self.em = em + self.ntry = 0 def reset(self): - pass + self.ntry = 0 def getCmd(self, fn): + if self.ntry: + return ([], None) + self.ntry = 1 cmd = rclexecm.which("unrtf") if cmd: return ([cmd, "--nopict", "--html"], RTFProcessData(self.em)) else: - return ([],None) + return ([], None) if __name__ == '__main__': if not rclexecm.which("unrtf"): diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py new file mode 100755 index 00000000..806efb34 --- /dev/null +++ b/src/filters/rclxls.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import rclexecm +import rclexec1 +import xlsxmltocsv +import re +import sys +import os +import xml.sax + +# Processing the output from xls-dump.py +class XLSProcessData: + def __init__(self, em): + self.em = em + self.out = "" + self.gotdata = 0 + self.xmldata = "" + + # The first call prepends a fixed preamble to the output. 
+ # Lines are accumulated as XML and converted in wrapData(). + def takeLine(self, line): + if not self.gotdata: + self.out += '''''' + \ + '''''' + \ + '''
''' + self.gotdata = True + self.xmldata += line + + def wrapData(self): + handler = xlsxmltocsv.XlsXmlHandler() + data = xml.sax.parseString(self.xmldata, handler) + self.out += self.em.htmlescape(handler.output) + return self.out + '''''' + +class XLSFilter: + def __init__(self, em): + self.em = em + self.ntry = 0 + + def reset(self): + self.ntry = 0 + pass + + def getCmd(self, fn): + if self.ntry: + return ([], None) + self.ntry = 1 + cmd = rclexecm.which("xls-dump.py") + if cmd: + # xls-dump.py often exits 1 with valid data. Ignore exit value + return ([cmd, "--dump-mode=canonical-xml", "--utf-8", "--catch"], + XLSProcessData(self.em), rclexec1.Executor.opt_ignxval) + else: + return ([], None) + +if __name__ == '__main__': + if not rclexecm.which("xls-dump.py"): + print("RECFILTERROR HELPERNOTFOUND xls-dump.py") + sys.exit(1) + proto = rclexecm.RclExecM() + filter = XLSFilter(proto) + extract = rclexec1.Executor(proto, filter) + rclexecm.main(proto, extract) diff --git a/src/filters/xlsxmltocsv.py b/src/filters/xlsxmltocsv.py index f8a6d654..72850d3a 100755 --- a/src/filters/xlsxmltocsv.py +++ b/src/filters/xlsxmltocsv.py @@ -15,10 +15,13 @@ else: dquote = '"' class XlsXmlHandler(xml.sax.handler.ContentHandler): + def __init__(self): + self.output = "" + def startElement(self, name, attrs): if name == "worksheet": if "name" in attrs: - print("%s" % attrs["name"].encode("UTF-8")) + self.output += "%s\n" % attrs["name"].encode("UTF-8") elif name == "row": self.cells = dict() elif name == "label-cell" or name == "number-cell": @@ -30,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): self.cells[int(attrs["col"])] = value else: #?? 
- sys.stdout.write("%s%s"%(value.encode("UTF-8"),sepstring)) + self.output += "%s%s" % (value.encode("UTF-8"), sepstring) elif name == "formula-cell": if "formula-result" in attrs and "col" in attrs: self.cells[int(attrs["col"])] = \ @@ -40,17 +43,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): if name == "row": curidx = 0 for idx, value in self.cells.iteritems(): - sys.stdout.write(sepstring * (idx - curidx)) - sys.stdout.write('%s%s%s' % (dquote, value, dquote)) + self.output += sepstring * (idx - curidx) + self.output += "%s%s%s" % (dquote, value, dquote) curidx = idx - sys.stdout.write("\n") + self.output += "\n" elif name == "worksheet": - print("") + self.output += "\n" -try: - xml.sax.parse(sys.stdin, XlsXmlHandler()) -except BaseException as err: - error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),)) - sys.exit(1) -sys.exit(0) +if __name__ == '__main__': + try: + handler = XlsXmlHandler() + xml.sax.parse(sys.stdin, handler) + print(handler.output) + except BaseException as err: + error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),)) + sys.exit(1) + + sys.exit(0)