python scripts for ppt and xls

--HG--
branch : WINDOWSPORT
This commit is contained in:
Jean-Francois Dockes 2015-09-14 11:32:16 +02:00
parent 86ef362461
commit 002eb67185
5 changed files with 156 additions and 18 deletions

View File

@ -34,12 +34,14 @@ import rclexecm
# the object which we receive as a parameter, which in turn is defined # the object which we receive as a parameter, which in turn is defined
# in the actual executable filter (e.g. rcldoc.py) # in the actual executable filter (e.g. rcldoc.py)
class Executor: class Executor:
opt_ignxval = 1
def __init__(self, em, flt): def __init__(self, em, flt):
self.em = em self.em = em
self.flt = flt self.flt = flt
self.currentindex = 0 self.currentindex = 0
def runCmd(self, cmd, filename, postproc): def runCmd(self, cmd, filename, postproc, opt):
''' Substitute parameters and execute command, process output ''' Substitute parameters and execute command, process output
with the specific postprocessor and return the complete text. with the specific postprocessor and return the complete text.
We expect cmd as a list of command name + arguments''' We expect cmd as a list of command name + arguments'''
@ -60,7 +62,7 @@ class Executor:
postproc.takeLine(line.strip()) postproc.takeLine(line.strip())
proc.wait() proc.wait()
if proc.returncode: if (opt & self.opt_ignxval) == 0 and proc.returncode:
self.em.rclog("extractone: [%s] returncode %d" % \ self.em.rclog("extractone: [%s] returncode %d" % \
(filename, proc.returncode)) (filename, proc.returncode))
return False, postproc.wrapData() return False, postproc.wrapData()
@ -78,9 +80,12 @@ class Executor:
fn = params["filename:"] fn = params["filename:"]
while True: while True:
cmd, postproc = self.flt.getCmd(fn) cmdseq = self.flt.getCmd(fn)
cmd = cmdseq[0]
postproc = cmdseq[1]
opt = cmdseq[2] if len(cmdseq) == 3 else 0
if cmd: if cmd:
ok, data = self.runCmd(cmd, fn, postproc) ok, data = self.runCmd(cmd, fn, postproc, opt)
if ok: if ok:
break break
else: else:

58
src/filters/rclppt.py Executable file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import rclexecm
import rclexec1
import re
import sys
import os
# Processing the output from ppt-dump.py
class PPTProcessData:
    """Collect the text printed by ppt-dump.py and wrap it as an HTML page.

    The executor hands over the helper's output one line at a time through
    takeLine() and fetches the finished document with wrapData().
    """

    def __init__(self, em):
        # em only needs to provide htmlescape(); nothing else is used here.
        self.em = em
        # Document fragments, joined with newlines in wrapData().
        self.out = []
        self.gotdata = False

    def takeLine(self, line):
        """Escape one line of helper output and append it to the document."""
        if not self.gotdata:
            # Emit the HTML prologue once, just before the first data line.
            self.out.append('<html><head>'
                            '<meta http-equiv="Content-Type" '
                            'content="text/html;charset=UTF-8">'
                            '</head><body><pre>')
            self.gotdata = True
        self.out.append(self.em.htmlescape(line))

    def wrapData(self):
        """Return everything received so far as a complete HTML document.

        rclexec1.Executor strips each line before passing it to takeLine(),
        so the line breaks are restored here. Without them the last word of
        a line would be glued to the first word of the next one.
        """
        return "\n".join(self.out) + '</pre></body></html>'
class PPTFilter:
    """Hand the ppt-dump.py command line to the executor.

    A command is offered at most once between two calls to reset().
    """

    def __init__(self, em):
        self.em = em
        # Non-zero once getCmd() has been asked for a command.
        self.ntry = 0

    def reset(self):
        """Re-arm getCmd() so that it offers the command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the next command to try.

        The result is (argv, postprocessor, options) when ppt-dump.py is
        found on the first call after a reset, and ([], None) otherwise.
        The fn argument is not used to build the command.
        """
        nothing = ([], None)
        if self.ntry:
            return nothing
        self.ntry = 1
        helper = rclexecm.which("ppt-dump.py")
        if not helper:
            return nothing
        # ppt-dump.py often exits 1 with valid data. Ignore exit value
        argv = [helper, "--no-struct-output", "--dump-text"]
        return (argv, PPTProcessData(self.em),
                rclexec1.Executor.opt_ignxval)
if __name__ == '__main__':
    # Without the helper nothing can be extracted: report it on stdout and
    # exit with a failure status before starting the protocol handler.
    helper = rclexecm.which("ppt-dump.py")
    if not helper:
        print("RECFILTERROR HELPERNOTFOUND ppt-dump.py")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    pptfilter = PPTFilter(proto)
    executor = rclexec1.Executor(proto, pptfilter)
    rclexecm.main(proto, executor)

View File

@ -35,16 +35,20 @@ class RTFProcessData:
class RTFFilter: class RTFFilter:
def __init__(self, em): def __init__(self, em):
self.em = em self.em = em
self.ntry = 0
def reset(self): def reset(self):
pass self.ntry = 0
def getCmd(self, fn): def getCmd(self, fn):
if self.ntry:
return ([], None)
self.ntry = 1
cmd = rclexecm.which("unrtf") cmd = rclexecm.which("unrtf")
if cmd: if cmd:
return ([cmd, "--nopict", "--html"], RTFProcessData(self.em)) return ([cmd, "--nopict", "--html"], RTFProcessData(self.em))
else: else:
return ([],None) return ([], None)
if __name__ == '__main__': if __name__ == '__main__':
if not rclexecm.which("unrtf"): if not rclexecm.which("unrtf"):

64
src/filters/rclxls.py Executable file
View File

@ -0,0 +1,64 @@
#!/usr/bin/env python
import rclexecm
import rclexec1
import xlsxmltocsv
import re
import sys
import os
import xml.sax
# Processing the output from xls-dump.py
class XLSProcessData:
    """Turn the XML printed by xls-dump.py into an HTML page.

    The executor hands over the helper's output one line at a time through
    takeLine(). wrapData() parses the collected XML with
    xlsxmltocsv.XlsXmlHandler and wraps the handler's text output in HTML.
    """

    def __init__(self, em):
        # em only needs to provide htmlescape(); nothing else is used here.
        self.em = em
        # HTML prologue, set when the first line arrives.
        self.out = ""
        self.gotdata = False
        # XML lines received so far, joined with newlines before parsing.
        self.xmldata = []

    def takeLine(self, line):
        """Store one line of XML; emit the HTML prologue on the first one."""
        if not self.gotdata:
            self.out += ('<html><head>'
                         '<meta http-equiv="Content-Type" '
                         'content="text/html;charset=UTF-8">'
                         '</head><body><pre>')
            self.gotdata = True
        self.xmldata.append(line)

    def wrapData(self):
        """Parse the collected XML and return the complete HTML document.

        rclexec1.Executor strips each line before passing it to takeLine(),
        so the lines are joined with a newline. Plain concatenation could
        fuse tokens that the helper split across lines and make the XML
        malformed. self.out is left untouched so that a repeated call
        returns the same document instead of duplicating the payload.

        The parser raises xml.sax.SAXParseException when the collected
        text is not well-formed XML, which includes the no-data case.
        """
        handler = xlsxmltocsv.XlsXmlHandler()
        xml.sax.parseString("\n".join(self.xmldata), handler)
        body = self.em.htmlescape(handler.output)
        return self.out + body + '</pre></body></html>'
class XLSFilter:
    """Hand the xls-dump.py command line to the executor.

    A command is offered at most once between two calls to reset().
    """

    def __init__(self, em):
        self.em = em
        # Non-zero once getCmd() has been asked for a command.
        self.ntry = 0

    def reset(self):
        """Re-arm getCmd() so that it offers the command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the next command to try.

        The result is (argv, postprocessor, options) when xls-dump.py is
        found on the first call after a reset, and ([], None) otherwise.
        The fn argument is not used to build the command.
        """
        nothing = ([], None)
        if self.ntry:
            return nothing
        self.ntry = 1
        helper = rclexecm.which("xls-dump.py")
        if not helper:
            return nothing
        # xls-dump.py often exits 1 with valid data. Ignore exit value
        argv = [helper, "--dump-mode=canonical-xml", "--utf-8", "--catch"]
        return (argv, XLSProcessData(self.em),
                rclexec1.Executor.opt_ignxval)
if __name__ == '__main__':
    # This filter runs xls-dump.py (see XLSFilter.getCmd), so that is the
    # helper whose presence must be checked and whose name must be
    # reported when it is missing.
    if not rclexecm.which("xls-dump.py"):
        print("RECFILTERROR HELPERNOTFOUND xls-dump.py")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    filter = XLSFilter(proto)
    extract = rclexec1.Executor(proto, filter)
    rclexecm.main(proto, extract)

View File

@ -15,10 +15,13 @@ else:
dquote = '"' dquote = '"'
class XlsXmlHandler(xml.sax.handler.ContentHandler): class XlsXmlHandler(xml.sax.handler.ContentHandler):
def __init__(self):
self.output = ""
def startElement(self, name, attrs): def startElement(self, name, attrs):
if name == "worksheet": if name == "worksheet":
if "name" in attrs: if "name" in attrs:
print("%s" % attrs["name"].encode("UTF-8")) self.output += "%s\n" % attrs["name"].encode("UTF-8")
elif name == "row": elif name == "row":
self.cells = dict() self.cells = dict()
elif name == "label-cell" or name == "number-cell": elif name == "label-cell" or name == "number-cell":
@ -30,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
self.cells[int(attrs["col"])] = value self.cells[int(attrs["col"])] = value
else: else:
#?? #??
sys.stdout.write("%s%s"%(value.encode("UTF-8"),sepstring)) self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
elif name == "formula-cell": elif name == "formula-cell":
if "formula-result" in attrs and "col" in attrs: if "formula-result" in attrs and "col" in attrs:
self.cells[int(attrs["col"])] = \ self.cells[int(attrs["col"])] = \
@ -40,17 +43,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
if name == "row": if name == "row":
curidx = 0 curidx = 0
for idx, value in self.cells.iteritems(): for idx, value in self.cells.iteritems():
sys.stdout.write(sepstring * (idx - curidx)) self.output += sepstring * (idx - curidx)
sys.stdout.write('%s%s%s' % (dquote, value, dquote)) self.output += "%s%s%s" % (dquote, value, dquote)
curidx = idx curidx = idx
sys.stdout.write("\n") self.output += "\n"
elif name == "worksheet": elif name == "worksheet":
print("") self.output += "\n"
try:
xml.sax.parse(sys.stdin, XlsXmlHandler())
except BaseException as err:
error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
sys.exit(1)
sys.exit(0) if __name__ == '__main__':
try:
handler = XlsXmlHandler()
xml.sax.parse(sys.stdin, handler)
print(handler.output)
except BaseException as err:
error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
sys.exit(1)
sys.exit(0)