python scripts for ppt and xls

--HG--
branch : WINDOWSPORT
This commit is contained in:
Jean-Francois Dockes 2015-09-14 11:32:16 +02:00
parent 86ef362461
commit 002eb67185
5 changed files with 156 additions and 18 deletions

View File

@ -34,12 +34,14 @@ import rclexecm
# the object which we receive as a parameter, which in turn is defined
# in the actual executable filter (e.g. rcldoc.py)
class Executor:
opt_ignxval = 1
def __init__(self, em, flt):
self.em = em
self.flt = flt
self.currentindex = 0
def runCmd(self, cmd, filename, postproc):
def runCmd(self, cmd, filename, postproc, opt):
''' Substitute parameters and execute command, process output
with the specific postprocessor and return the complete text.
We expect cmd as a list of command name + arguments'''
@ -60,7 +62,7 @@ class Executor:
postproc.takeLine(line.strip())
proc.wait()
if proc.returncode:
if (opt & self.opt_ignxval) == 0 and proc.returncode:
self.em.rclog("extractone: [%s] returncode %d" % \
(filename, proc.returncode))
return False, postproc.wrapData()
@ -78,9 +80,12 @@ class Executor:
fn = params["filename:"]
while True:
cmd, postproc = self.flt.getCmd(fn)
cmdseq = self.flt.getCmd(fn)
cmd = cmdseq[0]
postproc = cmdseq[1]
opt = cmdseq[2] if len(cmdseq) == 3 else 0
if cmd:
ok, data = self.runCmd(cmd, fn, postproc)
ok, data = self.runCmd(cmd, fn, postproc, opt)
if ok:
break
else:

58
src/filters/rclppt.py Executable file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import rclexecm
import rclexec1
import re
import sys
import os
# Processing the output from ppt-dump.py
class PPTProcessData:
    '''Accumulate the text printed by ppt-dump.py and wrap it as an HTML page.

    Output lines are handed over one at a time through takeLine();
    wrapData() then returns the complete document. The text is emitted
    inside a <pre> element, HTML-escaped through em.htmlescape().
    '''

    # Opening part of the generated document, emitted exactly once.
    _HEADER = '''<html><head>''' + \
              '''<meta http-equiv="Content-Type" ''' + \
              '''content="text/html;charset=UTF-8">''' + \
              '''</head><body><pre>'''

    def __init__(self, em):
        # em must provide htmlescape(text); it is stored as received.
        self.em = em
        self.out = ""
        self.gotdata = 0

    def takeLine(self, line):
        '''Append one line of helper output to the document.

        The line is expected without its line terminator (the executor
        strips it), so a newline is added back here: otherwise the last
        word of a line would be glued to the first word of the next one.
        '''
        if not self.gotdata:
            self.out += self._HEADER
            self.gotdata = True
        self.out += self.em.htmlescape(line) + "\n"

    def wrapData(self):
        '''Return the whole HTML document built from the lines seen so far.

        When no line was received at all, an empty but well-formed
        document is returned instead of dangling closing tags.
        '''
        body = self.out if self.gotdata else self._HEADER
        return body + '''</pre></body></html>'''
class PPTFilter:
    '''Supplies the ppt-dump.py command line, at most once between resets.'''

    def __init__(self, em):
        self.em = em
        # Non-zero once getCmd() has been called since the last reset().
        self.ntry = 0

    def reset(self):
        '''Allow the next getCmd() call to return a command again.'''
        self.ntry = 0

    def getCmd(self, fn):
        '''Return the command to run, or ([], None) when there is none.

        A command is only offered on the first call after construction or
        reset(), and only if ppt-dump.py can be located. In that case the
        result is (argument list, output processor, executor option flags).
        fn is not used to build the command.
        '''
        nothing = ([], None)
        if self.ntry:
            return nothing
        self.ntry = 1

        helper = rclexecm.which("ppt-dump.py")
        if not helper:
            return nothing

        # ppt-dump.py often exits 1 with valid data. Ignore exit value
        argv = [helper, "--no-struct-output", "--dump-text"]
        return (argv, PPTProcessData(self.em),
                rclexec1.Executor.opt_ignxval)
if __name__ == '__main__':
    # Without the helper nothing can be extracted: report it on stdout
    # and exit with status 1 before starting the protocol loop.
    helper = rclexecm.which("ppt-dump.py")
    if not helper:
        print("RECFILTERROR HELPERNOTFOUND ppt-dump.py")
        sys.exit(1)

    # The same protocol object is shared by the filter and the executor.
    proto = rclexecm.RclExecM()
    extract = rclexec1.Executor(proto, PPTFilter(proto))
    rclexecm.main(proto, extract)

View File

@ -35,16 +35,20 @@ class RTFProcessData:
class RTFFilter:
def __init__(self, em):
self.em = em
self.ntry = 0
def reset(self):
pass
self.ntry = 0
def getCmd(self, fn):
if self.ntry:
return ([], None)
self.ntry = 1
cmd = rclexecm.which("unrtf")
if cmd:
return ([cmd, "--nopict", "--html"], RTFProcessData(self.em))
else:
return ([],None)
return ([], None)
if __name__ == '__main__':
if not rclexecm.which("unrtf"):

64
src/filters/rclxls.py Executable file
View File

@ -0,0 +1,64 @@
#!/usr/bin/env python
import rclexecm
import rclexec1
import xlsxmltocsv
import re
import sys
import os
import xml.sax
# Processing the output from xls-dump.py
class XLSProcessData:
    '''Collect the XML printed by xls-dump.py and render it as an HTML page.

    Output lines are handed over one at a time through takeLine(). Once
    all of them are in, wrapData() parses the accumulated XML with
    xlsxmltocsv.XlsXmlHandler and returns the handler's text output,
    HTML-escaped through em.htmlescape(), inside a <pre> element.
    '''

    def __init__(self, em):
        # em must provide htmlescape(text); it is stored as received.
        self.em = em
        self.out = ""
        self.gotdata = 0
        # Raw XML received so far, one input line per text line.
        self.xmldata = ""

    def takeLine(self, line):
        '''Store one line of XML output for later parsing.

        The line is expected without its line terminator (the executor
        strips it). A newline is added back so that tokens which the
        helper printed on separate lines are not fused together.
        '''
        if not self.gotdata:
            self.out += '''<html><head>''' + \
                        '''<meta http-equiv="Content-Type" ''' + \
                        '''content="text/html;charset=UTF-8">''' + \
                        '''</head><body><pre>'''
            self.gotdata = True
        self.xmldata += line + "\n"

    def wrapData(self):
        '''Parse the collected XML and return the complete HTML document.

        self.out is left untouched, so calling this more than once yields
        the same document instead of appending the payload again. Errors
        raised by the XML parser propagate to the caller.
        '''
        handler = xlsxmltocsv.XlsXmlHandler()
        # parseString() reports everything through the handler; it has no
        # useful return value.
        xml.sax.parseString(self.xmldata, handler)
        return self.out + self.em.htmlescape(handler.output) + \
            '''</pre></body></html>'''
class XLSFilter:
    '''Supplies the xls-dump.py command line, at most once between resets.'''

    def __init__(self, em):
        self.em = em
        # Non-zero once getCmd() has been called since the last reset().
        self.ntry = 0

    def reset(self):
        '''Allow the next getCmd() call to return a command again.'''
        self.ntry = 0

    def getCmd(self, fn):
        '''Return the command to run, or ([], None) when there is none.

        A command is only offered on the first call after construction or
        reset(), and only if xls-dump.py can be located. In that case the
        result is (argument list, output processor, executor option flags).
        fn is not used to build the command.
        '''
        nothing = ([], None)
        if self.ntry:
            return nothing
        self.ntry = 1

        helper = rclexecm.which("xls-dump.py")
        if not helper:
            return nothing

        # xls-dump.py often exits 1 with valid data. Ignore exit value
        argv = [helper, "--dump-mode=canonical-xml", "--utf-8", "--catch"]
        return (argv, XLSProcessData(self.em),
                rclexec1.Executor.opt_ignxval)
if __name__ == '__main__':
    # This filter runs xls-dump.py (see XLSFilter.getCmd), so that is the
    # helper whose presence must be checked and whose name must be
    # reported when it is missing.
    if not rclexecm.which("xls-dump.py"):
        print("RECFILTERROR HELPERNOTFOUND xls-dump.py")
        sys.exit(1)
    # The same protocol object is shared by the filter and the executor.
    proto = rclexecm.RclExecM()
    filter = XLSFilter(proto)
    extract = rclexec1.Executor(proto, filter)
    rclexecm.main(proto, extract)

View File

@ -15,10 +15,13 @@ else:
dquote = '"'
class XlsXmlHandler(xml.sax.handler.ContentHandler):
def __init__(self):
self.output = ""
def startElement(self, name, attrs):
if name == "worksheet":
if "name" in attrs:
print("%s" % attrs["name"].encode("UTF-8"))
self.output += "%s\n" % attrs["name"].encode("UTF-8")
elif name == "row":
self.cells = dict()
elif name == "label-cell" or name == "number-cell":
@ -30,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
self.cells[int(attrs["col"])] = value
else:
#??
sys.stdout.write("%s%s"%(value.encode("UTF-8"),sepstring))
self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
elif name == "formula-cell":
if "formula-result" in attrs and "col" in attrs:
self.cells[int(attrs["col"])] = \
@ -40,17 +43,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
if name == "row":
curidx = 0
for idx, value in self.cells.iteritems():
sys.stdout.write(sepstring * (idx - curidx))
sys.stdout.write('%s%s%s' % (dquote, value, dquote))
self.output += sepstring * (idx - curidx)
self.output += "%s%s%s" % (dquote, value, dquote)
curidx = idx
sys.stdout.write("\n")
self.output += "\n"
elif name == "worksheet":
print("")
self.output += "\n"
try:
xml.sax.parse(sys.stdin, XlsXmlHandler())
except BaseException as err:
error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
sys.exit(1)
sys.exit(0)
if __name__ == '__main__':
try:
handler = XlsXmlHandler()
xml.sax.parse(sys.stdin, handler)
print(handler.output)
except BaseException as err:
error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
sys.exit(1)
sys.exit(0)