python scripts for ppt and xls
--HG-- branch : WINDOWSPORT
This commit is contained in:
parent
86ef362461
commit
002eb67185
@ -34,12 +34,14 @@ import rclexecm
|
|||||||
# the object which we receive as a parameter, which in turn is defined
|
# the object which we receive as a parameter, which in turn is defined
|
||||||
# in the actual executable filter (e.g. rcldoc.py)
|
# in the actual executable filter (e.g. rcldoc.py)
|
||||||
class Executor:
|
class Executor:
|
||||||
|
opt_ignxval = 1
|
||||||
|
|
||||||
def __init__(self, em, flt):
|
def __init__(self, em, flt):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.flt = flt
|
self.flt = flt
|
||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
|
|
||||||
def runCmd(self, cmd, filename, postproc):
|
def runCmd(self, cmd, filename, postproc, opt):
|
||||||
''' Substitute parameters and execute command, process output
|
''' Substitute parameters and execute command, process output
|
||||||
with the specific postprocessor and return the complete text.
|
with the specific postprocessor and return the complete text.
|
||||||
We expect cmd as a list of command name + arguments'''
|
We expect cmd as a list of command name + arguments'''
|
||||||
@ -60,7 +62,7 @@ class Executor:
|
|||||||
postproc.takeLine(line.strip())
|
postproc.takeLine(line.strip())
|
||||||
|
|
||||||
proc.wait()
|
proc.wait()
|
||||||
if proc.returncode:
|
if (opt & self.opt_ignxval) == 0 and proc.returncode:
|
||||||
self.em.rclog("extractone: [%s] returncode %d" % \
|
self.em.rclog("extractone: [%s] returncode %d" % \
|
||||||
(filename, proc.returncode))
|
(filename, proc.returncode))
|
||||||
return False, postproc.wrapData()
|
return False, postproc.wrapData()
|
||||||
@ -78,9 +80,12 @@ class Executor:
|
|||||||
|
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
while True:
|
while True:
|
||||||
cmd, postproc = self.flt.getCmd(fn)
|
cmdseq = self.flt.getCmd(fn)
|
||||||
|
cmd = cmdseq[0]
|
||||||
|
postproc = cmdseq[1]
|
||||||
|
opt = cmdseq[2] if len(cmdseq) == 3 else 0
|
||||||
if cmd:
|
if cmd:
|
||||||
ok, data = self.runCmd(cmd, fn, postproc)
|
ok, data = self.runCmd(cmd, fn, postproc, opt)
|
||||||
if ok:
|
if ok:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|||||||
58
src/filters/rclppt.py
Executable file
58
src/filters/rclppt.py
Executable file
@ -0,0 +1,58 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import rclexecm
|
||||||
|
import rclexec1
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Processing the output from ppt-dump.py
|
||||||
|
class PPTProcessData:
    """Turn the plain-text output of ppt-dump.py into an HTML document.

    Lines are fed in one at a time through takeLine(); wrapData() returns
    everything received so far wrapped in a minimal UTF-8 HTML page, with
    the text placed inside a <pre> element.
    """

    # Opening part of the HTML document, emitted once before the text.
    htmlheader = '''<html><head>''' + \
                 '''<meta http-equiv="Content-Type" ''' + \
                 '''content="text/html;charset=UTF-8">''' + \
                 '''</head><body><pre>'''

    def __init__(self, em):
        """em: object providing htmlescape(), used to escape every line."""
        self.em = em
        self.out = ""
        self.gotdata = False

    def takeLine(self, line):
        """Escape one line of output and append it to the document.

        A line terminator is added after each line: callers may hand us
        lines with the terminator already stripped, and without it
        consecutive lines would run together inside the <pre> element.
        """
        if not self.gotdata:
            self.out += self.htmlheader
            self.gotdata = True
        self.out += self.em.htmlescape(line) + "\n"

    def wrapData(self):
        """Return the complete HTML document for the data received so far."""
        # Emit the header even when no line was ever received, so that the
        # result is always a well-formed (possibly empty) document.
        html = self.out if self.gotdata else self.htmlheader
        return html + '''</pre></body></html>'''
|
||||||
|
|
||||||
|
class PPTFilter:
    """Supplies the ppt-dump.py command line used to extract a document.

    A single command is proposed per document: the first getCmd() call
    after a reset() returns it, any further call returns an empty command.
    """

    def __init__(self, em):
        self.em = em
        # Non-zero once a command has been handed out for the current file.
        self.ntry = 0

    def reset(self):
        """Forget previous attempts so that getCmd() offers the command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the command sequence to run, or an empty one.

        The result is ([], None) when the command was already handed out
        or when ppt-dump.py cannot be found. Otherwise it is a 3-tuple:
        argument list, output postprocessor, executor option flags.
        fn is not consulted here.
        """
        nothing = ([], None)
        if self.ntry:
            return nothing
        self.ntry = 1

        dumper = rclexecm.which("ppt-dump.py")
        if not dumper:
            return nothing
        # ppt-dump.py often exits 1 with valid data. Ignore exit value
        return ([dumper, "--no-struct-output", "--dump-text"],
                PPTProcessData(self.em), rclexec1.Executor.opt_ignxval)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Nothing can be extracted without the helper script: report it on
    # stdout and give up.
    helper = rclexecm.which("ppt-dump.py")
    if not helper:
        print("RECFILTERROR HELPERNOTFOUND ppt-dump.py")
        sys.exit(1)

    # Wire the protocol handler, the command supplier and the executor
    # together, then hand control to the rclexecm main loop.
    proto = rclexecm.RclExecM()
    extract = rclexec1.Executor(proto, PPTFilter(proto))
    rclexecm.main(proto, extract)
|
||||||
@ -35,16 +35,20 @@ class RTFProcessData:
|
|||||||
class RTFFilter:
|
class RTFFilter:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
|
self.ntry = 0
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
pass
|
self.ntry = 0
|
||||||
|
|
||||||
def getCmd(self, fn):
|
def getCmd(self, fn):
|
||||||
|
if self.ntry:
|
||||||
|
return ([], None)
|
||||||
|
self.ntry = 1
|
||||||
cmd = rclexecm.which("unrtf")
|
cmd = rclexecm.which("unrtf")
|
||||||
if cmd:
|
if cmd:
|
||||||
return ([cmd, "--nopict", "--html"], RTFProcessData(self.em))
|
return ([cmd, "--nopict", "--html"], RTFProcessData(self.em))
|
||||||
else:
|
else:
|
||||||
return ([],None)
|
return ([], None)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if not rclexecm.which("unrtf"):
|
if not rclexecm.which("unrtf"):
|
||||||
|
|||||||
64
src/filters/rclxls.py
Executable file
64
src/filters/rclxls.py
Executable file
@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import rclexecm
|
||||||
|
import rclexec1
|
||||||
|
import xlsxmltocsv
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import xml.sax
|
||||||
|
|
||||||
|
# Processing the output from xls-dump.py
|
||||||
|
class XLSProcessData:
    """Collect the XML emitted by xls-dump.py and render it as HTML.

    Lines are accumulated through takeLine(). wrapData() parses the
    collected XML with xlsxmltocsv.XlsXmlHandler and returns the handler's
    text output, HTML-escaped, inside the <pre> element of a minimal UTF-8
    HTML page.
    """

    # Opening part of the HTML document placed before the converted text.
    htmlheader = '''<html><head>''' + \
                 '''<meta http-equiv="Content-Type" ''' + \
                 '''content="text/html;charset=UTF-8">''' + \
                 '''</head><body><pre>'''

    def __init__(self, em):
        """em: object providing htmlescape(), used on the converted text."""
        self.em = em
        self.out = ""
        self.gotdata = False
        self.xmldata = ""

    def takeLine(self, line):
        """Store one line of XML for later parsing.

        Non-empty lines are kept separated by a newline, so that tokens
        which were only separated by a line break do not get fused when
        the caller passes lines with the terminator stripped. Empty lines
        are dropped, which keeps leading blank lines from ending up in
        front of an XML declaration.
        """
        if not self.gotdata:
            self.out += self.htmlheader
            self.gotdata = True
        if line:
            self.xmldata += line + "\n"

    def wrapData(self):
        """Return the complete HTML document for the data received so far.

        When no XML was received the document is well-formed but empty and
        the parser is not run. Errors raised by the SAX parser for
        malformed XML are not caught here.
        """
        # Work on a local copy: calling wrapData() again must not append
        # the converted text a second time.
        html = self.out if self.gotdata else self.htmlheader
        if self.xmldata:
            handler = xlsxmltocsv.XlsXmlHandler()
            xml.sax.parseString(self.xmldata, handler)
            html += self.em.htmlescape(handler.output)
        return html + '''</pre></body></html>'''
|
||||||
|
|
||||||
|
class XLSFilter:
    """Supplies the xls-dump.py command line used to extract a document.

    A single command is proposed per document: the first getCmd() call
    after a reset() returns it, any further call returns an empty command.
    """

    def __init__(self, em):
        self.em = em
        # Non-zero once a command has been handed out for the current file.
        self.ntry = 0

    def reset(self):
        """Forget previous attempts so that getCmd() offers the command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the command sequence to run, or an empty one.

        The result is ([], None) when the command was already handed out
        or when xls-dump.py cannot be found. Otherwise it is a 3-tuple:
        argument list, output postprocessor, executor option flags.
        fn is not consulted here.
        """
        nothing = ([], None)
        if self.ntry:
            return nothing
        self.ntry = 1

        dumper = rclexecm.which("xls-dump.py")
        if not dumper:
            return nothing
        # xls-dump.py often exits 1 with valid data. Ignore exit value
        return ([dumper, "--dump-mode=canonical-xml", "--utf-8", "--catch"],
                XLSProcessData(self.em), rclexec1.Executor.opt_ignxval)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # The helper that XLSFilter.getCmd() runs is xls-dump.py: check for
    # that one (not ppt-dump.py) and name it in the error report.
    if not rclexecm.which("xls-dump.py"):
        print("RECFILTERROR HELPERNOTFOUND xls-dump.py")
        sys.exit(1)

    # Wire the protocol handler, the command supplier and the executor
    # together, then hand control to the rclexecm main loop.
    proto = rclexecm.RclExecM()
    filter = XLSFilter(proto)
    extract = rclexec1.Executor(proto, filter)
    rclexecm.main(proto, extract)
|
||||||
@ -15,10 +15,13 @@ else:
|
|||||||
dquote = '"'
|
dquote = '"'
|
||||||
|
|
||||||
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
||||||
|
def __init__(self):
|
||||||
|
self.output = ""
|
||||||
|
|
||||||
def startElement(self, name, attrs):
|
def startElement(self, name, attrs):
|
||||||
if name == "worksheet":
|
if name == "worksheet":
|
||||||
if "name" in attrs:
|
if "name" in attrs:
|
||||||
print("%s" % attrs["name"].encode("UTF-8"))
|
self.output += "%s\n" % attrs["name"].encode("UTF-8")
|
||||||
elif name == "row":
|
elif name == "row":
|
||||||
self.cells = dict()
|
self.cells = dict()
|
||||||
elif name == "label-cell" or name == "number-cell":
|
elif name == "label-cell" or name == "number-cell":
|
||||||
@ -30,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
self.cells[int(attrs["col"])] = value
|
self.cells[int(attrs["col"])] = value
|
||||||
else:
|
else:
|
||||||
#??
|
#??
|
||||||
sys.stdout.write("%s%s"%(value.encode("UTF-8"),sepstring))
|
self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
|
||||||
elif name == "formula-cell":
|
elif name == "formula-cell":
|
||||||
if "formula-result" in attrs and "col" in attrs:
|
if "formula-result" in attrs and "col" in attrs:
|
||||||
self.cells[int(attrs["col"])] = \
|
self.cells[int(attrs["col"])] = \
|
||||||
@ -40,17 +43,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
if name == "row":
|
if name == "row":
|
||||||
curidx = 0
|
curidx = 0
|
||||||
for idx, value in self.cells.iteritems():
|
for idx, value in self.cells.iteritems():
|
||||||
sys.stdout.write(sepstring * (idx - curidx))
|
self.output += sepstring * (idx - curidx)
|
||||||
sys.stdout.write('%s%s%s' % (dquote, value, dquote))
|
self.output += "%s%s%s" % (dquote, value, dquote)
|
||||||
curidx = idx
|
curidx = idx
|
||||||
sys.stdout.write("\n")
|
self.output += "\n"
|
||||||
elif name == "worksheet":
|
elif name == "worksheet":
|
||||||
print("")
|
self.output += "\n"
|
||||||
|
|
||||||
try:
|
|
||||||
xml.sax.parse(sys.stdin, XlsXmlHandler())
|
|
||||||
except BaseException as err:
|
|
||||||
error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
sys.exit(0)
|
if __name__ == '__main__':
    # Command-line use: convert the XML read from stdin and print the
    # text accumulated by the handler.
    try:
        csvhandler = XlsXmlHandler()
        xml.sax.parse(sys.stdin, csvhandler)
        print(csvhandler.output)
    except BaseException:
        # Report the exception type and value, then exit with an error.
        error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
        sys.exit(1)
    else:
        sys.exit(0)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user