Modified the xls and ppt filters to be compatible with Python 3
This commit is contained in:
parent
56f0a0f9e6
commit
d9afcdf8a3
Binary file not shown.
@ -5,10 +5,6 @@
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
|
||||
# mso-dumper is not compatible with python3
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys, os.path, getopt
|
||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||
from msodumper import ole, pptstream, globals, olestream
|
||||
@ -51,25 +47,26 @@ class PPTDumper(object):
|
||||
dirnames = strm.getDirectoryNames()
|
||||
result = True
|
||||
for dirname in dirnames:
|
||||
if len(dirname) == 0 or dirname == 'Root Entry':
|
||||
sdirname = globals.nulltrunc(dirname)
|
||||
if len(sdirname) == 0 or sdirname == b"Root Entry":
|
||||
continue
|
||||
|
||||
try:
|
||||
dirstrm = strm.getDirectoryStreamByName(dirname)
|
||||
except Exception as err:
|
||||
error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath))
|
||||
error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
|
||||
# The previous version was killed by the exception
|
||||
# here, so the equivalent is to break, but maybe there
|
||||
# is no reason to do so.
|
||||
break
|
||||
self.__printDirHeader(dirname, len(dirstrm.bytes))
|
||||
if dirname == "PowerPoint Document":
|
||||
if sdirname == b"PowerPoint Document":
|
||||
if not self.__readSubStream(dirstrm):
|
||||
result = False
|
||||
elif dirname == "Current User":
|
||||
elif sdirname == b"Current User":
|
||||
if not self.__readSubStream(dirstrm):
|
||||
result = False
|
||||
elif dirname == "\x05DocumentSummaryInformation":
|
||||
elif sdirname == b"\x05DocumentSummaryInformation":
|
||||
strm = olestream.PropertySetStream(dirstrm.bytes)
|
||||
strm.read()
|
||||
else:
|
||||
@ -118,26 +115,15 @@ def main (args):
|
||||
except getopt.GetoptError:
|
||||
error("error parsing input options\n")
|
||||
usage(exname)
|
||||
return false
|
||||
|
||||
status = True
|
||||
try:
|
||||
dumper = PPTDumper(args[0], globals.params)
|
||||
if not dumper.dump():
|
||||
error("ppt-dump: dump error " + args[0] + "\n")
|
||||
status = False
|
||||
except:
|
||||
error("ppt-dump: FAILURE (bad format?) " + args[0] + "\n")
|
||||
status = False
|
||||
return
|
||||
|
||||
dumper = PPTDumper(args[0], globals.params)
|
||||
if not dumper.dump():
|
||||
error("FAILURE\n")
|
||||
if globals.params.dumpText:
|
||||
print(globals.textdump.replace("\r", "\n"))
|
||||
return(status)
|
||||
|
||||
globals.dumptext()
|
||||
|
||||
if __name__ == '__main__':
|
||||
if main(sys.argv):
|
||||
sys.exit(0)
|
||||
else:
|
||||
sys.exit(1)
|
||||
main(sys.argv)
|
||||
|
||||
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
|
||||
|
||||
@ -15,20 +15,20 @@ import os
|
||||
class PPTProcessData:
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.out = ""
|
||||
self.out = b""
|
||||
self.gotdata = 0
|
||||
|
||||
def takeLine(self, line):
|
||||
if not self.gotdata:
|
||||
self.out += '''<html><head>''' + \
|
||||
'''<meta http-equiv="Content-Type" ''' + \
|
||||
'''content="text/html;charset=UTF-8">''' + \
|
||||
'''</head><body><pre>'''
|
||||
self.out += b'''<html><head>''' + \
|
||||
b'''<meta http-equiv="Content-Type" ''' + \
|
||||
b'''content="text/html;charset=UTF-8">''' + \
|
||||
b'''</head><body><pre>'''
|
||||
self.gotdata = True
|
||||
self.out += self.em.htmlescape(line) + "<br>\n"
|
||||
self.out += self.em.htmlescape(line) + b"<br>\n"
|
||||
|
||||
def wrapData(self):
|
||||
return self.out + '''</pre></body></html>'''
|
||||
return self.out + b'''</pre></body></html>'''
|
||||
|
||||
class PPTFilter:
|
||||
def __init__(self, em):
|
||||
|
||||
@ -15,9 +15,9 @@ import xml.sax
|
||||
class XLSProcessData:
|
||||
def __init__(self, em, ishtml = False):
|
||||
self.em = em
|
||||
self.out = ""
|
||||
self.out = b""
|
||||
self.gotdata = 0
|
||||
self.xmldata = ""
|
||||
self.xmldata = b""
|
||||
self.ishtml = ishtml
|
||||
|
||||
def takeLine(self, line):
|
||||
@ -25,10 +25,10 @@ class XLSProcessData:
|
||||
self.out += line + "\n"
|
||||
return
|
||||
if not self.gotdata:
|
||||
self.out += '''<html><head>''' + \
|
||||
'''<meta http-equiv="Content-Type" ''' + \
|
||||
'''content="text/html;charset=UTF-8">''' + \
|
||||
'''</head><body><pre>'''
|
||||
self.out += b'''<html><head>''' + \
|
||||
b'''<meta http-equiv="Content-Type" ''' + \
|
||||
b'''content="text/html;charset=UTF-8">''' + \
|
||||
b'''</head><body><pre>'''
|
||||
self.gotdata = True
|
||||
self.xmldata += line
|
||||
|
||||
@ -36,9 +36,9 @@ class XLSProcessData:
|
||||
if self.ishtml:
|
||||
return self.out
|
||||
handler = xlsxmltocsv.XlsXmlHandler()
|
||||
data = xml.sax.parseString(self.xmldata, handler)
|
||||
xml.sax.parseString(self.xmldata, handler)
|
||||
self.out += self.em.htmlescape(handler.output)
|
||||
return self.out + '''</pre></body></html>'''
|
||||
return self.out + b'''</pre></body></html>'''
|
||||
|
||||
class XLSFilter:
|
||||
def __init__(self, em):
|
||||
@ -56,7 +56,7 @@ class XLSFilter:
|
||||
# Some HTML files masquerade as XLS
|
||||
try:
|
||||
data = open(fn, 'rb').read(512)
|
||||
if data.find('html') != -1 or data.find('HTML') != -1:
|
||||
if data.find(b'html') != -1 or data.find(b'HTML') != -1:
|
||||
return ("cat", XLSProcessData(self.em, True))
|
||||
except Exception as err:
|
||||
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
|
||||
|
||||
@ -4,11 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
|
||||
# mso-dumper is not compatible with python3
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from builtins import range
|
||||
import sys, os.path, optparse
|
||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||
|
||||
@ -21,8 +17,8 @@ def equalsName (name, array):
|
||||
if len(name) != len(array):
|
||||
return False
|
||||
|
||||
for i in xrange(0, len(name)):
|
||||
if ord(name[i]) != array[i]:
|
||||
for i in range(0, len(name)):
|
||||
if globals.indexbytes(name, i) != array[i]:
|
||||
return False
|
||||
|
||||
return True
|
||||
@ -50,13 +46,13 @@ class XLDumper(object):
|
||||
def __printDirHeader (self, direntry, byteLen):
|
||||
dirname = direntry.Name
|
||||
dirname = globals.encodeName(dirname)
|
||||
print("")
|
||||
print("="*globals.OutputWidth)
|
||||
globals.outputln("")
|
||||
globals.outputln("="*globals.OutputWidth)
|
||||
if direntry.isStorage():
|
||||
print("%s (storage)"%dirname)
|
||||
globals.outputln("%s (storage)"%dirname)
|
||||
else:
|
||||
print("%s (stream, size: %d bytes)"%(dirname, byteLen))
|
||||
print("-"*globals.OutputWidth)
|
||||
globals.outputln("%s (stream, size: %d bytes)"%(dirname, byteLen))
|
||||
globals.outputln("-"*globals.OutputWidth)
|
||||
|
||||
def __parseFile (self):
|
||||
file = open(self.filepath, 'rb')
|
||||
@ -71,38 +67,34 @@ class XLDumper(object):
|
||||
root = docroot.appendElement('xls-dump')
|
||||
|
||||
for d in dirs:
|
||||
if d.Name != "Workbook":
|
||||
if d.Name != b"Workbook":
|
||||
# for now, we only dump the Workbook directory stream.
|
||||
continue
|
||||
|
||||
dirstrm = self.strm.getDirectoryStream(d)
|
||||
data = self.__readSubStreamXML(dirstrm)
|
||||
self.__dumpDataAsXML(data, root)
|
||||
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
|
||||
|
||||
node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
|
||||
|
||||
def dumpCanonicalXML (self):
|
||||
try:
|
||||
self.__parseFile()
|
||||
docroot = node.Root()
|
||||
root = docroot.appendElement('xls-dump')
|
||||
self.__parseFile()
|
||||
docroot = node.Root()
|
||||
root = docroot.appendElement('xls-dump')
|
||||
|
||||
dirEntries = self.strm.getDirectoryEntries()
|
||||
for entry in dirEntries:
|
||||
dirname = entry.Name
|
||||
if dirname != "Workbook":
|
||||
# for now, we only dump the Workbook directory stream.
|
||||
continue
|
||||
|
||||
dirstrm = self.strm.getDirectoryStream(entry)
|
||||
wbmodel = self.__buildWorkbookModel(dirstrm)
|
||||
wbmodel.encrypted = self.strmData.encrypted
|
||||
root.appendChild(wbmodel.createDOM())
|
||||
|
||||
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
|
||||
dirEntries = self.strm.getDirectoryEntries()
|
||||
for entry in dirEntries:
|
||||
dirname = entry.Name
|
||||
if dirname != b"Workbook":
|
||||
# for now, we only dump the Workbook directory stream.
|
||||
continue
|
||||
|
||||
except Exception as err:
|
||||
print("xls-dump.py: error: %s" % err, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
dirstrm = self.strm.getDirectoryStream(entry)
|
||||
wbmodel = self.__buildWorkbookModel(dirstrm)
|
||||
wbmodel.encrypted = self.strmData.encrypted
|
||||
root.appendChild(wbmodel.createDOM())
|
||||
|
||||
node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
|
||||
|
||||
def dump (self):
|
||||
self.__parseFile()
|
||||
@ -123,18 +115,18 @@ class XLDumper(object):
|
||||
if entry.isStorage():
|
||||
continue
|
||||
|
||||
elif dirname == "Workbook":
|
||||
elif dirname == b"Workbook":
|
||||
success = True
|
||||
while success:
|
||||
success = self.__readSubStream(dirstrm)
|
||||
|
||||
elif dirname == "Revision Log":
|
||||
elif dirname == b"Revision Log":
|
||||
dirstrm.type = xlsstream.DirType.RevisionLog
|
||||
self.__readSubStream(dirstrm)
|
||||
|
||||
elif dirname == "EncryptionInfo":
|
||||
elif dirname == b"EncryptionInfo":
|
||||
globals.dumpBytes(dirstrm.bytes, 512)
|
||||
print("-"*globals.OutputWidth)
|
||||
globals.outputln("-"*globals.OutputWidth)
|
||||
info = msocrypto.EncryptionInfo(dirstrm.bytes)
|
||||
info.read()
|
||||
info.output()
|
||||
|
||||
@ -32,20 +32,20 @@ import xml.sax
|
||||
dtt = True
|
||||
|
||||
if dtt:
|
||||
sepstring = "\t"
|
||||
dquote = ''
|
||||
sepstring = b"\t"
|
||||
dquote = b""
|
||||
else:
|
||||
sepstring = ","
|
||||
dquote = '"'
|
||||
|
||||
sepstring = b","
|
||||
dquote = b'"'
|
||||
|
||||
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
||||
def __init__(self):
|
||||
self.output = ""
|
||||
self.output = b''
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
if name == "worksheet":
|
||||
if "name" in attrs:
|
||||
self.output += "%s\n" % attrs["name"].encode("UTF-8")
|
||||
self.output += b"%s\n" % attrs["name"].encode("UTF-8")
|
||||
elif name == "row":
|
||||
self.cells = dict()
|
||||
elif name == "label-cell" or name == "number-cell":
|
||||
@ -57,7 +57,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
||||
self.cells[int(attrs["col"])] = value
|
||||
else:
|
||||
#??
|
||||
self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
|
||||
self.output += b"%s%s" % (value.encode("UTF-8"), sepstring)
|
||||
elif name == "formula-cell":
|
||||
if "formula-result" in attrs and "col" in attrs:
|
||||
self.cells[int(attrs["col"])] = \
|
||||
@ -68,11 +68,11 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
||||
curidx = 0
|
||||
for idx, value in self.cells.items():
|
||||
self.output += sepstring * (idx - curidx)
|
||||
self.output += "%s%s%s" % (dquote, value, dquote)
|
||||
self.output += b"%s%s%s" % (dquote, value, dquote)
|
||||
curidx = idx
|
||||
self.output += "\n"
|
||||
self.output += b"\n"
|
||||
elif name == "worksheet":
|
||||
self.output += "\n"
|
||||
self.output += b"\n"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user