Modified the xls and ppt filters to be compatible with Python 3

This commit is contained in:
Jean-Francois Dockes 2018-03-08 15:51:12 +01:00
parent 56f0a0f9e6
commit d9afcdf8a3
6 changed files with 70 additions and 92 deletions

Binary file not shown.

View File

@ -5,10 +5,6 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# mso-dumper is not compatible with python3
from __future__ import print_function
import sys, os.path, getopt
sys.path.append(sys.path[0]+"/msodump.zip")
from msodumper import ole, pptstream, globals, olestream
@ -51,25 +47,26 @@ class PPTDumper(object):
dirnames = strm.getDirectoryNames()
result = True
for dirname in dirnames:
if len(dirname) == 0 or dirname == 'Root Entry':
sdirname = globals.nulltrunc(dirname)
if len(sdirname) == 0 or sdirname == b"Root Entry":
continue
try:
dirstrm = strm.getDirectoryStreamByName(dirname)
except Exception as err:
error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath))
error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
# The previous version was killed by the exception
# here, so the equivalent is to break, but maybe there
# is no reason to do so.
break
self.__printDirHeader(dirname, len(dirstrm.bytes))
if dirname == "PowerPoint Document":
if sdirname == b"PowerPoint Document":
if not self.__readSubStream(dirstrm):
result = False
elif dirname == "Current User":
elif sdirname == b"Current User":
if not self.__readSubStream(dirstrm):
result = False
elif dirname == "\x05DocumentSummaryInformation":
elif sdirname == b"\x05DocumentSummaryInformation":
strm = olestream.PropertySetStream(dirstrm.bytes)
strm.read()
else:
@ -118,26 +115,15 @@ def main (args):
except getopt.GetoptError:
error("error parsing input options\n")
usage(exname)
return false
status = True
try:
dumper = PPTDumper(args[0], globals.params)
if not dumper.dump():
error("ppt-dump: dump error " + args[0] + "\n")
status = False
except:
error("ppt-dump: FAILURE (bad format?) " + args[0] + "\n")
status = False
return
dumper = PPTDumper(args[0], globals.params)
if not dumper.dump():
error("FAILURE\n")
if globals.params.dumpText:
print(globals.textdump.replace("\r", "\n"))
return(status)
globals.dumptext()
if __name__ == '__main__':
if main(sys.argv):
sys.exit(0)
else:
sys.exit(1)
main(sys.argv)
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:

View File

@ -15,20 +15,20 @@ import os
class PPTProcessData:
def __init__(self, em):
self.em = em
self.out = ""
self.out = b""
self.gotdata = 0
def takeLine(self, line):
if not self.gotdata:
self.out += '''<html><head>''' + \
'''<meta http-equiv="Content-Type" ''' + \
'''content="text/html;charset=UTF-8">''' + \
'''</head><body><pre>'''
self.out += b'''<html><head>''' + \
b'''<meta http-equiv="Content-Type" ''' + \
b'''content="text/html;charset=UTF-8">''' + \
b'''</head><body><pre>'''
self.gotdata = True
self.out += self.em.htmlescape(line) + "<br>\n"
self.out += self.em.htmlescape(line) + b"<br>\n"
def wrapData(self):
return self.out + '''</pre></body></html>'''
return self.out + b'''</pre></body></html>'''
class PPTFilter:
def __init__(self, em):

View File

@ -15,9 +15,9 @@ import xml.sax
class XLSProcessData:
def __init__(self, em, ishtml = False):
self.em = em
self.out = ""
self.out = b""
self.gotdata = 0
self.xmldata = ""
self.xmldata = b""
self.ishtml = ishtml
def takeLine(self, line):
@ -25,10 +25,10 @@ class XLSProcessData:
self.out += line + "\n"
return
if not self.gotdata:
self.out += '''<html><head>''' + \
'''<meta http-equiv="Content-Type" ''' + \
'''content="text/html;charset=UTF-8">''' + \
'''</head><body><pre>'''
self.out += b'''<html><head>''' + \
b'''<meta http-equiv="Content-Type" ''' + \
b'''content="text/html;charset=UTF-8">''' + \
b'''</head><body><pre>'''
self.gotdata = True
self.xmldata += line
@ -36,9 +36,9 @@ class XLSProcessData:
if self.ishtml:
return self.out
handler = xlsxmltocsv.XlsXmlHandler()
data = xml.sax.parseString(self.xmldata, handler)
xml.sax.parseString(self.xmldata, handler)
self.out += self.em.htmlescape(handler.output)
return self.out + '''</pre></body></html>'''
return self.out + b'''</pre></body></html>'''
class XLSFilter:
def __init__(self, em):
@ -56,7 +56,7 @@ class XLSFilter:
# Some HTML files masquerade as XLS
try:
data = open(fn, 'rb').read(512)
if data.find('html') != -1 or data.find('HTML') != -1:
if data.find(b'html') != -1 or data.find(b'HTML') != -1:
return ("cat", XLSProcessData(self.em, True))
except Exception as err:
self.em.rclog("Error reading %s:%s" % (fn, str(err)))

View File

@ -4,11 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# mso-dumper is not compatible with python3
from __future__ import print_function
from builtins import range
import sys, os.path, optparse
sys.path.append(sys.path[0]+"/msodump.zip")
@ -21,8 +17,8 @@ def equalsName (name, array):
if len(name) != len(array):
return False
for i in xrange(0, len(name)):
if ord(name[i]) != array[i]:
for i in range(0, len(name)):
if globals.indexbytes(name, i) != array[i]:
return False
return True
@ -50,13 +46,13 @@ class XLDumper(object):
def __printDirHeader (self, direntry, byteLen):
dirname = direntry.Name
dirname = globals.encodeName(dirname)
print("")
print("="*globals.OutputWidth)
globals.outputln("")
globals.outputln("="*globals.OutputWidth)
if direntry.isStorage():
print("%s (storage)"%dirname)
globals.outputln("%s (storage)"%dirname)
else:
print("%s (stream, size: %d bytes)"%(dirname, byteLen))
print("-"*globals.OutputWidth)
globals.outputln("%s (stream, size: %d bytes)"%(dirname, byteLen))
globals.outputln("-"*globals.OutputWidth)
def __parseFile (self):
file = open(self.filepath, 'rb')
@ -71,38 +67,34 @@ class XLDumper(object):
root = docroot.appendElement('xls-dump')
for d in dirs:
if d.Name != "Workbook":
if d.Name != b"Workbook":
# for now, we only dump the Workbook directory stream.
continue
dirstrm = self.strm.getDirectoryStream(d)
data = self.__readSubStreamXML(dirstrm)
self.__dumpDataAsXML(data, root)
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
def dumpCanonicalXML (self):
try:
self.__parseFile()
docroot = node.Root()
root = docroot.appendElement('xls-dump')
self.__parseFile()
docroot = node.Root()
root = docroot.appendElement('xls-dump')
dirEntries = self.strm.getDirectoryEntries()
for entry in dirEntries:
dirname = entry.Name
if dirname != "Workbook":
# for now, we only dump the Workbook directory stream.
continue
dirstrm = self.strm.getDirectoryStream(entry)
wbmodel = self.__buildWorkbookModel(dirstrm)
wbmodel.encrypted = self.strmData.encrypted
root.appendChild(wbmodel.createDOM())
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
dirEntries = self.strm.getDirectoryEntries()
for entry in dirEntries:
dirname = entry.Name
if dirname != b"Workbook":
# for now, we only dump the Workbook directory stream.
continue
except Exception as err:
print("xls-dump.py: error: %s" % err, file=sys.stderr)
sys.exit(1)
dirstrm = self.strm.getDirectoryStream(entry)
wbmodel = self.__buildWorkbookModel(dirstrm)
wbmodel.encrypted = self.strmData.encrypted
root.appendChild(wbmodel.createDOM())
node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
def dump (self):
self.__parseFile()
@ -123,18 +115,18 @@ class XLDumper(object):
if entry.isStorage():
continue
elif dirname == "Workbook":
elif dirname == b"Workbook":
success = True
while success:
success = self.__readSubStream(dirstrm)
elif dirname == "Revision Log":
elif dirname == b"Revision Log":
dirstrm.type = xlsstream.DirType.RevisionLog
self.__readSubStream(dirstrm)
elif dirname == "EncryptionInfo":
elif dirname == b"EncryptionInfo":
globals.dumpBytes(dirstrm.bytes, 512)
print("-"*globals.OutputWidth)
globals.outputln("-"*globals.OutputWidth)
info = msocrypto.EncryptionInfo(dirstrm.bytes)
info.read()
info.output()

View File

@ -32,20 +32,20 @@ import xml.sax
dtt = True
if dtt:
sepstring = "\t"
dquote = ''
sepstring = b"\t"
dquote = b""
else:
sepstring = ","
dquote = '"'
sepstring = b","
dquote = b'"'
class XlsXmlHandler(xml.sax.handler.ContentHandler):
def __init__(self):
self.output = ""
self.output = b''
def startElement(self, name, attrs):
if name == "worksheet":
if "name" in attrs:
self.output += "%s\n" % attrs["name"].encode("UTF-8")
self.output += b"%s\n" % attrs["name"].encode("UTF-8")
elif name == "row":
self.cells = dict()
elif name == "label-cell" or name == "number-cell":
@ -57,7 +57,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
self.cells[int(attrs["col"])] = value
else:
#??
self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
self.output += b"%s%s" % (value.encode("UTF-8"), sepstring)
elif name == "formula-cell":
if "formula-result" in attrs and "col" in attrs:
self.cells[int(attrs["col"])] = \
@ -68,11 +68,11 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
curidx = 0
for idx, value in self.cells.items():
self.output += sepstring * (idx - curidx)
self.output += "%s%s%s" % (dquote, value, dquote)
self.output += b"%s%s%s" % (dquote, value, dquote)
curidx = idx
self.output += "\n"
self.output += b"\n"
elif name == "worksheet":
self.output += "\n"
self.output += b"\n"
if __name__ == '__main__':