Modified xls and ppt filter to be compatible with python3
This commit is contained in:
parent
56f0a0f9e6
commit
d9afcdf8a3
Binary file not shown.
@ -5,10 +5,6 @@
|
|||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
#
|
#
|
||||||
|
|
||||||
# mso-dumper is not compatible with python3
|
|
||||||
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import sys, os.path, getopt
|
import sys, os.path, getopt
|
||||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||||
from msodumper import ole, pptstream, globals, olestream
|
from msodumper import ole, pptstream, globals, olestream
|
||||||
@ -51,25 +47,26 @@ class PPTDumper(object):
|
|||||||
dirnames = strm.getDirectoryNames()
|
dirnames = strm.getDirectoryNames()
|
||||||
result = True
|
result = True
|
||||||
for dirname in dirnames:
|
for dirname in dirnames:
|
||||||
if len(dirname) == 0 or dirname == 'Root Entry':
|
sdirname = globals.nulltrunc(dirname)
|
||||||
|
if len(sdirname) == 0 or sdirname == b"Root Entry":
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dirstrm = strm.getDirectoryStreamByName(dirname)
|
dirstrm = strm.getDirectoryStreamByName(dirname)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath))
|
error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
|
||||||
# The previous version was killed by the exception
|
# The previous version was killed by the exception
|
||||||
# here, so the equivalent is to break, but maybe there
|
# here, so the equivalent is to break, but maybe there
|
||||||
# is no reason to do so.
|
# is no reason to do so.
|
||||||
break
|
break
|
||||||
self.__printDirHeader(dirname, len(dirstrm.bytes))
|
self.__printDirHeader(dirname, len(dirstrm.bytes))
|
||||||
if dirname == "PowerPoint Document":
|
if sdirname == b"PowerPoint Document":
|
||||||
if not self.__readSubStream(dirstrm):
|
if not self.__readSubStream(dirstrm):
|
||||||
result = False
|
result = False
|
||||||
elif dirname == "Current User":
|
elif sdirname == b"Current User":
|
||||||
if not self.__readSubStream(dirstrm):
|
if not self.__readSubStream(dirstrm):
|
||||||
result = False
|
result = False
|
||||||
elif dirname == "\x05DocumentSummaryInformation":
|
elif sdirname == b"\x05DocumentSummaryInformation":
|
||||||
strm = olestream.PropertySetStream(dirstrm.bytes)
|
strm = olestream.PropertySetStream(dirstrm.bytes)
|
||||||
strm.read()
|
strm.read()
|
||||||
else:
|
else:
|
||||||
@ -118,26 +115,15 @@ def main (args):
|
|||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
error("error parsing input options\n")
|
error("error parsing input options\n")
|
||||||
usage(exname)
|
usage(exname)
|
||||||
return false
|
return
|
||||||
|
|
||||||
status = True
|
|
||||||
try:
|
|
||||||
dumper = PPTDumper(args[0], globals.params)
|
|
||||||
if not dumper.dump():
|
|
||||||
error("ppt-dump: dump error " + args[0] + "\n")
|
|
||||||
status = False
|
|
||||||
except:
|
|
||||||
error("ppt-dump: FAILURE (bad format?) " + args[0] + "\n")
|
|
||||||
status = False
|
|
||||||
|
|
||||||
|
dumper = PPTDumper(args[0], globals.params)
|
||||||
|
if not dumper.dump():
|
||||||
|
error("FAILURE\n")
|
||||||
if globals.params.dumpText:
|
if globals.params.dumpText:
|
||||||
print(globals.textdump.replace("\r", "\n"))
|
globals.dumptext()
|
||||||
return(status)
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if main(sys.argv):
|
main(sys.argv)
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
|
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
|
||||||
|
|||||||
@ -15,20 +15,20 @@ import os
|
|||||||
class PPTProcessData:
|
class PPTProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = ""
|
self.out = b""
|
||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += '''<html><head>''' + \
|
self.out += b'''<html><head>''' + \
|
||||||
'''<meta http-equiv="Content-Type" ''' + \
|
b'''<meta http-equiv="Content-Type" ''' + \
|
||||||
'''content="text/html;charset=UTF-8">''' + \
|
b'''content="text/html;charset=UTF-8">''' + \
|
||||||
'''</head><body><pre>'''
|
b'''</head><body><pre>'''
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
self.out += self.em.htmlescape(line) + "<br>\n"
|
self.out += self.em.htmlescape(line) + b"<br>\n"
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
return self.out + '''</pre></body></html>'''
|
return self.out + b'''</pre></body></html>'''
|
||||||
|
|
||||||
class PPTFilter:
|
class PPTFilter:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
|
|||||||
@ -15,9 +15,9 @@ import xml.sax
|
|||||||
class XLSProcessData:
|
class XLSProcessData:
|
||||||
def __init__(self, em, ishtml = False):
|
def __init__(self, em, ishtml = False):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = ""
|
self.out = b""
|
||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
self.xmldata = ""
|
self.xmldata = b""
|
||||||
self.ishtml = ishtml
|
self.ishtml = ishtml
|
||||||
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
@ -25,10 +25,10 @@ class XLSProcessData:
|
|||||||
self.out += line + "\n"
|
self.out += line + "\n"
|
||||||
return
|
return
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += '''<html><head>''' + \
|
self.out += b'''<html><head>''' + \
|
||||||
'''<meta http-equiv="Content-Type" ''' + \
|
b'''<meta http-equiv="Content-Type" ''' + \
|
||||||
'''content="text/html;charset=UTF-8">''' + \
|
b'''content="text/html;charset=UTF-8">''' + \
|
||||||
'''</head><body><pre>'''
|
b'''</head><body><pre>'''
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
self.xmldata += line
|
self.xmldata += line
|
||||||
|
|
||||||
@ -36,9 +36,9 @@ class XLSProcessData:
|
|||||||
if self.ishtml:
|
if self.ishtml:
|
||||||
return self.out
|
return self.out
|
||||||
handler = xlsxmltocsv.XlsXmlHandler()
|
handler = xlsxmltocsv.XlsXmlHandler()
|
||||||
data = xml.sax.parseString(self.xmldata, handler)
|
xml.sax.parseString(self.xmldata, handler)
|
||||||
self.out += self.em.htmlescape(handler.output)
|
self.out += self.em.htmlescape(handler.output)
|
||||||
return self.out + '''</pre></body></html>'''
|
return self.out + b'''</pre></body></html>'''
|
||||||
|
|
||||||
class XLSFilter:
|
class XLSFilter:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
@ -56,7 +56,7 @@ class XLSFilter:
|
|||||||
# Some HTML files masquerade as XLS
|
# Some HTML files masquerade as XLS
|
||||||
try:
|
try:
|
||||||
data = open(fn, 'rb').read(512)
|
data = open(fn, 'rb').read(512)
|
||||||
if data.find('html') != -1 or data.find('HTML') != -1:
|
if data.find(b'html') != -1 or data.find(b'HTML') != -1:
|
||||||
return ("cat", XLSProcessData(self.em, True))
|
return ("cat", XLSProcessData(self.em, True))
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
|
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
|
||||||
|
|||||||
@ -4,11 +4,7 @@
|
|||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
#
|
#
|
||||||
|
from builtins import range
|
||||||
# mso-dumper is not compatible with python3
|
|
||||||
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import sys, os.path, optparse
|
import sys, os.path, optparse
|
||||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||||
|
|
||||||
@ -21,8 +17,8 @@ def equalsName (name, array):
|
|||||||
if len(name) != len(array):
|
if len(name) != len(array):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for i in xrange(0, len(name)):
|
for i in range(0, len(name)):
|
||||||
if ord(name[i]) != array[i]:
|
if globals.indexbytes(name, i) != array[i]:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
@ -50,13 +46,13 @@ class XLDumper(object):
|
|||||||
def __printDirHeader (self, direntry, byteLen):
|
def __printDirHeader (self, direntry, byteLen):
|
||||||
dirname = direntry.Name
|
dirname = direntry.Name
|
||||||
dirname = globals.encodeName(dirname)
|
dirname = globals.encodeName(dirname)
|
||||||
print("")
|
globals.outputln("")
|
||||||
print("="*globals.OutputWidth)
|
globals.outputln("="*globals.OutputWidth)
|
||||||
if direntry.isStorage():
|
if direntry.isStorage():
|
||||||
print("%s (storage)"%dirname)
|
globals.outputln("%s (storage)"%dirname)
|
||||||
else:
|
else:
|
||||||
print("%s (stream, size: %d bytes)"%(dirname, byteLen))
|
globals.outputln("%s (stream, size: %d bytes)"%(dirname, byteLen))
|
||||||
print("-"*globals.OutputWidth)
|
globals.outputln("-"*globals.OutputWidth)
|
||||||
|
|
||||||
def __parseFile (self):
|
def __parseFile (self):
|
||||||
file = open(self.filepath, 'rb')
|
file = open(self.filepath, 'rb')
|
||||||
@ -71,38 +67,34 @@ class XLDumper(object):
|
|||||||
root = docroot.appendElement('xls-dump')
|
root = docroot.appendElement('xls-dump')
|
||||||
|
|
||||||
for d in dirs:
|
for d in dirs:
|
||||||
if d.Name != "Workbook":
|
if d.Name != b"Workbook":
|
||||||
# for now, we only dump the Workbook directory stream.
|
# for now, we only dump the Workbook directory stream.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
dirstrm = self.strm.getDirectoryStream(d)
|
dirstrm = self.strm.getDirectoryStream(d)
|
||||||
data = self.__readSubStreamXML(dirstrm)
|
data = self.__readSubStreamXML(dirstrm)
|
||||||
self.__dumpDataAsXML(data, root)
|
self.__dumpDataAsXML(data, root)
|
||||||
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
|
|
||||||
|
node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
|
||||||
|
|
||||||
def dumpCanonicalXML (self):
|
def dumpCanonicalXML (self):
|
||||||
try:
|
self.__parseFile()
|
||||||
self.__parseFile()
|
docroot = node.Root()
|
||||||
docroot = node.Root()
|
root = docroot.appendElement('xls-dump')
|
||||||
root = docroot.appendElement('xls-dump')
|
|
||||||
|
|
||||||
dirEntries = self.strm.getDirectoryEntries()
|
dirEntries = self.strm.getDirectoryEntries()
|
||||||
for entry in dirEntries:
|
for entry in dirEntries:
|
||||||
dirname = entry.Name
|
dirname = entry.Name
|
||||||
if dirname != "Workbook":
|
if dirname != b"Workbook":
|
||||||
# for now, we only dump the Workbook directory stream.
|
# for now, we only dump the Workbook directory stream.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
dirstrm = self.strm.getDirectoryStream(entry)
|
|
||||||
wbmodel = self.__buildWorkbookModel(dirstrm)
|
|
||||||
wbmodel.encrypted = self.strmData.encrypted
|
|
||||||
root.appendChild(wbmodel.createDOM())
|
|
||||||
|
|
||||||
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
|
|
||||||
|
|
||||||
except Exception as err:
|
dirstrm = self.strm.getDirectoryStream(entry)
|
||||||
print("xls-dump.py: error: %s" % err, file=sys.stderr)
|
wbmodel = self.__buildWorkbookModel(dirstrm)
|
||||||
sys.exit(1)
|
wbmodel.encrypted = self.strmData.encrypted
|
||||||
|
root.appendChild(wbmodel.createDOM())
|
||||||
|
|
||||||
|
node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
|
||||||
|
|
||||||
def dump (self):
|
def dump (self):
|
||||||
self.__parseFile()
|
self.__parseFile()
|
||||||
@ -123,18 +115,18 @@ class XLDumper(object):
|
|||||||
if entry.isStorage():
|
if entry.isStorage():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif dirname == "Workbook":
|
elif dirname == b"Workbook":
|
||||||
success = True
|
success = True
|
||||||
while success:
|
while success:
|
||||||
success = self.__readSubStream(dirstrm)
|
success = self.__readSubStream(dirstrm)
|
||||||
|
|
||||||
elif dirname == "Revision Log":
|
elif dirname == b"Revision Log":
|
||||||
dirstrm.type = xlsstream.DirType.RevisionLog
|
dirstrm.type = xlsstream.DirType.RevisionLog
|
||||||
self.__readSubStream(dirstrm)
|
self.__readSubStream(dirstrm)
|
||||||
|
|
||||||
elif dirname == "EncryptionInfo":
|
elif dirname == b"EncryptionInfo":
|
||||||
globals.dumpBytes(dirstrm.bytes, 512)
|
globals.dumpBytes(dirstrm.bytes, 512)
|
||||||
print("-"*globals.OutputWidth)
|
globals.outputln("-"*globals.OutputWidth)
|
||||||
info = msocrypto.EncryptionInfo(dirstrm.bytes)
|
info = msocrypto.EncryptionInfo(dirstrm.bytes)
|
||||||
info.read()
|
info.read()
|
||||||
info.output()
|
info.output()
|
||||||
|
|||||||
@ -32,20 +32,20 @@ import xml.sax
|
|||||||
dtt = True
|
dtt = True
|
||||||
|
|
||||||
if dtt:
|
if dtt:
|
||||||
sepstring = "\t"
|
sepstring = b"\t"
|
||||||
dquote = ''
|
dquote = b""
|
||||||
else:
|
else:
|
||||||
sepstring = ","
|
sepstring = b","
|
||||||
dquote = '"'
|
dquote = b'"'
|
||||||
|
|
||||||
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.output = ""
|
self.output = b''
|
||||||
|
|
||||||
def startElement(self, name, attrs):
|
def startElement(self, name, attrs):
|
||||||
if name == "worksheet":
|
if name == "worksheet":
|
||||||
if "name" in attrs:
|
if "name" in attrs:
|
||||||
self.output += "%s\n" % attrs["name"].encode("UTF-8")
|
self.output += b"%s\n" % attrs["name"].encode("UTF-8")
|
||||||
elif name == "row":
|
elif name == "row":
|
||||||
self.cells = dict()
|
self.cells = dict()
|
||||||
elif name == "label-cell" or name == "number-cell":
|
elif name == "label-cell" or name == "number-cell":
|
||||||
@ -57,7 +57,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
self.cells[int(attrs["col"])] = value
|
self.cells[int(attrs["col"])] = value
|
||||||
else:
|
else:
|
||||||
#??
|
#??
|
||||||
self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
|
self.output += b"%s%s" % (value.encode("UTF-8"), sepstring)
|
||||||
elif name == "formula-cell":
|
elif name == "formula-cell":
|
||||||
if "formula-result" in attrs and "col" in attrs:
|
if "formula-result" in attrs and "col" in attrs:
|
||||||
self.cells[int(attrs["col"])] = \
|
self.cells[int(attrs["col"])] = \
|
||||||
@ -68,11 +68,11 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
curidx = 0
|
curidx = 0
|
||||||
for idx, value in self.cells.items():
|
for idx, value in self.cells.items():
|
||||||
self.output += sepstring * (idx - curidx)
|
self.output += sepstring * (idx - curidx)
|
||||||
self.output += "%s%s%s" % (dquote, value, dquote)
|
self.output += b"%s%s%s" % (dquote, value, dquote)
|
||||||
curidx = idx
|
curidx = idx
|
||||||
self.output += "\n"
|
self.output += b"\n"
|
||||||
elif name == "worksheet":
|
elif name == "worksheet":
|
||||||
self.output += "\n"
|
self.output += b"\n"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user