diff --git a/src/filters/msodump.zip b/src/filters/msodump.zip index a1b6d9e6..e39359d5 100644 Binary files a/src/filters/msodump.zip and b/src/filters/msodump.zip differ diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py index a84eec79..f41a9f39 100755 --- a/src/filters/ppt-dump.py +++ b/src/filters/ppt-dump.py @@ -5,10 +5,6 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. # -# mso-dumper is not compatible with python3 - -from __future__ import print_function - import sys, os.path, getopt sys.path.append(sys.path[0]+"/msodump.zip") from msodumper import ole, pptstream, globals, olestream @@ -51,25 +47,26 @@ class PPTDumper(object): dirnames = strm.getDirectoryNames() result = True for dirname in dirnames: - if len(dirname) == 0 or dirname == 'Root Entry': + sdirname = globals.nulltrunc(dirname) + if len(sdirname) == 0 or sdirname == b"Root Entry": continue try: dirstrm = strm.getDirectoryStreamByName(dirname) except Exception as err: - error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath)) + error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err))) # The previous version was killed by the exception # here, so the equivalent is to break, but maybe there # is no reason to do so. 
break self.__printDirHeader(dirname, len(dirstrm.bytes)) - if dirname == "PowerPoint Document": + if sdirname == b"PowerPoint Document": if not self.__readSubStream(dirstrm): result = False - elif dirname == "Current User": + elif sdirname == b"Current User": if not self.__readSubStream(dirstrm): result = False - elif dirname == "\x05DocumentSummaryInformation": + elif sdirname == b"\x05DocumentSummaryInformation": strm = olestream.PropertySetStream(dirstrm.bytes) strm.read() else: @@ -118,26 +115,15 @@ def main (args): except getopt.GetoptError: error("error parsing input options\n") usage(exname) - return false - - status = True - try: - dumper = PPTDumper(args[0], globals.params) - if not dumper.dump(): - error("ppt-dump: dump error " + args[0] + "\n") - status = False - except: - error("ppt-dump: FAILURE (bad format?) " + args[0] + "\n") - status = False + return + dumper = PPTDumper(args[0], globals.params) + if not dumper.dump(): + error("FAILURE\n") if globals.params.dumpText: - print(globals.textdump.replace("\r", "\n")) - return(status) - + globals.dumptext() + if __name__ == '__main__': - if main(sys.argv): - sys.exit(0) - else: - sys.exit(1) + main(sys.argv) # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py index d86cc897..a4e50265 100755 --- a/src/filters/rclppt.py +++ b/src/filters/rclppt.py @@ -15,20 +15,20 @@ import os class PPTProcessData: def __init__(self, em): self.em = em - self.out = "" + self.out = b"" self.gotdata = 0 def takeLine(self, line): if not self.gotdata: - self.out += '''
''' + \ - '''''' + \ - ''''''
+ self.out += b'''''' + \
+ b'''''' + \
+ b''''''
self.gotdata = True
- self.out += self.em.htmlescape(line) + "<br>\n"
+ self.out += self.em.htmlescape(line) + b"<br>\n"
def wrapData(self):
- return self.out + ''''''
+ return self.out + b''''''
class PPTFilter:
def __init__(self, em):
diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py
index 82fc2379..c7b2343a 100755
--- a/src/filters/rclxls.py
+++ b/src/filters/rclxls.py
@@ -15,9 +15,9 @@ import xml.sax
class XLSProcessData:
def __init__(self, em, ishtml = False):
self.em = em
- self.out = ""
+ self.out = b""
self.gotdata = 0
- self.xmldata = ""
+ self.xmldata = b""
self.ishtml = ishtml
def takeLine(self, line):
@@ -25,10 +25,10 @@ class XLSProcessData:
- self.out += line + "\n"
+ self.out += line + b"\n"
return
if not self.gotdata:
- self.out += '''''' + \
- '''''' + \
- ''''''
+ self.out += b'''''' + \
+ b'''''' + \
+ b''''''
self.gotdata = True
self.xmldata += line
@@ -36,9 +36,9 @@ class XLSProcessData:
if self.ishtml:
return self.out
handler = xlsxmltocsv.XlsXmlHandler()
- data = xml.sax.parseString(self.xmldata, handler)
+ xml.sax.parseString(self.xmldata, handler)
self.out += self.em.htmlescape(handler.output)
- return self.out + ''''''
+ return self.out + b''''''
class XLSFilter:
def __init__(self, em):
@@ -56,7 +56,7 @@ class XLSFilter:
# Some HTML files masquerade as XLS
try:
data = open(fn, 'rb').read(512)
- if data.find('html') != -1 or data.find('HTML') != -1:
+ if data.find(b'html') != -1 or data.find(b'HTML') != -1:
return ("cat", XLSProcessData(self.em, True))
except Exception as err:
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
diff --git a/src/filters/xls-dump.py b/src/filters/xls-dump.py
index 15613f35..abffa330 100755
--- a/src/filters/xls-dump.py
+++ b/src/filters/xls-dump.py
@@ -4,11 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
-
-# mso-dumper is not compatible with python3
-
-from __future__ import print_function
-
+from builtins import range
import sys, os.path, optparse
sys.path.append(sys.path[0]+"/msodump.zip")
@@ -21,8 +17,8 @@ def equalsName (name, array):
if len(name) != len(array):
return False
- for i in xrange(0, len(name)):
- if ord(name[i]) != array[i]:
+ for i in range(0, len(name)):
+ if globals.indexbytes(name, i) != array[i]:
return False
return True
@@ -50,13 +46,13 @@ class XLDumper(object):
def __printDirHeader (self, direntry, byteLen):
dirname = direntry.Name
dirname = globals.encodeName(dirname)
- print("")
- print("="*globals.OutputWidth)
+ globals.outputln("")
+ globals.outputln("="*globals.OutputWidth)
if direntry.isStorage():
- print("%s (storage)"%dirname)
+ globals.outputln("%s (storage)"%dirname)
else:
- print("%s (stream, size: %d bytes)"%(dirname, byteLen))
- print("-"*globals.OutputWidth)
+ globals.outputln("%s (stream, size: %d bytes)"%(dirname, byteLen))
+ globals.outputln("-"*globals.OutputWidth)
def __parseFile (self):
file = open(self.filepath, 'rb')
@@ -71,38 +67,34 @@ class XLDumper(object):
root = docroot.appendElement('xls-dump')
for d in dirs:
- if d.Name != "Workbook":
+ if d.Name != b"Workbook":
# for now, we only dump the Workbook directory stream.
continue
dirstrm = self.strm.getDirectoryStream(d)
data = self.__readSubStreamXML(dirstrm)
self.__dumpDataAsXML(data, root)
- node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
+
+ node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
def dumpCanonicalXML (self):
- try:
- self.__parseFile()
- docroot = node.Root()
- root = docroot.appendElement('xls-dump')
+ self.__parseFile()
+ docroot = node.Root()
+ root = docroot.appendElement('xls-dump')
- dirEntries = self.strm.getDirectoryEntries()
- for entry in dirEntries:
- dirname = entry.Name
- if dirname != "Workbook":
- # for now, we only dump the Workbook directory stream.
- continue
-
- dirstrm = self.strm.getDirectoryStream(entry)
- wbmodel = self.__buildWorkbookModel(dirstrm)
- wbmodel.encrypted = self.strmData.encrypted
- root.appendChild(wbmodel.createDOM())
-
- node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
+ dirEntries = self.strm.getDirectoryEntries()
+ for entry in dirEntries:
+ dirname = entry.Name
+ if dirname != b"Workbook":
+ # for now, we only dump the Workbook directory stream.
+ continue
- except Exception as err:
- print("xls-dump.py: error: %s" % err, file=sys.stderr)
- sys.exit(1)
+ dirstrm = self.strm.getDirectoryStream(entry)
+ wbmodel = self.__buildWorkbookModel(dirstrm)
+ wbmodel.encrypted = self.strmData.encrypted
+ root.appendChild(wbmodel.createDOM())
+
+ node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)
def dump (self):
self.__parseFile()
@@ -123,18 +115,18 @@ class XLDumper(object):
if entry.isStorage():
continue
- elif dirname == "Workbook":
+ elif dirname == b"Workbook":
success = True
while success:
success = self.__readSubStream(dirstrm)
- elif dirname == "Revision Log":
+ elif dirname == b"Revision Log":
dirstrm.type = xlsstream.DirType.RevisionLog
self.__readSubStream(dirstrm)
- elif dirname == "EncryptionInfo":
+ elif dirname == b"EncryptionInfo":
globals.dumpBytes(dirstrm.bytes, 512)
- print("-"*globals.OutputWidth)
+ globals.outputln("-"*globals.OutputWidth)
info = msocrypto.EncryptionInfo(dirstrm.bytes)
info.read()
info.output()
diff --git a/src/filters/xlsxmltocsv.py b/src/filters/xlsxmltocsv.py
index 7fa12e58..a8930167 100755
--- a/src/filters/xlsxmltocsv.py
+++ b/src/filters/xlsxmltocsv.py
@@ -32,20 +32,20 @@ import xml.sax
dtt = True
if dtt:
- sepstring = "\t"
- dquote = ''
+ sepstring = b"\t"
+ dquote = b""
else:
- sepstring = ","
- dquote = '"'
-
+ sepstring = b","
+ dquote = b'"'
+
class XlsXmlHandler(xml.sax.handler.ContentHandler):
def __init__(self):
- self.output = ""
+ self.output = b''
def startElement(self, name, attrs):
if name == "worksheet":
if "name" in attrs:
- self.output += "%s\n" % attrs["name"].encode("UTF-8")
+ self.output += b"%s\n" % attrs["name"].encode("UTF-8")
elif name == "row":
self.cells = dict()
elif name == "label-cell" or name == "number-cell":
@@ -57,7 +57,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
self.cells[int(attrs["col"])] = value
else:
#??
- self.output += "%s%s" % (value.encode("UTF-8"), sepstring)
+ self.output += b"%s%s" % (value.encode("UTF-8"), sepstring)
elif name == "formula-cell":
if "formula-result" in attrs and "col" in attrs:
self.cells[int(attrs["col"])] = \
@@ -68,11 +68,11 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
curidx = 0
for idx, value in self.cells.items():
self.output += sepstring * (idx - curidx)
- self.output += "%s%s%s" % (dquote, value, dquote)
+ self.output += b"%s%s%s" % (dquote, value, dquote)
curidx = idx
- self.output += "\n"
+ self.output += b"\n"
elif name == "worksheet":
- self.output += "\n"
+ self.output += b"\n"
if __name__ == '__main__':