diff --git a/src/filters/msodump.zip b/src/filters/msodump.zip index a1b6d9e6..e39359d5 100644 Binary files a/src/filters/msodump.zip and b/src/filters/msodump.zip differ diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py index a84eec79..f41a9f39 100755 --- a/src/filters/ppt-dump.py +++ b/src/filters/ppt-dump.py @@ -5,10 +5,6 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. # -# mso-dumper is not compatible with python3 - -from __future__ import print_function - import sys, os.path, getopt sys.path.append(sys.path[0]+"/msodump.zip") from msodumper import ole, pptstream, globals, olestream @@ -51,25 +47,26 @@ class PPTDumper(object): dirnames = strm.getDirectoryNames() result = True for dirname in dirnames: - if len(dirname) == 0 or dirname == 'Root Entry': + sdirname = globals.nulltrunc(dirname) + if len(sdirname) == 0 or sdirname == b"Root Entry": continue try: dirstrm = strm.getDirectoryStreamByName(dirname) except Exception as err: - error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath)) + error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err))) # The previous version was killed by the exception # here, so the equivalent is to break, but maybe there # is no reason to do so. break self.__printDirHeader(dirname, len(dirstrm.bytes)) - if dirname == "PowerPoint Document": + if sdirname == b"PowerPoint Document": if not self.__readSubStream(dirstrm): result = False - elif dirname == "Current User": + elif sdirname == b"Current User": if not self.__readSubStream(dirstrm): result = False - elif dirname == "\x05DocumentSummaryInformation": + elif sdirname == b"\x05DocumentSummaryInformation": strm = olestream.PropertySetStream(dirstrm.bytes) strm.read() else: @@ -118,26 +115,15 @@ def main (args): except getopt.GetoptError: error("error parsing input options\n") usage(exname) - return false - - status = True - try: - dumper = PPTDumper(args[0], globals.params) - if not dumper.dump(): - error("ppt-dump: dump error " + args[0] + "\n") - status = False - except: - error("ppt-dump: FAILURE (bad format?) " + args[0] + "\n") - status = False + return + dumper = PPTDumper(args[0], globals.params) + if not dumper.dump(): + error("FAILURE\n") if globals.params.dumpText: - print(globals.textdump.replace("\r", "\n")) - return(status) - + globals.dumptext() + if __name__ == '__main__': - if main(sys.argv): - sys.exit(0) - else: - sys.exit(1) + main(sys.argv) # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py index d86cc897..a4e50265 100755 --- a/src/filters/rclppt.py +++ b/src/filters/rclppt.py @@ -15,20 +15,20 @@ import os class PPTProcessData: def __init__(self, em): self.em = em - self.out = "" + self.out = b"" self.gotdata = 0 def takeLine(self, line): if not self.gotdata: - self.out += '''''' + \ - '''''' + \ - '''
'''
+            self.out += b'''''' + \
+                        b'''''' + \
+                        b'''
'''
             self.gotdata = True
-        self.out += self.em.htmlescape(line) + "
\n" + self.out += self.em.htmlescape(line) + b"
\n" def wrapData(self): - return self.out + '''
''' + return self.out + b'''
''' class PPTFilter: def __init__(self, em): diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py index 82fc2379..c7b2343a 100755 --- a/src/filters/rclxls.py +++ b/src/filters/rclxls.py @@ -15,9 +15,9 @@ import xml.sax class XLSProcessData: def __init__(self, em, ishtml = False): self.em = em - self.out = "" + self.out = b"" self.gotdata = 0 - self.xmldata = "" + self.xmldata = b"" self.ishtml = ishtml def takeLine(self, line): @@ -25,10 +25,10 @@ class XLSProcessData: self.out += line + "\n" return if not self.gotdata: - self.out += '''''' + \ - '''''' + \ - '''
'''
+            self.out += b'''''' + \
+                        b'''''' + \
+                        b'''
'''
             self.gotdata = True
         self.xmldata += line
 
@@ -36,9 +36,9 @@ class XLSProcessData:
         if self.ishtml:
             return self.out
         handler =  xlsxmltocsv.XlsXmlHandler()
-        data = xml.sax.parseString(self.xmldata, handler)
+        xml.sax.parseString(self.xmldata, handler)
         self.out += self.em.htmlescape(handler.output)
-        return self.out + '''
''' + return self.out + b'''
''' class XLSFilter: def __init__(self, em): @@ -56,7 +56,7 @@ class XLSFilter: # Some HTML files masquerade as XLS try: data = open(fn, 'rb').read(512) - if data.find('html') != -1 or data.find('HTML') != -1: + if data.find(b'html') != -1 or data.find(b'HTML') != -1: return ("cat", XLSProcessData(self.em, True)) except Exception as err: self.em.rclog("Error reading %s:%s" % (fn, str(err))) diff --git a/src/filters/xls-dump.py b/src/filters/xls-dump.py index 15613f35..abffa330 100755 --- a/src/filters/xls-dump.py +++ b/src/filters/xls-dump.py @@ -4,11 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # - -# mso-dumper is not compatible with python3 - -from __future__ import print_function - +from builtins import range import sys, os.path, optparse sys.path.append(sys.path[0]+"/msodump.zip") @@ -21,8 +17,8 @@ def equalsName (name, array): if len(name) != len(array): return False - for i in xrange(0, len(name)): - if ord(name[i]) != array[i]: + for i in range(0, len(name)): + if globals.indexbytes(name, i) != array[i]: return False return True @@ -50,13 +46,13 @@ class XLDumper(object): def __printDirHeader (self, direntry, byteLen): dirname = direntry.Name dirname = globals.encodeName(dirname) - print("") - print("="*globals.OutputWidth) + globals.outputln("") + globals.outputln("="*globals.OutputWidth) if direntry.isStorage(): - print("%s (storage)"%dirname) + globals.outputln("%s (storage)"%dirname) else: - print("%s (stream, size: %d bytes)"%(dirname, byteLen)) - print("-"*globals.OutputWidth) + globals.outputln("%s (stream, size: %d bytes)"%(dirname, byteLen)) + globals.outputln("-"*globals.OutputWidth) def __parseFile (self): file = open(self.filepath, 'rb') @@ -71,38 +67,34 @@ class XLDumper(object): root = docroot.appendElement('xls-dump') for d in dirs: - if d.Name != "Workbook": + if d.Name != b"Workbook": # for now, we only dump the Workbook directory stream. continue dirstrm = self.strm.getDirectoryStream(d) data = self.__readSubStreamXML(dirstrm) self.__dumpDataAsXML(data, root) - node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8) + + node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8) def dumpCanonicalXML (self): - try: - self.__parseFile() - docroot = node.Root() - root = docroot.appendElement('xls-dump') + self.__parseFile() + docroot = node.Root() + root = docroot.appendElement('xls-dump') - dirEntries = self.strm.getDirectoryEntries() - for entry in dirEntries: - dirname = entry.Name - if dirname != "Workbook": - # for now, we only dump the Workbook directory stream. - continue - - dirstrm = self.strm.getDirectoryStream(entry) - wbmodel = self.__buildWorkbookModel(dirstrm) - wbmodel.encrypted = self.strmData.encrypted - root.appendChild(wbmodel.createDOM()) - - node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8) + dirEntries = self.strm.getDirectoryEntries() + for entry in dirEntries: + dirname = entry.Name + if dirname != b"Workbook": + # for now, we only dump the Workbook directory stream. + continue - except Exception as err: - print("xls-dump.py: error: %s" % err, file=sys.stderr) - sys.exit(1) + dirstrm = self.strm.getDirectoryStream(entry) + wbmodel = self.__buildWorkbookModel(dirstrm) + wbmodel.encrypted = self.strmData.encrypted + root.appendChild(wbmodel.createDOM()) + + node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8) def dump (self): self.__parseFile() @@ -123,18 +115,18 @@ class XLDumper(object): if entry.isStorage(): continue - elif dirname == "Workbook": + elif dirname == b"Workbook": success = True while success: success = self.__readSubStream(dirstrm) - elif dirname == "Revision Log": + elif dirname == b"Revision Log": dirstrm.type = xlsstream.DirType.RevisionLog self.__readSubStream(dirstrm) - elif dirname == "EncryptionInfo": + elif dirname == b"EncryptionInfo": globals.dumpBytes(dirstrm.bytes, 512) - print("-"*globals.OutputWidth) + globals.outputln("-"*globals.OutputWidth) info = msocrypto.EncryptionInfo(dirstrm.bytes) info.read() info.output() diff --git a/src/filters/xlsxmltocsv.py b/src/filters/xlsxmltocsv.py index 7fa12e58..a8930167 100755 --- a/src/filters/xlsxmltocsv.py +++ b/src/filters/xlsxmltocsv.py @@ -32,20 +32,20 @@ import xml.sax dtt = True if dtt: - sepstring = "\t" - dquote = '' + sepstring = b"\t" + dquote = b"" else: - sepstring = "," - dquote = '"' - + sepstring = b"," + dquote = b'"' + class XlsXmlHandler(xml.sax.handler.ContentHandler): def __init__(self): - self.output = "" + self.output = b'' def startElement(self, name, attrs): if name == "worksheet": if "name" in attrs: - self.output += "%s\n" % attrs["name"].encode("UTF-8") + self.output += b"%s\n" % attrs["name"].encode("UTF-8") elif name == "row": self.cells = dict() elif name == "label-cell" or name == "number-cell": @@ -57,7 +57,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): self.cells[int(attrs["col"])] = value else: #?? - self.output += "%s%s" % (value.encode("UTF-8"), sepstring) + self.output += b"%s%s" % (value.encode("UTF-8"), sepstring) elif name == "formula-cell": if "formula-result" in attrs and "col" in attrs: self.cells[int(attrs["col"])] = \ @@ -68,11 +68,11 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): curidx = 0 for idx, value in self.cells.items(): self.output += sepstring * (idx - curidx) - self.output += "%s%s%s" % (dquote, value, dquote) + self.output += b"%s%s%s" % (dquote, value, dquote) curidx = idx - self.output += "\n" + self.output += b"\n" elif name == "worksheet": - self.output += "\n" + self.output += b"\n" if __name__ == '__main__':