more filters made compatible with python3

This commit is contained in:
Jean-Francois Dockes 2015-11-07 16:59:17 +01:00
parent f344e8fedd
commit dfe00ab11f
18 changed files with 209 additions and 100 deletions

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python2
"""Extract Html files from a Microsoft Compiled Html Help file (.chm) """Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""

View File

@ -16,12 +16,15 @@ with acute accent ?
from __future__ import print_function from __future__ import print_function
import sys import sys
import string PY3 = sys.version > '3'
if not PY3:
import string
import glob import glob
import os import os
import os.path import os.path
from zipfile import ZipFile from zipfile import ZipFile
class European8859TextClassifier: class European8859TextClassifier:
def __init__(self, langzip=""): def __init__(self, langzip=""):
"""langzip contains text files. Each text file is named like lang_code.txt """langzip contains text files. Each text file is named like lang_code.txt
@ -33,9 +36,12 @@ class European8859TextClassifier:
self.readlanguages(langzip) self.readlanguages(langzip)
# Table to translate from punctuation to spaces # Table to translate from punctuation to spaces
self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "'\n\r" self.punct = b'''0123456789<>/*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
spaces = len(self.punct) * " " spaces = len(self.punct) * b' '
self.spacetable = string.maketrans(self.punct, spaces) if PY3:
self.spacetable = bytes.maketrans(self.punct, spaces)
else:
self.spacetable = string.maketrans(self.punct, spaces)
def readlanguages(self, langzip): def readlanguages(self, langzip):
"""Extract the stop words lists from the zip file. """Extract the stop words lists from the zip file.
@ -53,7 +59,7 @@ class European8859TextClassifier:
text = zip.read(fn) text = zip.read(fn)
words = text.split() words = text.split()
for word in words: for word in words:
if self.allwords.has_key(word): if word in self.allwords:
self.allwords[word].append((lang, code)) self.allwords[word].append((lang, code))
else: else:
self.allwords[word] = [(lang, code)] self.allwords[word] = [(lang, code)]
@ -64,7 +70,7 @@ class European8859TextClassifier:
# Limit to reasonable size. # Limit to reasonable size.
if len(rawtext) > 10000: if len(rawtext) > 10000:
i = rawtext.find(" ", 9000) i = rawtext.find(b' ', 9000)
if i == -1: if i == -1:
i = 9000 i = 9000
rawtext = rawtext[0:i] rawtext = rawtext[0:i]
@ -79,9 +85,9 @@ class European8859TextClassifier:
dict = {} dict = {}
for w in words: for w in words:
dict[w] = dict.get(w, 0) + 1 dict[w] = dict.get(w, 0) + 1
lfreq = [a[0] for a in sorted(dict.iteritems(), \ lfreq = [a[0] for a in sorted(dict.items(), \
key=lambda entry: entry[1], reverse=True)[0:ntest]] key=lambda entry: entry[1], reverse=True)[0:ntest]]
#print lfreq #print(lfreq)
# Build a dict (lang,code)->matchcount # Build a dict (lang,code)->matchcount
langstats = {} langstats = {}
@ -91,9 +97,9 @@ class European8859TextClassifier:
langstats[lc] = langstats.get(lc, 0) + 1 langstats[lc] = langstats.get(lc, 0) + 1
# Get a list of (lang,code) sorted by match count # Get a list of (lang,code) sorted by match count
lcfreq = sorted(langstats.iteritems(), \ lcfreq = sorted(langstats.items(), \
key=lambda entry: entry[1], reverse=True) key=lambda entry: entry[1], reverse=True)
#print lcfreq[0:3] #print(lcfreq[0:3])
if len(lcfreq) != 0: if len(lcfreq) != 0:
lc,maxcount = lcfreq[0] lc,maxcount = lcfreq[0]
maxlang = lc[0] maxlang = lc[0]

View File

@ -17,6 +17,8 @@
# Recoll PDF extractor, with support for attachments # Recoll PDF extractor, with support for attachments
from __future__ import print_function
import os import os
import sys import sys
import re import re
@ -89,7 +91,7 @@ class PDFExtractor:
"output", tmpdir]) "output", tmpdir])
self.attachlist = sorted(os.listdir(tmpdir)) self.attachlist = sorted(os.listdir(tmpdir))
return True return True
except Exception, e: except Exception as e:
self.em.rclog("extractAttach: failed: %s" % e) self.em.rclog("extractAttach: failed: %s" % e)
# Return true anyway, pdf attachments are no big deal # Return true anyway, pdf attachments are no big deal
return True return True

View File

@ -146,7 +146,7 @@ class OXExtractor:
return stylesheet return stylesheet
def extractone(self, params): def extractone(self, params):
if not params.has_key("filename:"): if "filename:" not in params:
self.em.rclog("extractone: no mime or file name") self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"] fn = params["filename:"]

View File

@ -1,4 +1,10 @@
#!/usr/bin/env python #!/usr/bin/env python2
# Recoll PPT text extractor
# Msodump is not compatible with Python3 AFAIK, so this is stuck to
# Python2 too
from __future__ import print_function
import rclexecm import rclexecm
import rclexec1 import rclexec1

View File

@ -22,6 +22,8 @@
# - parse script encoding and allow output in any encoding by using unicode # - parse script encoding and allow output in any encoding by using unicode
# as intermediate # as intermediate
from __future__ import print_function
__version__ = '0.3' __version__ = '0.3'
__date__ = '2005-07-04' __date__ = '2005-07-04'
__license__ = 'GPL' __license__ = 'GPL'
@ -29,9 +31,26 @@ __author__ = 'J
# Imports # Imports
import cgi, string, sys, cStringIO import cgi, string, sys
# Interpreter-major-version flag used to select the Python 2 or Python 3
# code paths below. Compare the numeric sys.version_info field rather than
# the sys.version string: a string comparison is lexicographic, so a
# version string such as "10.0" would sort before "3" and be misdetected.
PY2 = sys.version_info[0] < 3
if PY2:
    # Python 2: in-memory byte-string file object.
    import cStringIO
else:
    # Python 3: io.BytesIO plays the same role.
    import io
import keyword, token, tokenize import keyword, token, tokenize
try:
    # Python 2: text is the separate ``unicode`` type.
    _TEXT_TYPE = unicode
except NameError:
    # Python 3: ``str`` is the text type.
    _TEXT_TYPE = str


def makebytes(data):
    """Return *data* as a byte string, encoding text as UTF-8.

    Text objects (``unicode`` on Python 2, ``str`` on Python 3) are
    encoded to UTF-8. Anything else is returned unchanged, so values that
    are already bytes pass straight through.
    """
    # One definition for both interpreter versions: decide on the value's
    # type instead of on a version flag.
    if isinstance(data, _TEXT_TYPE):
        return data.encode("UTF-8")
    return data
############################################################################# #############################################################################
### Python Source Parser (does Highlighting) ### Python Source Parser (does Highlighting)
@ -57,7 +76,7 @@ _HTML_HEADER = """\
<html> <html>
<head> <head>
<title>%%(title)s</title> <title>%%(title)s</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="colorize.py (version %s)"> <meta name="Generator" content="colorize.py (version %s)">
</head> </head>
<body> <body>
@ -114,7 +133,7 @@ class Parser:
def __init__(self, raw, out=sys.stdout): def __init__(self, raw, out=sys.stdout):
""" Store the source text. """ Store the source text.
""" """
self.raw = string.strip(string.expandtabs(raw)) self.raw = raw.expandtabs().strip()
self.out = out self.out = out
def format(self): def format(self):
@ -124,35 +143,44 @@ class Parser:
self.lines = [0, 0] self.lines = [0, 0]
pos = 0 pos = 0
while 1: while 1:
pos = string.find(self.raw, '\n', pos) + 1 pos = self.raw.find(b'\n', pos) + 1
if not pos: break if not pos: break
self.lines.append(pos) self.lines.append(pos)
self.lines.append(len(self.raw)) self.lines.append(len(self.raw))
# parse the source and write it # parse the source and write it
self.pos = 0 self.pos = 0
text = cStringIO.StringIO(self.raw) if PY2:
self.out.write(self.stylesheet) text = cStringIO.StringIO(self.raw)
self.out.write('<pre class="code">\n') else:
text = io.BytesIO(self.raw)
self.out.write(makebytes(self.stylesheet))
self.out.write(b'<pre class="code">\n')
try: try:
tokenize.tokenize(text.readline, self) if PY2:
except tokenize.TokenError, ex: tokenize.tokenize(text.readline, self)
else:
for a,b,c,d,e in tokenize.tokenize(text.readline):
self(a,b,c,d,e)
except tokenize.TokenError as ex:
msg = ex[0] msg = ex[0]
line = ex[1][0] line = ex[1][0]
self.out.write("<h3>ERROR: %s</h3>%s\n" % ( self.out.write("<h3>ERROR: %s</h3>%s\n" % (
msg, self.raw[self.lines[line]:])) msg, self.raw[self.lines[line]:]))
except IndentationError, ex: except IndentationError as ex:
msg = ex[0] msg = ex[0]
self.out.write("<h3>ERROR: %s</h3>\n" % (msg)) self.out.write("<h3>ERROR: %s</h3>\n" % (msg))
self.out.write('\n</pre>') self.out.write(b'\n</pre>')
def __call__(self, toktype, toktext, (srow,scol), (erow,ecol), line): def __call__(self, toktype, toktext, startpos, endpos, line):
""" Token handler. """ Token handler.
""" """
if 0: if 0:
print "type", toktype, token.tok_name[toktype], "text", toktext, print("type %s %s text %s start %s %s end %s %s<br>\n" % \
print "start", srow,scol, "end", erow,ecol, "<br>" (toktype, token.tok_name[toktype], toktext, \
srow, scol,erow,ecol))
srow, scol = startpos
erow, ecol = endpos
# calculate new positions # calculate new positions
oldpos = self.pos oldpos = self.pos
newpos = self.lines[srow] + scol newpos = self.lines[srow] + scol
@ -160,7 +188,7 @@ class Parser:
# handle newlines # handle newlines
if toktype in [token.NEWLINE, tokenize.NL]: if toktype in [token.NEWLINE, tokenize.NL]:
self.out.write('\n') self.out.write(b'\n')
return return
# send the original whitespace, if needed # send the original whitespace, if needed
@ -180,9 +208,9 @@ class Parser:
css_class = _css_classes.get(toktype, 'text') css_class = _css_classes.get(toktype, 'text')
# send text # send text
self.out.write('<span class="%s">' % (css_class,)) self.out.write(makebytes('<span class="%s">' % (css_class,)))
self.out.write(cgi.escape(toktext)) self.out.write(makebytes(cgi.escape(toktext)))
self.out.write('</span>') self.out.write(b'</span>')
def colorize_file(file=None, outstream=sys.stdout, standalone=True): def colorize_file(file=None, outstream=sys.stdout, standalone=True):
@ -205,7 +233,7 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True):
filename = 'STREAM' filename = 'STREAM'
elif file is not None: elif file is not None:
try: try:
sourcefile = open(file) sourcefile = open(file, 'rb')
filename = basename(file) filename = basename(file)
except IOError: except IOError:
raise SystemExit("File %s unknown." % file) raise SystemExit("File %s unknown." % file)
@ -215,22 +243,26 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True):
source = sourcefile.read() source = sourcefile.read()
if standalone: if standalone:
outstream.write(_HTML_HEADER % {'title': filename}) outstream.write(makebytes(_HTML_HEADER % {'title': filename}))
Parser(source, out=outstream).format() Parser(source, out=outstream).format()
if standalone: if standalone:
outstream.write(_HTML_FOOTER) outstream.write(makebytes(_HTML_FOOTER))
if file: if file:
sourcefile.close() sourcefile.close()
if __name__ == "__main__": if __name__ == "__main__":
import os import os
if PY2:
out = sys.stdout
else:
out = sys.stdout.buffer
if os.environ.get('PATH_TRANSLATED'): if os.environ.get('PATH_TRANSLATED'):
filepath = os.environ.get('PATH_TRANSLATED') filepath = os.environ.get('PATH_TRANSLATED')
print 'Content-Type: text/html; charset="iso-8859-1"\n' print('Content-Type: text/html; charset="iso-8859-1"\n')
colorize_file(filepath) colorize_file(filepath, out)
elif len(sys.argv) > 1: elif len(sys.argv) > 1:
filepath = sys.argv[1] filepath = sys.argv[1]
colorize_file(filepath) colorize_file(filepath, out)
else: else:
colorize_file() colorize_file()

View File

@ -18,12 +18,14 @@
# Free Software Foundation, Inc., # Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from __future__ import print_function
import sys import sys
import rclexecm import rclexecm
try: try:
from rarfile import RarFile from rarfile import RarFile
except: except:
print "RECFILTERROR HELPERNOTFOUND python:rarfile" print("RECFILTERROR HELPERNOTFOUND python:rarfile")
sys.exit(1); sys.exit(1);
# Requires RarFile python module. Try "sudo pip install rarfile" # Requires RarFile python module. Try "sudo pip install rarfile"
@ -67,9 +69,7 @@ class RarExtractor:
iseof = rclexecm.RclExecM.noteof iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.rar.namelist()) -1: if self.currentindex >= len(self.rar.namelist()) -1:
iseof = rclexecm.RclExecM.eofnext iseof = rclexecm.RclExecM.eofnext
if isinstance(ipath, unicode): return (ok, docdata, rclexecm.makebytes(ipath), iseof)
ipath = ipath.encode("utf-8")
return (ok, docdata, ipath, iseof)
###### File type handler api, used by rclexecm ----------> ###### File type handler api, used by rclexecm ---------->
def openfile(self, params): def openfile(self, params):

View File

@ -16,6 +16,8 @@
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
###################################### ######################################
from __future__ import print_function
import sys import sys
import rclexecm import rclexecm
import rclxslt import rclxslt
@ -130,19 +132,19 @@ class OOExtractor:
self.currentindex = 0 self.currentindex = 0
def extractone(self, params): def extractone(self, params):
if not params.has_key("filename:"): if "filename:" not in params:
self.em.rclog("extractone: no mime or file name") self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"] fn = params["filename:"]
try: try:
zip = ZipFile(fn) zip = ZipFile(fn.decode('UTF-8'))
except Exception as err: except Exception as err:
self.em.rclog("unzip failed: " + str(err)) self.em.rclog("unzip failed: %s" % err)
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
docdata = '<html><head><meta http-equiv="Content-Type"' \ docdata = b'<html><head><meta http-equiv="Content-Type"' \
'content="text/html; charset=UTF-8"></head><body>' b'content="text/html; charset=UTF-8"></head><body>'
try: try:
metadata = zip.read("meta.xml") metadata = zip.read("meta.xml")
@ -160,9 +162,9 @@ class OOExtractor:
if content: if content:
res = rclxslt.apply_sheet_data(stylesheet_content, content) res = rclxslt.apply_sheet_data(stylesheet_content, content)
docdata += res docdata += res
docdata += '</body></html>' docdata += b'</body></html>'
except Exception as err: except Exception as err:
self.em.rclog("bad data in %s" % fn) self.em.rclog("bad data in %s: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext) return (True, docdata, "", rclexecm.RclExecM.eofnext)

View File

@ -16,6 +16,8 @@
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
###################################### ######################################
from __future__ import print_function
import sys import sys
import rclexecm import rclexecm
import rclxslt import rclxslt
@ -104,7 +106,7 @@ class SVGExtractor:
self.currentindex = 0 self.currentindex = 0
def extractone(self, params): def extractone(self, params):
if not params.has_key("filename:"): if "filename:" not in params:
self.em.rclog("extractone: no mime or file name") self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"] fn = params["filename:"]

View File

@ -6,12 +6,14 @@
# It works not only for tar-files, but automatically for gzipped and # It works not only for tar-files, but automatically for gzipped and
# bzipped tar-files as well. # bzipped tar-files as well.
from __future__ import print_function
import rclexecm import rclexecm
try: try:
import tarfile import tarfile
except: except:
print "RECFILTERROR HELPERNOTFOUND python:tarfile" print("RECFILTERROR HELPERNOTFOUND python:tarfile")
sys.exit(1); sys.exit(1);
class TarExtractor: class TarExtractor:
@ -38,15 +40,15 @@ class TarExtractor:
iseof = rclexecm.RclExecM.noteof iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.namen) -1: if self.currentindex >= len(self.namen) -1:
iseof = rclexecm.RclExecM.eofnext iseof = rclexecm.RclExecM.eofnext
if isinstance(ipath, unicode): return (ok, docdata, rclexecm.makebytes(ipath), iseof)
ipath = ipath.encode("utf-8")
return (ok, docdata, ipath, iseof)
def openfile(self, params): def openfile(self, params):
self.currentindex = -1 self.currentindex = -1
try: try:
self.tar = tarfile.open(name=params["filename:"],mode='r') self.tar = tarfile.open(name=params["filename:"],mode='r')
self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())] #self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]]
return True return True
except: except:
return False return False

View File

@ -1,10 +1,13 @@
#!/usr/bin/env python #!/usr/bin/env python
# Wrapping a text file. Recoll does it internally in most cases, but
# this is for use by another filter.
from __future__ import print_function
import rclexecm import rclexecm
import sys import sys
# Wrapping a text file. Recoll does it internally in most cases, but
# there is a reason this exists, just can't remember it ...
class TxtDump: class TxtDump:
def __init__(self, em): def __init__(self, em):
self.em = em self.em = em
@ -12,7 +15,7 @@ class TxtDump:
def extractone(self, params): def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \ #self.em.rclog("extractone %s %s" % (params["filename:"], \
#params["mimetype:"])) #params["mimetype:"]))
if not params.has_key("filename:"): if not "filename:" in params:
self.em.rclog("extractone: no file name") self.em.rclog("extractone: no file name")
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)

View File

@ -2,6 +2,8 @@
# WAR web archive filter for recoll. War file are gzipped tar files # WAR web archive filter for recoll. War file are gzipped tar files
from __future__ import print_function
import rclexecm import rclexecm
import tarfile import tarfile

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python2
# Extractor for Excel files.
# Mso-dumper is not compatible with Python3
import rclexecm import rclexecm
import rclexec1 import rclexec1
import xlsxmltocsv import xlsxmltocsv

View File

@ -62,7 +62,7 @@ class XMLExtractor:
self.currentindex = 0 self.currentindex = 0
def extractone(self, params): def extractone(self, params):
if not params.has_key("filename:"): if "filename:" not in params:
self.em.rclog("extractone: no mime or file name") self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"] fn = params["filename:"]

View File

@ -17,36 +17,54 @@
# Helper module for xslt-based filters # Helper module for xslt-based filters
from __future__ import print_function
import sys import sys
try: PY2 = sys.version < '3'
import libxml2
import libxslt
except:
print "RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1"
sys.exit(1);
libxml2.substituteEntitiesDefault(1) if PY2:
try:
def apply_sheet_data(sheet, data): import libxml2
styledoc = libxml2.parseMemory(sheet, len(sheet)) import libxslt
style = libxslt.parseStylesheetDoc(styledoc) libxml2.substituteEntitiesDefault(1)
doc = libxml2.parseMemory(data, len(data)) except:
result = style.applyStylesheet(doc, None) print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
res = style.saveResultToString(result) sys.exit(1);
style.freeStylesheet() def apply_sheet_data(sheet, data):
doc.freeDoc() styledoc = libxml2.parseMemory(sheet, len(sheet))
result.freeDoc() style = libxslt.parseStylesheetDoc(styledoc)
return res doc = libxml2.parseMemory(data, len(data))
result = style.applyStylesheet(doc, None)
def apply_sheet_file(sheet, fn): res = style.saveResultToString(result)
styledoc = libxml2.parseMemory(sheet, len(sheet)) style.freeStylesheet()
style = libxslt.parseStylesheetDoc(styledoc) doc.freeDoc()
doc = libxml2.parseFile(fn) result.freeDoc()
result = style.applyStylesheet(doc, None) return res
res = style.saveResultToString(result) def apply_sheet_file(sheet, fn):
style.freeStylesheet() styledoc = libxml2.parseMemory(sheet, len(sheet))
doc.freeDoc() style = libxslt.parseStylesheetDoc(styledoc)
result.freeDoc() doc = libxml2.parseFile(fn)
return res result = style.applyStylesheet(doc, None)
res = style.saveResultToString(result)
style.freeStylesheet()
doc.freeDoc()
result.freeDoc()
return res
else:
try:
from lxml import etree
except:
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
sys.exit(1);
def apply_sheet_data(sheet, data):
    """Apply an XSLT stylesheet to an in-memory XML document (lxml).

    sheet -- stylesheet source, parsed with etree.fromstring()
    data  -- XML document source, parsed with etree.fromstring()

    Returns the transformation result serialized to bytes.
    """
    transform = etree.XSLT(etree.fromstring(sheet))
    result = transform(etree.fromstring(data))
    # bytes() on an XSLT result serializes according to the stylesheet's
    # xsl:output settings (method, encoding), which is what libxslt's
    # saveResultToString() does. etree.tostring() would always emit XML
    # and ignore them.
    return bytes(result)
def apply_sheet_file(sheet, fn):
    """Apply an XSLT stylesheet to an XML file (lxml).

    sheet -- stylesheet source, parsed with etree.fromstring()
    fn    -- the XML input, handed to etree.parse()

    Returns the transformation result serialized to bytes.
    """
    transform = etree.XSLT(etree.fromstring(sheet))
    result = transform(etree.parse(fn))
    # bytes() on an XSLT result serializes according to the stylesheet's
    # xsl:output settings (method, encoding), which is what libxslt's
    # saveResultToString() does. etree.tostring() would always emit XML
    # and ignore them.
    return bytes(result)

View File

@ -1,6 +1,24 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright (C) 2014 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Zip file filter for Recoll # Zip file extractor for Recoll
from __future__ import print_function
import os import os
import fnmatch import fnmatch
@ -78,9 +96,7 @@ class ZipExtractor:
iseof = rclexecm.RclExecM.noteof iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.zip.namelist()) -1: if self.currentindex >= len(self.zip.namelist()) -1:
iseof = rclexecm.RclExecM.eofnext iseof = rclexecm.RclExecM.eofnext
if isinstance(ipath, unicode): return (ok, docdata, rclexecm.makebytes(ipath), iseof)
ipath = ipath.encode("utf-8")
return (ok, docdata, ipath, iseof)
###### File type handler api, used by rclexecm ----------> ###### File type handler api, used by rclexecm ---------->
def openfile(self, params): def openfile(self, params):
@ -96,7 +112,14 @@ class ZipExtractor:
self.skiplist = skipped.split(" ") self.skiplist = skipped.split(" ")
try: try:
self.zip = ZipFile(filename) if rclexecm.PY3:
# Note: python3 ZipFile wants an str file name, which
# is wrong: file names are binary. But it accepts an
# open file, and open() has no such restriction
f = open(filename, 'rb')
self.zip = ZipFile(f)
else:
self.zip = ZipFile(filename)
return True return True
except Exception as err: except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err) self.em.rclog("openfile: failed: [%s]" % err)

View File

@ -1,10 +1,14 @@
#!/usr/bin/env python #!/usr/bin/env python2
# #
# This Source Code Form is subject to the terms of the Mozilla Public # This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this # License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# #
# mso-dumper is not compatible with python3
from __future__ import print_function
import sys, os.path, optparse import sys, os.path, optparse
sys.path.append(sys.path[0]+"/msodump.zip") sys.path.append(sys.path[0]+"/msodump.zip")
@ -97,7 +101,7 @@ class XLDumper(object):
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8) node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
except Exception as err: except Exception as err:
print >> sys.stderr, "xls-dump.py: error: %s" % err print("xls-dump.py: error: %s" % err, file=sys.stderr)
sys.exit(1) sys.exit(1)
def dump (self): def dump (self):

View File

@ -1,4 +1,9 @@
#!/usr/bin/env python #!/usr/bin/env python2
# Transform XML output from xls-dump.py into csv format.
# Note: msodumper is not compatible with python3.
from __future__ import print_function
import sys import sys
import xml.sax import xml.sax
@ -28,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
if "value" in attrs: if "value" in attrs:
value = attrs["value"].encode("UTF-8") value = attrs["value"].encode("UTF-8")
else: else:
value = unicode() value = b''
if "col" in attrs: if "col" in attrs:
self.cells[int(attrs["col"])] = value self.cells[int(attrs["col"])] = value
else: else:
@ -42,7 +47,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
def endElement(self, name, ): def endElement(self, name, ):
if name == "row": if name == "row":
curidx = 0 curidx = 0
for idx, value in self.cells.iteritems(): for idx, value in self.cells.items():
self.output += sepstring * (idx - curidx) self.output += sepstring * (idx - curidx)
self.output += "%s%s%s" % (dquote, value, dquote) self.output += "%s%s%s" % (dquote, value, dquote)
curidx = idx curidx = idx