more filters made compatible with python3

2015-11-07 16:59:17 +01:00 · 2015-11-07 16:59:17 +01:00 · dfe00ab11f
commit dfe00ab11f
parent f344e8fedd
18 changed files with 209 additions and 100 deletions
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
 """Extract Html files from a Microsoft Compiled Html Help file (.chm)
 Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""

--- a/src/filters/rcllatinclass.py
+++ b/src/filters/rcllatinclass.py
@ -16,12 +16,15 @@ with acute accent ?
 from __future__ import print_function

 import sys
-import string
+PY3 = sys.version > '3'
+if not PY3:
+    import string
 import glob
 import os
 import os.path
 from zipfile import ZipFile

+
 class European8859TextClassifier:
    def __init__(self, langzip=""):
        """langzip contains text files. Each text file is named like lang_code.txt
@ -33,9 +36,12 @@ class European8859TextClassifier:
        self.readlanguages(langzip)

        # Table to translate from punctuation to spaces
-        self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "'\n\r"
-        spaces = len(self.punct) * " "
-        self.spacetable = string.maketrans(self.punct, spaces)
+        self.punct = b'''0123456789<>/*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
+        spaces = len(self.punct) * b' '
+        if PY3:
+            self.spacetable = bytes.maketrans(self.punct, spaces)
+        else:
+            self.spacetable = string.maketrans(self.punct, spaces)

    def readlanguages(self, langzip):
        """Extract the stop words lists from the zip file.
@ -53,7 +59,7 @@ class European8859TextClassifier:
            text = zip.read(fn)
            words = text.split()
            for word in words:
-                if self.allwords.has_key(word):
+                if word in self.allwords:
                    self.allwords[word].append((lang, code))
                else:
                    self.allwords[word] = [(lang, code)]
@ -64,7 +70,7 @@ class European8859TextClassifier:

        # Limit to reasonable size.
        if len(rawtext) > 10000:
-            i = rawtext.find(" ", 9000)
+            i = rawtext.find(b' ', 9000)
            if i == -1:
                i = 9000
            rawtext = rawtext[0:i]
@ -79,9 +85,9 @@ class European8859TextClassifier:
        dict = {}
        for w in words:
            dict[w] = dict.get(w, 0) + 1
-        lfreq = [a[0] for a in sorted(dict.iteritems(), \
+        lfreq = [a[0] for a in sorted(dict.items(), \
                       key=lambda entry: entry[1], reverse=True)[0:ntest]]
-        #print lfreq
+        #print(lfreq)

        # Build a dict (lang,code)->matchcount
        langstats = {}
@ -91,9 +97,9 @@ class European8859TextClassifier:
                langstats[lc] = langstats.get(lc, 0) + 1

        # Get a list of (lang,code) sorted by match count
-        lcfreq = sorted(langstats.iteritems(), \
+        lcfreq = sorted(langstats.items(), \
                        key=lambda entry: entry[1], reverse=True)
-        #print lcfreq[0:3]
+        #print(lcfreq[0:3])
        if len(lcfreq) != 0:
            lc,maxcount = lcfreq[0]
            maxlang = lc[0]
--- a/src/filters/rclmpdf.py
+++ b/src/filters/rclmpdf.py
@ -17,6 +17,8 @@

 # Recoll PDF extractor, with support for attachments

+from __future__ import print_function
+
 import os
 import sys
 import re
@ -89,7 +91,7 @@ class PDFExtractor:
                                   "output", tmpdir])
            self.attachlist = sorted(os.listdir(tmpdir))
            return True
-        except Exception, e:
+        except Exception as e:
            self.em.rclog("extractAttach: failed: %s" % e)
            # Return true anyway, pdf attachments are no big deal
            return True
--- a/src/filters/rclopxml.py
+++ b/src/filters/rclopxml.py
@ -146,7 +146,7 @@ class OXExtractor:
        return stylesheet
    
    def extractone(self, params):
-        if not params.has_key("filename:"):
+        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
--- a/src/filters/rclppt.py
+++ b/src/filters/rclppt.py
@ -1,4 +1,10 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
+
+# Recoll PPT text extractor
+# Msodump is not compatible with Python3 AFAIK, so this filter is stuck
+# on Python2 too
+
+from __future__ import print_function

 import rclexecm
 import rclexec1
--- a/src/filters/rclpython
+++ b/src/filters/rclpython
@ -22,6 +22,8 @@
 # - parse script encoding and allow output in any encoding by using unicode
 #   as intermediate

+from __future__ import print_function
+
 __version__ = '0.3'
 __date__ = '2005-07-04'
 __license__ = 'GPL'
@ -29,9 +31,26 @@ __author__ = 'J


 # Imports
-import cgi, string, sys, cStringIO
+import cgi, string, sys
+PY2 = sys.version < '3'
+if PY2:
+    import cStringIO
+else:
+    import io
 import keyword, token, tokenize

+if PY2:
+    def makebytes(data):
+        if isinstance(data, unicode):
+            return data.encode("UTF-8")
+        else:
+            return data
+else:
+    def makebytes(data):
+        if isinstance(data, bytes):
+            return data
+        else:
+            return data.encode("UTF-8")

 #############################################################################
 ### Python Source Parser (does Highlighting)
@ -57,7 +76,7 @@ _HTML_HEADER = """\
 <html>
 <head>
  <title>%%(title)s</title>
-  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  <meta name="Generator" content="colorize.py (version %s)">
 </head>
 <body>
@ -114,7 +133,7 @@ class Parser:
    def __init__(self, raw, out=sys.stdout):
        """ Store the source text.
        """
-        self.raw = string.strip(string.expandtabs(raw))
+        self.raw = raw.expandtabs().strip()
        self.out = out

    def format(self):
@ -124,35 +143,44 @@ class Parser:
        self.lines = [0, 0]
        pos = 0
        while 1:
-            pos = string.find(self.raw, '\n', pos) + 1
+            pos = self.raw.find(b'\n', pos) + 1
            if not pos: break
            self.lines.append(pos)
        self.lines.append(len(self.raw))

        # parse the source and write it
        self.pos = 0
-        text = cStringIO.StringIO(self.raw)
-        self.out.write(self.stylesheet)
-        self.out.write('<pre class="code">\n')
+        if PY2:
+            text = cStringIO.StringIO(self.raw)
+        else:
+            text = io.BytesIO(self.raw)
+        self.out.write(makebytes(self.stylesheet))
+        self.out.write(b'<pre class="code">\n')
        try:
-            tokenize.tokenize(text.readline, self)
-        except tokenize.TokenError, ex:
+            if PY2:
+                tokenize.tokenize(text.readline, self)
+            else:
+                for a,b,c,d,e in tokenize.tokenize(text.readline):
+                    self(a,b,c,d,e)
+        except tokenize.TokenError as ex:
            msg = ex[0]
            line = ex[1][0]
            self.out.write("<h3>ERROR: %s</h3>%s\n" % (
                msg, self.raw[self.lines[line]:]))
-        except IndentationError, ex:
+        except IndentationError as ex:
            msg = ex[0]
            self.out.write("<h3>ERROR: %s</h3>\n" % (msg))
-        self.out.write('\n</pre>')
+        self.out.write(b'\n</pre>')

-    def __call__(self, toktype, toktext, (srow,scol), (erow,ecol), line):
+    def __call__(self, toktype, toktext, startpos, endpos, line):
        """ Token handler.
        """
        if 0:
-            print "type", toktype, token.tok_name[toktype], "text", toktext,
-            print "start", srow,scol, "end", erow,ecol, "<br>"
-
+            print("type %s %s text %s start %s %s end %s %s<br>\n" % \
+                  (toktype, token.tok_name[toktype], toktext, \
+                   srow, scol,erow,ecol))
+        srow, scol = startpos
+        erow, ecol = endpos
        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
@ -160,7 +188,7 @@ class Parser:

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
-            self.out.write('\n')
+            self.out.write(b'\n')
            return

        # send the original whitespace, if needed
@ -180,9 +208,9 @@ class Parser:
        css_class = _css_classes.get(toktype, 'text')

        # send text
-        self.out.write('<span class="%s">' % (css_class,))
-        self.out.write(cgi.escape(toktext))
-        self.out.write('</span>')
+        self.out.write(makebytes('<span class="%s">' % (css_class,)))
+        self.out.write(makebytes(cgi.escape(toktext)))
+        self.out.write(b'</span>')


 def colorize_file(file=None, outstream=sys.stdout, standalone=True):
@ -205,7 +233,7 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True):
            filename = 'STREAM'
    elif file is not None:
        try:
-            sourcefile = open(file)
+            sourcefile = open(file, 'rb')
            filename = basename(file)
        except IOError:
            raise SystemExit("File %s unknown." % file)
@ -215,22 +243,26 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True):
    source = sourcefile.read()

    if standalone:
-        outstream.write(_HTML_HEADER % {'title': filename})
+        outstream.write(makebytes(_HTML_HEADER % {'title': filename}))
    Parser(source, out=outstream).format()
    if standalone:
-        outstream.write(_HTML_FOOTER)
+        outstream.write(makebytes(_HTML_FOOTER))

    if file:
        sourcefile.close()

 if __name__ == "__main__":
    import os
+    if PY2:
+        out = sys.stdout
+    else:
+        out = sys.stdout.buffer
    if os.environ.get('PATH_TRANSLATED'):
        filepath = os.environ.get('PATH_TRANSLATED')
-        print 'Content-Type: text/html; charset="iso-8859-1"\n'
-        colorize_file(filepath)
+        print('Content-Type: text/html; charset="iso-8859-1"\n')
+        colorize_file(filepath, out)
    elif len(sys.argv) > 1:
        filepath = sys.argv[1]
-        colorize_file(filepath)
+        colorize_file(filepath, out)
    else:
        colorize_file()
--- a/src/filters/rclrar
+++ b/src/filters/rclrar
@ -18,12 +18,14 @@
 #   Free Software Foundation, Inc.,
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

+from __future__ import print_function
+
 import sys
 import rclexecm
 try:
    from rarfile import RarFile
 except:
-    print "RECFILTERROR HELPERNOTFOUND python:rarfile"
+    print("RECFILTERROR HELPERNOTFOUND python:rarfile")
    sys.exit(1);

 # Requires RarFile python module. Try "sudo pip install rarfile"
@ -67,9 +69,7 @@ class RarExtractor:
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.rar.namelist()) -1:
            iseof = rclexecm.RclExecM.eofnext
-        if isinstance(ipath, unicode):
-            ipath = ipath.encode("utf-8")
-        return (ok, docdata, ipath, iseof)
+        return (ok, docdata, rclexecm.makebytes(ipath), iseof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
--- a/src/filters/rclsoff.py
+++ b/src/filters/rclsoff.py
@ -16,6 +16,8 @@
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ######################################

+from __future__ import print_function
+
 import sys
 import rclexecm
 import rclxslt
@ -130,19 +132,19 @@ class OOExtractor:
        self.currentindex = 0

    def extractone(self, params):
-        if not params.has_key("filename:"):
+        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
-            zip = ZipFile(fn)
+            zip = ZipFile(fn.decode('UTF-8'))
        except Exception as err:
-            self.em.rclog("unzip failed: " + str(err))
+            self.em.rclog("unzip failed: %s" % err)
            return (False, "", "", rclexecm.RclExecM.eofnow)

-        docdata = '<html><head><meta http-equiv="Content-Type"' \
-                  'content="text/html; charset=UTF-8"></head><body>'
+        docdata = b'<html><head><meta http-equiv="Content-Type"' \
+                  b'content="text/html; charset=UTF-8"></head><body>'

        try:
            metadata = zip.read("meta.xml")
@ -160,9 +162,9 @@ class OOExtractor:
            if content:
                res = rclxslt.apply_sheet_data(stylesheet_content, content)
                docdata += res
-            docdata += '</body></html>'
+            docdata += b'</body></html>'
        except Exception as err:
-            self.em.rclog("bad data in %s" % fn)
+            self.em.rclog("bad data in %s: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
--- a/src/filters/rclsvg.py
+++ b/src/filters/rclsvg.py
@ -16,6 +16,8 @@
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ######################################

+from __future__ import print_function
+
 import sys
 import rclexecm
 import rclxslt
@ -104,7 +106,7 @@ class SVGExtractor:
        self.currentindex = 0

    def extractone(self, params):
-        if not params.has_key("filename:"):
+        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
--- a/src/filters/rcltar
+++ b/src/filters/rcltar
@ -6,12 +6,14 @@
 # It works not only for tar-files, but automatically for gzipped and
 # bzipped tar-files as well.

+from __future__ import print_function
+
 import rclexecm

 try:
    import tarfile
 except:
-    print "RECFILTERROR HELPERNOTFOUND python:tarfile"
+    print("RECFILTERROR HELPERNOTFOUND python:tarfile")
    sys.exit(1);

 class TarExtractor:
@ -38,15 +40,15 @@ class TarExtractor:
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.namen) -1:
            iseof = rclexecm.RclExecM.eofnext
-        if isinstance(ipath, unicode):
-            ipath = ipath.encode("utf-8")
-        return (ok, docdata, ipath, iseof)
+        return (ok, docdata, rclexecm.makebytes(ipath), iseof)

    def openfile(self, params):
        self.currentindex = -1
        try:
            self.tar = tarfile.open(name=params["filename:"],mode='r')
-            self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
+            #self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
+            self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]]
+
            return True
        except:
            return False
--- a/src/filters/rcltext.py
+++ b/src/filters/rcltext.py
@ -1,10 +1,13 @@
 #!/usr/bin/env python

+# Wrapping a text file. Recoll does it internally in most cases, but
+# this is for use by another filter.
+
+from __future__ import print_function
+
 import rclexecm
 import sys

-# Wrapping a text file. Recoll does it internally in most cases, but
-# there is a reason this exists, just can't remember it ...
 class TxtDump:
    def __init__(self, em):
        self.em = em
@ -12,7 +15,7 @@ class TxtDump:
    def extractone(self, params):
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #params["mimetype:"]))
-        if not params.has_key("filename:"):
+        if not "filename:" in params:
            self.em.rclog("extractone: no file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)

--- a/src/filters/rclwar
+++ b/src/filters/rclwar
@ -2,6 +2,8 @@

 # WAR web archive filter for recoll. WAR files are gzipped tar files

+from __future__ import print_function
+
 import rclexecm
 import tarfile

--- a/src/filters/rclxls.py
+++ b/src/filters/rclxls.py
@ -1,5 +1,7 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2

+# Extractor for Excel files.
+# Mso-dumper is not compatible with Python3
 import rclexecm
 import rclexec1
 import xlsxmltocsv
--- a/src/filters/rclxml.py
+++ b/src/filters/rclxml.py
@ -62,7 +62,7 @@ class XMLExtractor:
        self.currentindex = 0

    def extractone(self, params):
-        if not params.has_key("filename:"):
+        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
--- a/src/filters/rclxslt.py
+++ b/src/filters/rclxslt.py
@ -17,36 +17,54 @@

 # Helper module for xslt-based filters

+from __future__ import print_function
+
 import sys

-try:
-    import libxml2
-    import libxslt
-except:
-    print "RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1"
-    sys.exit(1);
+PY2 = sys.version < '3'

-libxml2.substituteEntitiesDefault(1)
-
-def apply_sheet_data(sheet, data):
-    styledoc = libxml2.parseMemory(sheet, len(sheet))
-    style = libxslt.parseStylesheetDoc(styledoc)
-    doc = libxml2.parseMemory(data, len(data))
-    result = style.applyStylesheet(doc, None)
-    res = style.saveResultToString(result)
-    style.freeStylesheet()
-    doc.freeDoc()
-    result.freeDoc()
-    return res
-
-def apply_sheet_file(sheet, fn):
-    styledoc = libxml2.parseMemory(sheet, len(sheet))
-    style = libxslt.parseStylesheetDoc(styledoc)
-    doc = libxml2.parseFile(fn)
-    result = style.applyStylesheet(doc, None)
-    res = style.saveResultToString(result)
-    style.freeStylesheet()
-    doc.freeDoc()
-    result.freeDoc()
-    return res
+if PY2:
+    try:
+        import libxml2
+        import libxslt
+        libxml2.substituteEntitiesDefault(1)
+    except:
+        print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
+        sys.exit(1);
+    def apply_sheet_data(sheet, data):
+        styledoc = libxml2.parseMemory(sheet, len(sheet))
+        style = libxslt.parseStylesheetDoc(styledoc)
+        doc = libxml2.parseMemory(data, len(data))
+        result = style.applyStylesheet(doc, None)
+        res = style.saveResultToString(result)
+        style.freeStylesheet()
+        doc.freeDoc()
+        result.freeDoc()
+        return res
+    def apply_sheet_file(sheet, fn):
+        styledoc = libxml2.parseMemory(sheet, len(sheet))
+        style = libxslt.parseStylesheetDoc(styledoc)
+        doc = libxml2.parseFile(fn)
+        result = style.applyStylesheet(doc, None)
+        res = style.saveResultToString(result)
+        style.freeStylesheet()
+        doc.freeDoc()
+        result.freeDoc()
+        return res
+else:
+    try:
+        from lxml import etree
+    except:
+        print("RECFILTERROR HELPERNOTFOUND python3:lxml")
+        sys.exit(1);
+    def apply_sheet_data(sheet, data):
+        styledoc = etree.fromstring(sheet)
+        transform = etree.XSLT(styledoc)
+        doc = etree.fromstring(data)
+        return etree.tostring(transform(doc))
+    def apply_sheet_file(sheet, fn):
+        styledoc = etree.fromstring(sheet)
+        transform = etree.XSLT(styledoc)
+        doc = etree.parse(fn)
+        return etree.tostring(transform(doc))

--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@ -1,6 +1,24 @@
 #!/usr/bin/env python
+# Copyright (C) 2014 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#

-# Zip file filter for Recoll
+# Zip file extractor for Recoll
+
+from __future__ import print_function

 import os
 import fnmatch
@ -78,9 +96,7 @@ class ZipExtractor:
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.zip.namelist()) -1:
            iseof = rclexecm.RclExecM.eofnext
-        if isinstance(ipath, unicode):
-            ipath = ipath.encode("utf-8")
-        return (ok, docdata, ipath, iseof)
+        return (ok, docdata, rclexecm.makebytes(ipath), iseof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
@ -96,7 +112,14 @@ class ZipExtractor:
                self.skiplist = skipped.split(" ")

        try:
-            self.zip = ZipFile(filename)
+            if rclexecm.PY3:
+                # Note: python3 ZipFile wants an str file name, which
+                # is wrong: file names are binary. But it accepts an
+                # open file, and open() has no such restriction
+                f = open(filename, 'rb')
+                self.zip = ZipFile(f)
+            else:
+                self.zip = ZipFile(filename)
            return True
        except Exception as err:
            self.em.rclog("openfile: failed: [%s]" % err)
--- a/src/filters/xls-dump.py
+++ b/src/filters/xls-dump.py
@ -1,10 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #

+# mso-dumper is not compatible with python3
+
+from __future__ import print_function
+
 import sys, os.path, optparse
 sys.path.append(sys.path[0]+"/msodump.zip")

@ -97,7 +101,7 @@ class XLDumper(object):
                node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

        except Exception as err:
-            print >> sys.stderr, "xls-dump.py: error: %s" % err
+            print("xls-dump.py: error: %s" % err, file=sys.stderr)
            sys.exit(1)

    def dump (self):
--- a/src/filters/xlsxmltocsv.py
+++ b/src/filters/xlsxmltocsv.py
@ -1,4 +1,9 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
+
+# Transform XML output from xls-dump.py into csv format.
+# Note: msodumper is not compatible with python3.
+
+from __future__ import print_function

 import sys
 import xml.sax
@ -28,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
            if "value" in attrs:
                value = attrs["value"].encode("UTF-8")
            else:
-                value = unicode()
+                value = b''
            if "col" in attrs:
                self.cells[int(attrs["col"])] = value
            else:
@ -42,7 +47,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
    def endElement(self, name, ):
        if name == "row":
            curidx = 0
-            for idx, value in self.cells.iteritems():
+            for idx, value in self.cells.items():
                self.output += sepstring * (idx - curidx)
                self.output += "%s%s%s" % (dquote, value, dquote)
                curidx = idx