diff --git a/src/filters/rclchm b/src/filters/rclchm index e9cf0291..e3046d39 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 """Extract Html files from a Microsoft Compiled Html Help file (.chm) Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" diff --git a/src/filters/rcllatinclass.py b/src/filters/rcllatinclass.py index ad5d3efe..fa9504b9 100755 --- a/src/filters/rcllatinclass.py +++ b/src/filters/rcllatinclass.py @@ -16,12 +16,15 @@ with acute accent ? from __future__ import print_function import sys -import string +PY3 = sys.version > '3' +if not PY3: + import string import glob import os import os.path from zipfile import ZipFile + class European8859TextClassifier: def __init__(self, langzip=""): """langzip contains text files. Each text file is named like lang_code.txt @@ -33,9 +36,12 @@ class European8859TextClassifier: self.readlanguages(langzip) # Table to translate from punctuation to spaces - self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "'\n\r" - spaces = len(self.punct) * " " - self.spacetable = string.maketrans(self.punct, spaces) + self.punct = b'''0123456789<>/*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r" + spaces = len(self.punct) * b' ' + if PY3: + self.spacetable = bytes.maketrans(self.punct, spaces) + else: + self.spacetable = string.maketrans(self.punct, spaces) def readlanguages(self, langzip): """Extract the stop words lists from the zip file. @@ -53,7 +59,7 @@ class European8859TextClassifier: text = zip.read(fn) words = text.split() for word in words: - if self.allwords.has_key(word): + if word in self.allwords: self.allwords[word].append((lang, code)) else: self.allwords[word] = [(lang, code)] @@ -64,7 +70,7 @@ class European8859TextClassifier: # Limit to reasonable size. if len(rawtext) > 10000: - i = rawtext.find(" ", 9000) + i = rawtext.find(b' ', 9000) if i == -1: i = 9000 rawtext = rawtext[0:i] @@ -79,9 +85,9 @@ class European8859TextClassifier: dict = {} for w in words: dict[w] = dict.get(w, 0) + 1 - lfreq = [a[0] for a in sorted(dict.iteritems(), \ + lfreq = [a[0] for a in sorted(dict.items(), \ key=lambda entry: entry[1], reverse=True)[0:ntest]] - #print lfreq + #print(lfreq) # Build a dict (lang,code)->matchcount langstats = {} @@ -91,9 +97,9 @@ class European8859TextClassifier: langstats[lc] = langstats.get(lc, 0) + 1 # Get a list of (lang,code) sorted by match count - lcfreq = sorted(langstats.iteritems(), \ + lcfreq = sorted(langstats.items(), \ key=lambda entry: entry[1], reverse=True) - #print lcfreq[0:3] + #print(lcfreq[0:3]) if len(lcfreq) != 0: lc,maxcount = lcfreq[0] maxlang = lc[0] diff --git a/src/filters/rclmpdf.py b/src/filters/rclmpdf.py index e78d76b9..0b5ba836 100755 --- a/src/filters/rclmpdf.py +++ b/src/filters/rclmpdf.py @@ -17,6 +17,8 @@ # Recoll PDF extractor, with support for attachments +from __future__ import print_function + import os import sys import re @@ -89,7 +91,7 @@ class PDFExtractor: "output", tmpdir]) self.attachlist = sorted(os.listdir(tmpdir)) return True - except Exception, e: + except Exception as e: self.em.rclog("extractAttach: failed: %s" % e) # Return true anyway, pdf attachments are no big deal return True diff --git a/src/filters/rclopxml.py b/src/filters/rclopxml.py index 0073e17b..cc9948b0 100755 --- a/src/filters/rclopxml.py +++ b/src/filters/rclopxml.py @@ -146,7 +146,7 @@ class OXExtractor: return stylesheet def extractone(self, params): - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no mime or file name") return (False, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py index c2319e18..211d822a 100755 --- a/src/filters/rclppt.py +++ b/src/filters/rclppt.py @@ -1,4 +1,10 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 + +# Recoll PPT text extractor +# Msodump is not compatible with Python3 AFAIK, so this is stuck to +# Python2 too + +from __future__ import print_function import rclexecm import rclexec1 diff --git a/src/filters/rclpython b/src/filters/rclpython index 990d03b5..362d8a4e 100755 --- a/src/filters/rclpython +++ b/src/filters/rclpython @@ -22,6 +22,8 @@ # - parse script encoding and allow output in any encoding by using unicode # as intermediate +from __future__ import print_function + __version__ = '0.3' __date__ = '2005-07-04' __license__ = 'GPL' @@ -29,9 +31,26 @@ __author__ = 'J # Imports -import cgi, string, sys, cStringIO +import cgi, string, sys +PY2 = sys.version < '3' +if PY2: + import cStringIO +else: + import io import keyword, token, tokenize +if PY2: + def makebytes(data): + if isinstance(data, unicode): + return data.encode("UTF-8") + else: + return data +else: + def makebytes(data): + if isinstance(data, bytes): + return data + else: + return data.encode("UTF-8") ############################################################################# ### Python Source Parser (does Hilighting) @@ -57,7 +76,7 @@ _HTML_HEADER = """\ %%(title)s - + @@ -114,7 +133,7 @@ class Parser: def __init__(self, raw, out=sys.stdout): """ Store the source text. """ - self.raw = string.strip(string.expandtabs(raw)) + self.raw = raw.expandtabs().strip() self.out = out def format(self): @@ -124,35 +143,44 @@ class Parser: self.lines = [0, 0] pos = 0 while 1: - pos = string.find(self.raw, '\n', pos) + 1 + pos = self.raw.find(b'\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # parse the source and write it self.pos = 0 - text = cStringIO.StringIO(self.raw) - self.out.write(self.stylesheet) - self.out.write('
\n')
+        if PY2:
+            text = cStringIO.StringIO(self.raw)
+        else:
+            text = io.BytesIO(self.raw)
+        self.out.write(makebytes(self.stylesheet))
+        self.out.write(b'
\n')
         try:
-            tokenize.tokenize(text.readline, self)
-        except tokenize.TokenError, ex:
+            if PY2:
+                tokenize.tokenize(text.readline, self)
+            else:
+                for a,b,c,d,e in tokenize.tokenize(text.readline):
+                    self(a,b,c,d,e)
+        except tokenize.TokenError as ex:
             msg = ex[0]
             line = ex[1][0]
             self.out.write("

ERROR: %s

%s\n" % ( msg, self.raw[self.lines[line]:])) - except IndentationError, ex: + except IndentationError as ex: msg = ex[0] self.out.write("

ERROR: %s

\n" % (msg)) - self.out.write('\n
') + self.out.write(b'\n
') - def __call__(self, toktype, toktext, (srow,scol), (erow,ecol), line): + def __call__(self, toktype, toktext, startpos, endpos, line): """ Token handler. """ if 0: - print "type", toktype, token.tok_name[toktype], "text", toktext, - print "start", srow,scol, "end", erow,ecol, "
" - + print("type %s %s text %s start %s %s end %s %s
\n" % \ + (toktype, token.tok_name[toktype], toktext, \ + srow, scol,erow,ecol)) + srow, scol = startpos + erow, ecol = endpos # calculate new positions oldpos = self.pos newpos = self.lines[srow] + scol @@ -160,7 +188,7 @@ class Parser: # handle newlines if toktype in [token.NEWLINE, tokenize.NL]: - self.out.write('\n') + self.out.write(b'\n') return # send the original whitespace, if needed @@ -180,9 +208,9 @@ class Parser: css_class = _css_classes.get(toktype, 'text') # send text - self.out.write('' % (css_class,)) - self.out.write(cgi.escape(toktext)) - self.out.write('') + self.out.write(makebytes('' % (css_class,))) + self.out.write(makebytes(cgi.escape(toktext))) + self.out.write(b'') def colorize_file(file=None, outstream=sys.stdout, standalone=True): @@ -205,7 +233,7 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True): filename = 'STREAM' elif file is not None: try: - sourcefile = open(file) + sourcefile = open(file, 'rb') filename = basename(file) except IOError: raise SystemExit("File %s unknown." % file) @@ -215,22 +243,26 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True): source = sourcefile.read() if standalone: - outstream.write(_HTML_HEADER % {'title': filename}) + outstream.write(makebytes(_HTML_HEADER % {'title': filename})) Parser(source, out=outstream).format() if standalone: - outstream.write(_HTML_FOOTER) + outstream.write(makebytes(_HTML_FOOTER)) if file: sourcefile.close() if __name__ == "__main__": import os + if PY2: + out = sys.stdout + else: + out = sys.stdout.buffer if os.environ.get('PATH_TRANSLATED'): filepath = os.environ.get('PATH_TRANSLATED') - print 'Content-Type: text/html; charset="iso-8859-1"\n' - colorize_file(filepath) + print('Content-Type: text/html; charset="iso-8859-1"\n') + colorize_file(filepath, out) elif len(sys.argv) > 1: filepath = sys.argv[1] - colorize_file(filepath) + colorize_file(filepath, out) else: colorize_file() diff --git a/src/filters/rclrar b/src/filters/rclrar index 0846263c..f11c2a39 100755 --- a/src/filters/rclrar +++ b/src/filters/rclrar @@ -18,12 +18,14 @@ # Free Software Foundation, Inc., # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +from __future__ import print_function + import sys import rclexecm try: from rarfile import RarFile except: - print "RECFILTERROR HELPERNOTFOUND python:rarfile" + print("RECFILTERROR HELPERNOTFOUND python:rarfile") sys.exit(1); # Requires RarFile python module. Try "sudo pip install rarfile" @@ -67,9 +69,7 @@ class RarExtractor: iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.rar.namelist()) -1: iseof = rclexecm.RclExecM.eofnext - if isinstance(ipath, unicode): - ipath = ipath.encode("utf-8") - return (ok, docdata, ipath, iseof) + return (ok, docdata, rclexecm.makebytes(ipath), iseof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): diff --git a/src/filters/rclsoff.py b/src/filters/rclsoff.py index d6a5b8c7..67e08014 100755 --- a/src/filters/rclsoff.py +++ b/src/filters/rclsoff.py @@ -16,6 +16,8 @@ # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ###################################### +from __future__ import print_function + import sys import rclexecm import rclxslt @@ -130,19 +132,19 @@ class OOExtractor: self.currentindex = 0 def extractone(self, params): - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no mime or file name") return (False, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] try: - zip = ZipFile(fn) + zip = ZipFile(fn.decode('UTF-8')) except Exception as err: - self.em.rclog("unzip failed: " + str(err)) + self.em.rclog("unzip failed: %s" % err) return (False, "", "", rclexecm.RclExecM.eofnow) - docdata = '' + docdata = b'' try: metadata = zip.read("meta.xml") @@ -160,9 +162,9 @@ class OOExtractor: if content: res = rclxslt.apply_sheet_data(stylesheet_content, content) docdata += res - docdata += '' + docdata += b'' except Exception as err: - self.em.rclog("bad data in %s" % fn) + self.em.rclog("bad data in %s: %s" % (fn, err)) return (False, "", "", rclexecm.RclExecM.eofnow) return (True, docdata, "", rclexecm.RclExecM.eofnext) diff --git a/src/filters/rclsvg.py b/src/filters/rclsvg.py index 7fde9f2e..ef99664b 100755 --- a/src/filters/rclsvg.py +++ b/src/filters/rclsvg.py @@ -16,6 +16,8 @@ # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ###################################### +from __future__ import print_function + import sys import rclexecm import rclxslt @@ -104,7 +106,7 @@ class SVGExtractor: self.currentindex = 0 def extractone(self, params): - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no mime or file name") return (False, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] diff --git a/src/filters/rcltar b/src/filters/rcltar index 7dba94d3..f597bb79 100755 --- a/src/filters/rcltar +++ b/src/filters/rcltar @@ -6,12 +6,14 @@ # It works not only for tar-files, but automatically for gzipped and # bzipped tar-files at well. +from __future__ import print_function + import rclexecm try: import tarfile except: - print "RECFILTERROR HELPERNOTFOUND python:tarfile" + print("RECFILTERROR HELPERNOTFOUND python:tarfile") sys.exit(1); class TarExtractor: @@ -38,15 +40,15 @@ class TarExtractor: iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.namen) -1: iseof = rclexecm.RclExecM.eofnext - if isinstance(ipath, unicode): - ipath = ipath.encode("utf-8") - return (ok, docdata, ipath, iseof) + return (ok, docdata, rclexecm.makebytes(ipath), iseof) def openfile(self, params): self.currentindex = -1 try: self.tar = tarfile.open(name=params["filename:"],mode='r') - self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())] + #self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())] + self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]] + return True except: return False diff --git a/src/filters/rcltext.py b/src/filters/rcltext.py index 2605f047..847a80b2 100755 --- a/src/filters/rcltext.py +++ b/src/filters/rcltext.py @@ -1,10 +1,13 @@ #!/usr/bin/env python +# Wrapping a text file. Recoll does it internally in most cases, but +# this is for use by another filter. + +from __future__ import print_function + import rclexecm import sys -# Wrapping a text file. Recoll does it internally in most cases, but -# there is a reason this exists, just can't remember it ... class TxtDump: def __init__(self, em): self.em = em @@ -12,7 +15,7 @@ class TxtDump: def extractone(self, params): #self.em.rclog("extractone %s %s" % (params["filename:"], \ #params["mimetype:"])) - if not params.has_key("filename:"): + if not "filename:" in params: self.em.rclog("extractone: no file name") return (False, "", "", rclexecm.RclExecM.eofnow) diff --git a/src/filters/rclwar b/src/filters/rclwar index 30a95e9f..8b0dc35f 100755 --- a/src/filters/rclwar +++ b/src/filters/rclwar @@ -2,6 +2,8 @@ # WAR web archive filter for recoll. War file are gzipped tar files +from __future__ import print_function + import rclexecm import tarfile diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py index cbae1692..a8e1bf97 100755 --- a/src/filters/rclxls.py +++ b/src/filters/rclxls.py @@ -1,5 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 +# Extractor for Excel files. +# Mso-dumper is not compatible with Python3 import rclexecm import rclexec1 import xlsxmltocsv diff --git a/src/filters/rclxml.py b/src/filters/rclxml.py index 78e93f8a..1fd993f2 100755 --- a/src/filters/rclxml.py +++ b/src/filters/rclxml.py @@ -62,7 +62,7 @@ class XMLExtractor: self.currentindex = 0 def extractone(self, params): - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no mime or file name") return (False, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] diff --git a/src/filters/rclxslt.py b/src/filters/rclxslt.py index 4b0e2e8c..2441294e 100644 --- a/src/filters/rclxslt.py +++ b/src/filters/rclxslt.py @@ -17,36 +17,54 @@ # Helper module for xslt-based filters +from __future__ import print_function + import sys -try: - import libxml2 - import libxslt -except: - print "RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1" - sys.exit(1); +PY2 = sys.version < '3' -libxml2.substituteEntitiesDefault(1) - -def apply_sheet_data(sheet, data): - styledoc = libxml2.parseMemory(sheet, len(sheet)) - style = libxslt.parseStylesheetDoc(styledoc) - doc = libxml2.parseMemory(data, len(data)) - result = style.applyStylesheet(doc, None) - res = style.saveResultToString(result) - style.freeStylesheet() - doc.freeDoc() - result.freeDoc() - return res - -def apply_sheet_file(sheet, fn): - styledoc = libxml2.parseMemory(sheet, len(sheet)) - style = libxslt.parseStylesheetDoc(styledoc) - doc = libxml2.parseFile(fn) - result = style.applyStylesheet(doc, None) - res = style.saveResultToString(result) - style.freeStylesheet() - doc.freeDoc() - result.freeDoc() - return res +if PY2: + try: + import libxml2 + import libxslt + libxml2.substituteEntitiesDefault(1) + except: + print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1") + sys.exit(1); + def apply_sheet_data(sheet, data): + styledoc = libxml2.parseMemory(sheet, len(sheet)) + style = libxslt.parseStylesheetDoc(styledoc) + doc = libxml2.parseMemory(data, len(data)) + result = style.applyStylesheet(doc, None) + res = style.saveResultToString(result) + style.freeStylesheet() + doc.freeDoc() + result.freeDoc() + return res + def apply_sheet_file(sheet, fn): + styledoc = libxml2.parseMemory(sheet, len(sheet)) + style = libxslt.parseStylesheetDoc(styledoc) + doc = libxml2.parseFile(fn) + result = style.applyStylesheet(doc, None) + res = style.saveResultToString(result) + style.freeStylesheet() + doc.freeDoc() + result.freeDoc() + return res +else: + try: + from lxml import etree + except: + print("RECFILTERROR HELPERNOTFOUND python3:lxml") + sys.exit(1); + def apply_sheet_data(sheet, data): + styledoc = etree.fromstring(sheet) + transform = etree.XSLT(styledoc) + doc = etree.fromstring(data) + return etree.tostring(transform(doc)) + def apply_sheet_file(sheet, fn): + styledoc = etree.fromstring(sheet) + transform = etree.XSLT(styledoc) + doc = etree.parse(fn) + return etree.tostring(transform(doc)) diff --git a/src/filters/rclzip b/src/filters/rclzip index 9d88dc76..82974e54 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -1,6 +1,24 @@ #!/usr/bin/env python +# Copyright (C) 2014 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# -# Zip file filter for Recoll +# Zip file extractor for Recoll + +from __future__ import print_function import os import fnmatch @@ -78,9 +96,7 @@ class ZipExtractor: iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.zip.namelist()) -1: iseof = rclexecm.RclExecM.eofnext - if isinstance(ipath, unicode): - ipath = ipath.encode("utf-8") - return (ok, docdata, ipath, iseof) + return (ok, docdata, rclexecm.makebytes(ipath), iseof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): @@ -96,7 +112,14 @@ class ZipExtractor: self.skiplist = skipped.split(" ") try: - self.zip = ZipFile(filename) + if rclexecm.PY3: + # Note: python3 ZipFile wants an str file name, which + # is wrong: file names are binary. But it accepts an + # open file, and open() has no such restriction + f = open(filename, 'rb') + self.zip = ZipFile(f) + else: + self.zip = ZipFile(filename) return True except Exception as err: self.em.rclog("openfile: failed: [%s]" % err) diff --git a/src/filters/xls-dump.py b/src/filters/xls-dump.py index d826654f..15613f35 100755 --- a/src/filters/xls-dump.py +++ b/src/filters/xls-dump.py @@ -1,10 +1,14 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # +# mso-dumper is not compatible with python3 + +from __future__ import print_function + import sys, os.path, optparse sys.path.append(sys.path[0]+"/msodump.zip") @@ -97,7 +101,7 @@ class XLDumper(object): node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8) except Exception as err: - print >> sys.stderr, "xls-dump.py: error: %s" % err + print("xls-dump.py: error: %s" % err, file=sys.stderr) sys.exit(1) def dump (self): diff --git a/src/filters/xlsxmltocsv.py b/src/filters/xlsxmltocsv.py index 72850d3a..cfc39304 100755 --- a/src/filters/xlsxmltocsv.py +++ b/src/filters/xlsxmltocsv.py @@ -1,4 +1,9 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 + +# Transform XML output from xls-dump.py into csv format. +# Note: msodumper is not compatible with python3. + +from __future__ import print_function import sys import xml.sax @@ -28,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): if "value" in attrs: value = attrs["value"].encode("UTF-8") else: - value = unicode() + value = b'' if "col" in attrs: self.cells[int(attrs["col"])] = value else: @@ -42,7 +47,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): def endElement(self, name, ): if name == "row": curidx = 0 - for idx, value in self.cells.iteritems(): + for idx, value in self.cells.items(): self.output += sepstring * (idx - curidx) self.output += "%s%s%s" % (dquote, value, dquote) curidx = idx