From f344e8feddcbc5d31a4d6130f62d91fd8a14ff95 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Fri, 6 Nov 2015 16:49:03 +0100
Subject: [PATCH] first pass at converting the filters for python 2/3 compat
---
src/filters/ppt-dump.py | 2 +-
src/filters/rcl7z | 14 +--
src/filters/rclaudio | 30 ++---
src/filters/rclchm | 13 ++-
src/filters/rcldia | 6 +-
src/filters/rcldoc.py | 39 +++----
src/filters/rclepub | 15 +--
src/filters/rclexec1.py | 6 +-
src/filters/rclexecm.py | 152 +++++++++++++++-----------
src/filters/rclics | 70 ++++++------
src/filters/rclimg.py | 38 ++++---
src/filters/rclinfo | 95 ++++++++--------
src/filters/rclkar | 22 ++--
src/filters/rcllatinclass.py | 6 +-
src/filters/rclrar | 6 +-
src/filters/rclrtf.py | 19 ++--
src/filters/rcltar | 4 +-
src/filters/rclwar | 6 +-
src/filters/rclzip | 6 +-
src/python/recoll/recoll/rclconfig.py | 2 +-
tests/config/recoll.conf | 2 +
21 files changed, 298 insertions(+), 255 deletions(-)
diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py
index f05a5789..0a05559f 100755
--- a/src/filters/ppt-dump.py
+++ b/src/filters/ppt-dump.py
@@ -52,7 +52,7 @@ class PPTDumper(object):
try:
dirstrm = strm.getDirectoryStreamByName(dirname)
- except Exception, err:
+ except Exception as err:
error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath))
# The previous version was killed by the exception
# here, so the equivalent is to break, but maybe there
diff --git a/src/filters/rcl7z b/src/filters/rcl7z
index c7ea935d..2af73ae6 100755
--- a/src/filters/rcl7z
+++ b/src/filters/rcl7z
@@ -15,7 +15,7 @@ try:
import pylzma
from py7zlib import Archive7z
except:
- print "RECFILTERROR HELPERNOTFOUND python:pylzma"
+ print("RECFILTERROR HELPERNOTFOUND python:pylzma")
sys.exit(1);
try:
@@ -40,19 +40,17 @@ class SevenZipExtractor:
def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath)
- docdata = ""
+ docdata = b''
try:
docdata = self.sevenzip.getmember(ipath).read()
ok = True
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.sevenzip.getnames()) -1:
iseof = rclexecm.RclExecM.eofnext
- if isinstance(ipath, unicode):
- ipath = ipath.encode("utf-8")
- return (ok, docdata, ipath, iseof)
+ return (ok, docdata, rclexecm.makebytes(ipath), iseof)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
@@ -71,7 +69,7 @@ class SevenZipExtractor:
fp = open(filename, 'rb')
self.sevenzip = Archive7z(fp)
return True
- except Exception, err:
+ except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err)
return False
@@ -84,7 +82,7 @@ class SevenZipExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
- except Exception, err:
+ except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):
diff --git a/src/filters/rclaudio b/src/filters/rclaudio
index d717adc1..03f95ad9 100755
--- a/src/filters/rclaudio
+++ b/src/filters/rclaudio
@@ -12,7 +12,7 @@ try:
from mutagen.flac import FLAC
from mutagen.oggvorbis import OggVorbis
except:
- print "RECFILTERROR HELPERNOTFOUND python:mutagen"
+ print("RECFILTERROR HELPERNOTFOUND python:mutagen")
sys.exit(1);
# prototype for the html document we're returning
@@ -42,23 +42,24 @@ class AudioTagExtractor:
#self.em.rclog("extractone %s %s" % (params["filename:"], params["mimetype:"]))
docdata = ""
ok = False
- if not params.has_key("mimetype:") or not params.has_key("filename:"):
+ if not "mimetype:" in params or not "filename:" in params:
self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
mimetype = params["mimetype:"]
try:
- if mimetype == "audio/mpeg":
+ if mimetype == b'audio/mpeg':
tags = MP3(filename, ID3=EasyID3)
- elif mimetype == "application/ogg":
+ elif mimetype == b'application/ogg' or \
+ mimetype == b'audio/x-vorbis+ogg':
tags = OggVorbis(filename)
- elif mimetype == "application/x-flac" or \
- mimetype == "audio/x-flac" or \
- mimetype == "audio/flac":
+ elif mimetype == b'application/x-flac' or \
+ mimetype == 'audio/x-flac' or \
+ mimetype == b'audio/flac':
tags = FLAC(filename)
else:
- raise Exception, "Bad mime type %s" % mimetype
- except Exception, err:
+ raise Exception("Bad mime type %s" % mimetype)
+ except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
@@ -66,21 +67,22 @@ class AudioTagExtractor:
artist = ""
title = ""
try:
- album = self.em.htmlescape(tags["album"][0].encode("utf-8"))
+ album = self.em.htmlescape(tags["album"][0])
except:
pass
try:
- artist = self.em.htmlescape(tags["artist"][0].encode("utf-8"))
+ artist = self.em.htmlescape(tags["artist"][0])
except:
pass
try:
- title = self.em.htmlescape(tags["title"][0].encode("utf-8"))
+ title = self.em.htmlescape(tags["title"][0])
except:
pass
self.em.setmimetype("text/html")
- alldata = self.em.htmlescape(tags.pprint().encode("utf-8"))
+ alldata = self.em.htmlescape(tags.pprint())
alldata = alldata.replace("\n", "
")
- docdata = htmltemplate % (album, artist, title, alldata)
+ docdata = (htmltemplate % (album, artist, title, alldata))\
+ .encode('UTF-8')
ok = True
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
diff --git a/src/filters/rclchm b/src/filters/rclchm
index a9c2bbc7..e9cf0291 100755
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@@ -2,6 +2,11 @@
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
+from __future__ import print_function
+
+# Note: this is not converted to python3 because libchm does not have a
+# python3 wrapper at this point (2015-11)
+
# Do we return individual chapters as html pages or concatenate everything?
rclchm_catenate = 0
# Use special html type to allow for mimeconf/mimeview Open magic,
@@ -23,13 +28,13 @@ import rclexecm
try:
from chm import chm,chmlib
except:
- print "RECFILTERROR HELPERNOTFOUND python:chm"
+ print("RECFILTERROR HELPERNOTFOUND python:chm")
sys.exit(1);
try:
from HTMLParser import HTMLParser
except:
- print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
+ print("RECFILTERROR HELPERNOTFOUND python:HTMLParser")
sys.exit(1);
# Small helper routines
@@ -37,11 +42,11 @@ def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
- #print "ResolveObject failed", path
+ #print("ResolveObject failed: %s" % path, file=sys.stderr)
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
- print "RetrieveObject failed", path
+ print("RetrieveObject failed: %s" % path, file=sys.stderr)
return ""
return doc
diff --git a/src/filters/rcldia b/src/filters/rcldia
index 937204f5..1d00ea76 100755
--- a/src/filters/rcldia
+++ b/src/filters/rcldia
@@ -1,5 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import print_function
+
# dia (http://live.gnome.org/Dia) file filter for recoll
# stefan.friedel@iwr.uni-heidelberg.de 2012
#
@@ -66,7 +68,7 @@ class DiaExtractor:
try:
docdata = self.ExtractDiaText()
ok = True
- except Exception, err:
+ except Exception as err:
ok = False
iseof = rclexecm.RclExecM.eofnext
self.em.setmimetype("text/plain")
@@ -76,7 +78,7 @@ class DiaExtractor:
def openfile(self, params):
try:
self.dia = GzipFile(params["filename:"], 'r')
- # Dial files are sometimes not compressed. Quite weirdly,
+ # Dia files are sometimes not compressed. Quite weirdly,
# GzipFile does not complain until we try to read. Have to do it
# here to be able to retry an uncompressed open.
data = self.dia.readline()
diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
index 75078f16..262226cb 100755
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import print_function
import rclexecm
import rclexec1
@@ -11,32 +12,32 @@ import os
class WordProcessData:
def __init__(self, em):
self.em = em
- self.out = ""
- self.cont = ""
+ self.out = b''
+ self.cont = b''
self.gotdata = False
# Line with continued word (ending in -)
# we strip the - which is not nice for actually hyphenated word.
# What to do ?
- self.patcont = re.compile('''[\w][-]$''')
+ self.patcont = re.compile(b'''[\w][-]$''')
# Pattern for breaking continuation at last word start
- self.patws = re.compile('''([\s])([\w]+)(-)$''')
+ self.patws = re.compile(b'''([\s])([\w]+)(-)$''')
def takeLine(self, line):
if not self.gotdata:
- if line == "":
+ if line == b'':
return
- self.out = '' + \
- '' + \
- ''
+ self.out = b'
' + \
+ b'' + \
+ b''
self.gotdata = True
if self.cont:
line = self.cont + line
self.cont = ""
- if line == "\f":
- self.out += "
"
+ if line == b'\f':
+ self.out += '
'
return
if self.patcont.search(line):
@@ -47,16 +48,16 @@ class WordProcessData:
line = line[0:match.start(1)]
else:
self.cont = line
- line = ""
+ line = b''
if line:
- self.out += self.em.htmlescape(line) + "
"
+ self.out += self.em.htmlescape(line) + b'
'
else:
- self.out += "
"
+ self.out += b'
'
def wrapData(self):
if self.gotdata:
- self.out += "
"
+ self.out += b'
'
self.em.setmimetype("text/html")
return self.out
@@ -65,7 +66,7 @@ class WordProcessData:
# output HTML
class WordPassData:
def __init__(self, em):
- self.out = ""
+ self.out = b''
self.em = em
def takeLine(self, line):
@@ -96,8 +97,8 @@ class WordFilter:
return False
def mimetype(self, fn):
- rtfprolog ="{\\rtf1"
- docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
+ rtfprolog = b'{\\rtf1'
+ docprolog = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
try:
f = open(fn, "rb")
except:
@@ -132,7 +133,7 @@ class WordFilter:
mt = self.mimetype(fn)
self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
if mt == "text/plain":
- return ([python, os.path.join(self.execdir, "rcltext.py")],
+ return (["python", os.path.join(self.execdir, "rcltext.py")],
WordPassData(self.em))
elif mt == "text/rtf":
cmd = ["python", os.path.join(self.execdir, "rclrtf.py"),
diff --git a/src/filters/rclepub b/src/filters/rclepub
index 1c50592f..c4868d26 100755
--- a/src/filters/rclepub
+++ b/src/filters/rclepub
@@ -1,5 +1,6 @@
#!/usr/bin/env python
"""Extract Html content from an EPUB file (.chm)"""
+from __future__ import print_function
rclepub_html_mtype = "text/html"
@@ -12,7 +13,7 @@ import rclexecm
try:
import epub
except:
- print "RECFILTERROR HELPERNOTFOUND python:epub"
+ print("RECFILTERROR HELPERNOTFOUND python:epub")
sys.exit(1);
class rclEPUB:
@@ -63,11 +64,11 @@ class rclEPUB:
if item is None:
raise Exception("Item not found for id %s" % (id,))
doc = self.book.read_item(item)
- doc = re.sub('''[hH][eE][aA][dD]>''',
- '''''', doc)
+ doc = re.sub(b'''[hH][eE][aA][dD]>''',
+ b'''''', doc)
self.em.setmimetype(rclepub_html_mtype)
return (True, doc, id, iseof)
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
return (False, "", id, iseof)
@@ -76,11 +77,11 @@ class rclEPUB:
self.currentindex = -1
self.contents = []
try:
- self.book = epub.open(params["filename:"])
- except Exception, err:
+ self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
+ except Exception as err:
self.em.rclog("openfile: epub.open failed: [%s]" % err)
return False
- for id, item in self.book.opf.manifest.iteritems():
+ for id, item in self.book.opf.manifest.items():
if item.media_type == 'application/xhtml+xml':
self.contents.append(id)
return True
diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py
index ffa68c53..d26d9b60 100644
--- a/src/filters/rclexec1.py
+++ b/src/filters/rclexec1.py
@@ -26,6 +26,8 @@
# this would be to slow. So this helps implementing a permanent script
# to repeatedly execute single commands.
+from __future__ import print_function
+
import subprocess
import rclexecm
@@ -74,8 +76,8 @@ class Executor:
# params["mimetype:"]))
self.flt.reset()
ok = False
- if not params.has_key("filename:"):
- self.em.rclog("extractone: no mime or file name")
+ if not "filename:" in params:
+ self.em.rclog("extractone: no file name")
return (ok, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py
index adcb54e5..26c9764e 100644
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@@ -16,6 +16,9 @@
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
########################################################
## Recoll multifilter communication module and utilities
+#
+# All data is binary. This is important for Python3.
+# All parameter names are converted to and processed as str/unicode.
from __future__ import print_function
@@ -26,6 +29,21 @@ import shutil
import getopt
import rclconfig
+PY3 = sys.version > '3'
+
+if PY3:
+ def makebytes(data):
+ if isinstance(data, bytes):
+ return data
+ else:
+ return data.encode("UTF-8")
+else:
+ def makebytes(data):
+ if isinstance(data, unicode):
+ return data.encode("UTF-8")
+ else:
+ return data
+
my_config = rclconfig.RclConfig()
############################################
@@ -33,7 +51,7 @@ my_config = rclconfig.RclConfig()
# communication protocol with the recollindex process. It calls the
# object specific of the document type to actually get the data.
class RclExecM:
- noteof = 0
+ noteof = 0
eofnext = 1
eofnow = 2
@@ -46,7 +64,7 @@ class RclExecM:
self.myname = os.path.basename(sys.argv[0])
except:
self.myname = "???"
- self.mimetype = ""
+ self.mimetype = b""
if os.environ.get("RECOLL_FILTER_MAXMEMBERKB"):
self.maxmembersize = \
@@ -60,7 +78,7 @@ class RclExecM:
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
self.debugfile = None
if self.debugfile:
- self.errfout = open(self.debugfile, "ab")
+ self.errfout = open(self.debugfile, "a")
else:
self.errfout = sys.stderr
@@ -93,77 +111,84 @@ class RclExecM:
# Note: tried replacing this with a multiple replacer according to
# http://stackoverflow.com/a/15221068, which was **10 times** slower
def htmlescape(self, txt):
- # This must stay first (it somehow had managed to skip after
- # the next line, with rather interesting results)
- txt = txt.replace("&", "&")
-
- txt = txt.replace("<", "<")
- txt = txt.replace(">", ">")
- txt = txt.replace('"', """)
+ # The & replacement must stay first (it had somehow ended up
+ # after the next replace, with rather interesting results)
+ try:
+ txt = txt.replace(b'&', b'&').replace(b'<', b'<').\
+ replace(b'>', b'>').replace(b'"', b'"')
+ except:
+ txt = txt.replace("&", "&").replace("<", "<").\
+ replace(">", ">").replace("\"", """)
return txt
# Our worker sometimes knows the mime types of the data it sends
def setmimetype(self, mt):
- self.mimetype = mt
+ self.mimetype = makebytes(mt)
# Read single parameter from process input: line with param name and size
- # followed by data.
+ # followed by data. The param name is returned as str/unicode, the data
+ # as bytes
def readparam(self):
- s = sys.stdin.readline()
- if s == '':
+ if PY3:
+ inf = sys.stdin.buffer
+ else:
+ inf = sys.stdin
+ s = inf.readline()
+ if s == b'':
sys.exit(0)
-# self.rclog(": EOF on input", 1, 0)
- s = s.rstrip("\n")
+ s = s.rstrip(b'\n')
- if s == "":
- return ("","")
+ if s == b'':
+ return ('', b'')
l = s.split()
if len(l) != 2:
- self.rclog("bad line: [" + s + "]", 1, 1)
+ self.rclog(b'bad line: [' + s + b']', 1, 1)
- paramname = l[0].lower()
+ paramname = l[0].decode('ASCII').lower()
paramsize = int(l[1])
if paramsize > 0:
- paramdata = sys.stdin.read(paramsize)
+ paramdata = inf.read(paramsize)
if len(paramdata) != paramsize:
self.rclog("Bad read: wanted %d, got %d" %
- (paramsize, len(paramdata)), 1,1)
+ (paramsize, len(paramdata)), 1, 1)
else:
- paramdata = ""
+ paramdata = b''
#self.rclog("paramname [%s] paramsize %d value [%s]" %
# (paramname, paramsize, paramdata))
return (paramname, paramdata)
+ if PY3:
+ def senditem(self, nm, len, data):
+ sys.stdout.buffer.write(makebytes("%s: %d\n" % (nm, len)))
+ self.breakwrite(sys.stdout.buffer, makebytes(data))
+ else:
+ def senditem(self, nm, len, data):
+ sys.stdout.write(makebytes("%s: %d\n" % (nm, len)))
+ self.breakwrite(sys.stdout, makebytes(data))
+
# Send answer: document, ipath, possible eof.
def answer(self, docdata, ipath, iseof = noteof, iserror = noerror):
if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow:
- if isinstance(docdata, unicode):
- self.rclog("GOT UNICODE for ipath [%s]" % (ipath,))
- docdata = docdata.encode("UTF-8")
-
- print("Document: %d" % len(docdata))
- self.breakwrite(sys.stdout, docdata)
+ self.senditem("Document", len(docdata), docdata)
if len(ipath):
- print("Ipath: %d" % len(ipath))
- sys.stdout.write(ipath)
+ self.senditem("Ipath", len(ipath), ipath)
if len(self.mimetype):
- print("Mimetype: %d" % len(self.mimetype))
- sys.stdout.write(self.mimetype)
+ self.senditem("Mimetype", len(self.mimetype), self.mimetype)
# If we're at the end of the contents, say so
if iseof == RclExecM.eofnow:
- print("Eofnow: 0")
+ self.senditem("Eofnow", 0, b'')
elif iseof == RclExecM.eofnext:
- print("Eofnext: 0")
+ self.senditem("Eofnext", 0, b'')
if iserror == RclExecM.subdocerror:
- print("Subdocerror: 0")
+ self.senditem("Subdocerror", 0, b'')
elif iserror == RclExecM.fileerror:
- print("Fileerror: 0")
+ self.senditem("Fileerror", 0, b'')
# End of message
print()
@@ -173,7 +198,8 @@ class RclExecM:
def processmessage(self, processor, params):
# We must have a filename entry (even empty). Else exit
- if not params.has_key("filename:"):
+ if "filename:" not in params:
+ print("%s" % params, file=sys.stderr)
self.rclog("no filename ??", 1, 1)
# If we're given a file name, open it.
@@ -182,7 +208,7 @@ class RclExecM:
if not processor.openfile(params):
self.answer("", "", iserror = RclExecM.fileerror)
return
- except Exception, err:
+ except Exception as err:
self.rclog("processmessage: openfile raised: [%s]" % err)
self.answer("", "", iserror = RclExecM.fileerror)
return
@@ -192,11 +218,11 @@ class RclExecM:
eof = True
self.mimetype = ""
try:
- if params.has_key("ipath:") and len(params["ipath:"]):
+ if "ipath:" in params and len(params["ipath:"]):
ok, data, ipath, eof = processor.getipath(params)
else:
ok, data, ipath, eof = processor.getnext(params)
- except Exception, err:
+ except Exception as err:
self.answer("", "", eof, RclExecM.fileerror)
return
@@ -311,7 +337,7 @@ def main(proto, extract):
actAsSingle = False
debugDumpData = False
- ipath = ""
+ ipath = b""
args = sys.argv[1:]
opts, args = getopt.getopt(args, "hdsi:w:")
@@ -321,7 +347,7 @@ def main(proto, extract):
elif opt in ['-s']:
actAsSingle = True
elif opt in ['-i']:
- ipath = arg
+ ipath = makebytes(arg)
elif opt in ['-w']:
ret = which(arg)
if ret:
@@ -344,17 +370,17 @@ def main(proto, extract):
lst = fileout.split(':')
mimetype = lst[len(lst)-1].strip()
lst = mimetype.split(';')
- return lst[0].strip()
+ return makebytes(lst[0].strip())
def mimetype_with_xdg(f):
cmd = 'xdg-mime query filetype "' + f + '"'
- return os.popen(cmd).read().strip()
+ return makebytes(os.popen(cmd).read().strip())
- def debprint(s):
+ def debprint(out, s):
if not actAsSingle:
- print(s)
+ proto.breakwrite(out, makebytes(s+'\n'))
- params = {'filename:': args[0]}
+ params = {'filename:': makebytes(args[0])}
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
mimetype = mimetype_with_xdg(args[0])
params['mimetype:'] = mimetype
@@ -363,19 +389,20 @@ def main(proto, extract):
print("Open error", file=sys.stderr)
sys.exit(1)
- if ipath != "" or actAsSingle:
+ if PY3:
+ ioout = sys.stdout.buffer
+ else:
+ ioout = sys.stdout
+ if ipath != b"" or actAsSingle:
params['ipath:'] = ipath
ok, data, ipath, eof = extract.getipath(params)
if ok:
- debprint("== Found entry for ipath %s (mimetype [%s]):" % \
+ debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \
(ipath, proto.mimetype))
- if isinstance(data, unicode):
- bdata = data.encode("UTF-8")
- else:
- bdata = data
+ bdata = makebytes(data)
if debugDumpData or actAsSingle:
- proto.breakwrite(sys.stdout, bdata)
- print()
+ proto.breakwrite(ioout, bdata)
+ ioout.write(b'\n')
sys.exit(0)
else:
print("Got error, eof %d"%eof, file=sys.stderr)
@@ -386,15 +413,12 @@ def main(proto, extract):
ok, data, ipath, eof = extract.getnext(params)
if ok:
ecnt = ecnt + 1
- debprint("== Entry %d ipath %s (mimetype [%s]):" % \
- (ecnt, ipath, proto.mimetype))
- if isinstance(data, unicode):
- bdata = data.encode("UTF-8")
- else:
- bdata = data
+ bdata = makebytes(data)
+ debprint(ioout, "== Entry %d dlen %d ipath %s (mimetype [%s]):" % \
+ (ecnt, len(data), ipath, proto.mimetype))
if debugDumpData:
- proto.breakwrite(sys.stdout, bdata)
- print()
+ proto.breakwrite(ioout, bdata)
+ ioout.write(b'\n')
if eof != RclExecM.noteof:
sys.exit(0)
else:
diff --git a/src/filters/rclics b/src/filters/rclics
index 6ad3f632..3f28a057 100755
--- a/src/filters/rclics
+++ b/src/filters/rclics
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import print_function
# Read an ICS file, break it into "documents" which are events, todos,
# or journal entries, and interface with recoll execm
@@ -13,36 +14,36 @@ import rclexecm
import sys
# Decide how we'll process the file.
-modules = ('internal', 'icalendar', 'vobject')
-usemodule = 'internal'
+modules = ("internal", "icalendar", "vobject")
+usemodule = "internal"
forcevobject = 0
-if usemodule != 'internal':
+if usemodule != "internal":
try:
if forcevobject:
raise Exception
from icalendar import Calendar, Event
- usemodule = 'icalendar'
+ usemodule = "icalendar"
except:
try:
import vobject
- usemodule = 'vobject'
+ usemodule = "vobject"
except:
- print "RECFILTERROR HELPERNOTFOUND python:icalendar"
- print "RECFILTERROR HELPERNOTFOUND python:vobject"
+ print("RECFILTERROR HELPERNOTFOUND python:icalendar")
+ print("RECFILTERROR HELPERNOTFOUND python:vobject")
sys.exit(1);
class IcalExtractor:
def __init__(self, em):
self.file = ""
- self.contents = []
+ self.contents = []
self.em = em
def extractone(self, index):
if index >= len(self.contents):
return(False, "", "", True)
docdata = self.contents[index]
- #self.em.rclog(docdata)
+ #self.em.rclog(docdata)
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.contents) -1:
@@ -55,32 +56,32 @@ class IcalExtractor:
self.file = params["filename:"]
try:
- calstr = open(self.file, 'rb')
- except Exception, e:
+ calstr = open(self.file, "rb")
+ except Exception as e:
self.em.rclog("Openfile: open: %s" % str(e))
return False
self.currentindex = -1
- if usemodule == 'internal':
+ if usemodule == "internal":
self.contents = ICalSimpleSplitter().splitcalendar(calstr)
- elif usemodule == 'icalendar':
+ elif usemodule == "icalendar":
try:
cal = Calendar.from_string(calstr.read())
- except Exception, e:
+ except Exception as e:
self.em.rclog("Openfile: read or parse error: %s" % str(e))
return False
self.contents = cal.walk()
self.contents = [item.as_string() for item in self.contents
- if (item.name == 'VEVENT' or item.name == 'VTODO'
- or item.name == 'VJOURNAL')]
+ if (item.name == "VEVENT" or item.name == "VTODO"
+ or item.name == "VJOURNAL")]
else:
try:
cal = vobject.readOne(calstr)
- except Exception, e:
+ except Exception as e:
self.em.rclog("Openfile: cant parse object: %s" % str(e))
return False
- for lstnm in ('vevent_list', 'vtodo_list', 'vjournal_list'):
+ for lstnm in ("vevent_list", "vtodo_list", "vjournal_list"):
lst = getattr(cal, lstnm, [])
for ev in lst:
self.contents.append(ev.serialize())
@@ -90,7 +91,10 @@ class IcalExtractor:
def getipath(self, params):
try:
- index = int(params["ipath:"])
+ if params["ipath:"] == b'':
+ index = 0
+ else:
+ index = int(params["ipath:"])
except:
return (False, "", "", True)
return self.extractone(index)
@@ -100,7 +104,7 @@ class IcalExtractor:
if self.currentindex == -1:
# Return "self" doc
self.currentindex = 0
- self.em.setmimetype('text/plain')
+ self.em.setmimetype(b'text/plain')
if len(self.contents) == 0:
eof = rclexecm.RclExecM.eofnext
else:
@@ -121,44 +125,44 @@ class ICalSimpleSplitter:
# Note that if an 'interesting' element is nested inside another one,
# it will not be extracted (stay as text in external event). This is
# not an issue and I don't think it can happen with the current list
- interesting = ('VTODO', 'VEVENT', 'VJOURNAL')
+ interesting = (b'VTODO', b'VEVENT', b'VJOURNAL')
def splitcalendar(self, fin):
- curblkname = ''
- curblk = ''
+ curblkname = b''
+ curblk = b''
lo = []
for line in fin:
line = line.rstrip()
- if line == '':
+ if line == b'':
continue
if curblkname:
- curblk = curblk + line + "\n"
+ curblk = curblk + line + b'\n'
- l = line.split(":")
+ l = line.split(b':')
if len(l) < 2:
continue
# If not currently inside a block and we see an
# 'interesting' BEGIN, start block
- if curblkname == '' and l[0].upper() == "BEGIN" :
+ if curblkname == b'' and l[0].upper() == b'BEGIN':
name = l[1].upper()
if name in ICalSimpleSplitter.interesting:
curblkname = name
- curblk = curblk + line + "\n"
+ curblk = curblk + line + b'\n'
# If currently accumulating block lines, check for end
- if curblkname and l[0].upper() == "END" and \
+ if curblkname and l[0].upper() == b'END' and \
l[1].upper() == curblkname:
lo.append(curblk)
- curblkname = ''
- curblk = ''
+ curblkname = b''
+ curblk = b''
if curblk:
lo.append(curblk)
- curblkname = ''
- curblk = ''
+ curblkname = b''
+ curblk = b''
return lo
diff --git a/src/filters/rclimg.py b/src/filters/rclimg.py
index ac21d130..8892a9ae 100755
--- a/src/filters/rclimg.py
+++ b/src/filters/rclimg.py
@@ -1,11 +1,12 @@
#!/usr/bin/env python
-# Python-based Image Tag extractor for Recoll. This is less thorough than the
-# Perl-based rclimg script, but useful if you don't want to have to install Perl
-# (e.g. on Windows).
+# Python-based Image Tag extractor for Recoll. This is less thorough
+# than the Perl-based rclimg script, but useful if you don't want to
+# have to install Perl (e.g. on Windows).
#
# Uses pyexiv2. Also tried Pillow, found it useless for tags.
#
+from __future__ import print_function
import sys
import os
@@ -15,7 +16,7 @@ import re
try:
import pyexiv2
except:
- print "RECFILTERROR HELPERNOTFOUND python:pyexiv2"
+ print("RECFILTERROR HELPERNOTFOUND python:pyexiv2")
sys.exit(1);
khexre = re.compile('.*\.0[xX][0-9a-fA-F]+$')
@@ -48,7 +49,7 @@ class ImgTagExtractor:
def extractone(self, params):
#self.em.rclog("extractone %s" % params["filename:"])
ok = False
- if not params.has_key("filename:"):
+ if "filename:" not in params:
self.em.rclog("extractone: no file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
@@ -62,11 +63,11 @@ class ImgTagExtractor:
# we skip numeric keys and undecoded makernote data
if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
mdic[k] = str(metadata[k].raw_value)
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, "", "", rclexecm.RclExecM.eofnow)
- docdata = "\n"
+ docdata = b'\n'
ttdata = set()
for k in pyexiv2_titles:
@@ -77,25 +78,28 @@ class ImgTagExtractor:
for v in ttdata:
v = v.replace('[', '').replace(']', '').replace("'", "")
title += v + " "
- docdata += '' + title + '\n'
+ docdata += rclexecm.makebytes("" + title + "\n")
for k in exiv2_dates:
if k in mdic:
# Recoll wants: %Y-%m-%d %H:%M:%S.
# We get 2014:06:27 14:58:47
- dt = mdic[k].replace(':', '-', 2)
- docdata += '\n'
+ dt = mdic[k].replace(":", "-", 2)
+ docdata += b'\n'
break
- for k,v in mdic.iteritems():
+ for k,v in mdic.items():
if k == 'Xmp.digiKam.TagsList':
- docdata += '\n'
+ docdata += b'\n'
- docdata += "\n"
- for k,v in mdic.iteritems():
- docdata += k + " : " + self.em.htmlescape(mdic[k]) + "
\n"
- docdata += ""
+ docdata += b'\n'
+ for k,v in mdic.items():
+ docdata += rclexecm.makebytes(k + " : " + \
+ self.em.htmlescape(mdic[k]) + "
\n")
+ docdata += b''
self.em.setmimetype("text/html")
diff --git a/src/filters/rclinfo b/src/filters/rclinfo
index c6b8a8b1..575121cc 100755
--- a/src/filters/rclinfo
+++ b/src/filters/rclinfo
@@ -3,6 +3,7 @@
# Read a file in GNU info format and output its nodes as subdocs,
# interfacing with recoll execm
+from __future__ import print_function
import rclexecm
import sys
@@ -16,24 +17,12 @@ import subprocess
# Some info source docs contain charset info like:
# @documentencoding ISO-2022-JP
# But this seems to be absent from outputs.
-htmltemplate = '''
-
-
- %s
-
-
-
-
- %s
-
-
-'''
# RclExecm interface
class InfoExtractor:
def __init__(self, em):
self.file = ""
- self.contents = []
+ self.contents = []
self.em = em
def extractone(self, index):
@@ -43,8 +32,13 @@ class InfoExtractor:
nodename, docdata = self.contents[index]
nodename = self.em.htmlescape(nodename)
docdata = self.em.htmlescape(docdata)
-
- docdata = htmltemplate % (nodename, docdata)
+ # The strange whitespace reproduces the old output, so the module tests do not change
+ docdata = b'\n\n \n ' + nodename + \
+ b'\n' + \
+ ' \n' + \
+ b' \n \n' + \
+ b' \n ' + \
+ docdata + b'\n
\n\n'
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.contents) -1:
@@ -60,19 +54,18 @@ class InfoExtractor:
self.em.rclog("Openfile: %s is not a file" % self.file)
return False
- cmd = "info --subnodes -o - -f " + self.file
+ cmd = b'info --subnodes -o - -f ' + self.file
nullstream = open("/dev/null", 'w')
try:
infostream = subprocess.Popen(cmd, shell=True, bufsize=1,
stderr=nullstream,
stdout=subprocess.PIPE).stdout
- except Exception, e:
+ except Exception as e:
# Consider this as permanently fatal.
self.em.rclog("Openfile: exec info: %s" % str(e))
- print "RECFILTERROR HELPERNOTFOUND info"
+ print("RECFILTERROR HELPERNOTFOUND info")
sys.exit(1);
-
self.currentindex = -1
self.contents = InfoSimpleSplitter().splitinfo(self.file, infostream)
@@ -117,9 +110,9 @@ class InfoSimpleSplitter:
index = 0
listout = []
node_dict = {}
- node = ""
+ node = b''
infofile = os.path.basename(filename)
- nodename = "Unknown"
+ nodename = b'Unknown'
for line in fin:
@@ -128,41 +121,41 @@ class InfoSimpleSplitter:
# beginning with spaces (it's a bug probably, only seen it once)
# Maybe we'd actually be better off directly interpreting the
# info files
- if gotblankline and line.lstrip(" ").startswith("File: "):
+ if gotblankline and line.lstrip(b' ').startswith(b'File: '):
prevnodename = nodename
- line = line.rstrip("\n\r")
- pairs = line.split(",")
- up = "Top"
+ line = line.rstrip(b'\n\r')
+ pairs = line.split(b',')
+ up = b'Top'
nodename = str(index)
try:
for pair in pairs:
- name, value = pair.split(':')
- name = name.strip(" ")
- value = value.strip(" ")
- if name == "Node":
+ name, value = pair.split(b':')
+ name = name.strip(b' ')
+ value = value.strip(b' ')
+ if name == b'Node':
nodename = value
- if name == "Up":
+ if name == b'Up':
up = value
- if name == "File":
+ if name == b'File':
infofile = value
- except:
- print >> sys.stderr, "rclinfo: bad line in %s: [%s]\n" % \
- (infofile, line)
+ except Exception as err:
+ print("rclinfo: bad line in %s: [%s] %s\n" % \
+ (infofile, line, err), file = sys.stderr)
nodename = prevnodename
node += line
continue
- if node_dict.has_key(nodename):
- print >> sys.stderr, "Info file", filename, \
- "Dup node: ", nodename
+ if nodename in node_dict:
+ print("Info file %s Dup node: %s" % (filename, nodename), \
+ file=sys.stderr)
node_dict[nodename] = up
if index != 0:
listout.append((prevnodename, node))
- node = ""
+ node = b''
index += 1
- if line.rstrip("\n\r") == '':
+ if line.rstrip(b'\n\r') == b'':
gotblankline = 1
else:
gotblankline = 0
@@ -170,7 +163,7 @@ class InfoSimpleSplitter:
node += line
# File done, add last dangling node
- if node != "":
+ if node != b'':
listout.append((nodename, node))
# Compute node paths (concatenate "Up" values), to be used
@@ -178,34 +171,34 @@ class InfoSimpleSplitter:
# the info file tree is bad
listout1 = []
for nodename, node in listout:
- title = ""
+ title = b''
loop = 0
error = 0
- while nodename != "Top":
- title = nodename + " / " + title
- if node_dict.has_key(nodename):
+ while nodename != b'Top':
+ title = nodename + b' / ' + title
+ if nodename in node_dict:
nodename = node_dict[nodename]
else:
- print >> sys.stderr, \
+ print(
"Infofile: node's Up does not exist: file %s, path %s, up [%s]" % \
- (infofile, title, nodename)
+ (infofile, title, nodename), file = sys.stderr)
error = 1
break
loop += 1
if loop > 50:
- print >> sys.stderr, "Infofile: bad tree (looping)", \
- infofile
+ print("Infofile: bad tree (looping) %s" % infofile, \
+ file = sys.stderr)
error = 1
break
if error:
continue
- if title == "":
+ if title == b'':
title = infofile
else:
- title = infofile + " / " + title
- title = title.rstrip(" / ")
+ title = infofile + b' / ' + title
+ title = title.rstrip(b' / ')
listout1.append((title, node))
return listout1
diff --git a/src/filters/rclkar b/src/filters/rclkar
index 83c0207c..00432b15 100755
--- a/src/filters/rclkar
+++ b/src/filters/rclkar
@@ -1,6 +1,8 @@
#!/usr/bin/env python
# Read a .kar midi karaoke file and translate to recoll indexable format
+# This does not work with Python3 yet because python:midi doesn't support it
+from __future__ import print_function
import rclexecm
import sys
@@ -15,9 +17,9 @@ except:
pass
try:
- import midi
+ from midi import midi
except:
- print "RECFILTERROR HELPERNOTFOUND python:midi"
+ print("RECFILTERROR HELPERNOTFOUND python:midi")
sys.exit(1);
try:
@@ -106,12 +108,12 @@ class KarTextExtractor:
if data:
try:
data = data.decode(self.encoding, 'ignore')
- except Exception, err:
+ except Exception as err:
self.em.rclog("Decode failed: " + str(err))
return ""
try:
data = data.encode('utf-8')
- except Exception, err:
+ except Exception as err:
self.em.rclog("Encode failed: " + str(err))
return ""
@@ -127,7 +129,7 @@ class KarTextExtractor:
just one our users could use if there is trouble with guessing
encodings'''
- rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$'
+ rexp = br'''\(([^\)]+)\)\.[a-zA-Z]+$'''
m = re.search(rexp, fn)
if m:
return m.group(1)
@@ -165,7 +167,7 @@ class KarTextExtractor:
if count > 0:
confidence = 1.0
encoding = code
- except Exception, err:
+ except Exception as err:
self.em.rclog("stopwords-based classifier failed: %s" % err)
return (encoding, confidence)
@@ -177,7 +179,7 @@ class KarTextExtractor:
docdata = ""
ok = False
- if not params.has_key("filename:"):
+ if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
@@ -191,7 +193,7 @@ class KarTextExtractor:
self.encoding = ""
# Mimetype not used for now
- if not params.has_key("mimetype:"):
+ if "mimetype:" not in params:
mimetype = 'audio/x-midi'
else:
mimetype = params["mimetype:"]
@@ -199,8 +201,8 @@ class KarTextExtractor:
# Read in and midi-decode the file
try:
stream = midi.read_midifile(filename)
- except Exception, err:
- self.em.rclog("extractone: midi extract failed: [%s]" % err)
+ except Exception as err:
+ self.em.rclog("extractone: read_midifile failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
title = None
diff --git a/src/filters/rcllatinclass.py b/src/filters/rcllatinclass.py
index 529aadab..ad5d3efe 100755
--- a/src/filters/rcllatinclass.py
+++ b/src/filters/rcllatinclass.py
@@ -13,6 +13,8 @@ epsilon with dasia (in unicode but not iso). Can this be replaced by either epsi
with acute accent ?
"""
+from __future__ import print_function
+
import sys
import string
import glob
@@ -117,7 +119,7 @@ if __name__ == "__main__":
lang,code,count = classifier.classify(rawtext)
if count > 0:
- print "%s %s %d" % (code, lang, count)
+ print("%s %s %d" % (code, lang, count))
else:
- print "UNKNOWN UNKNOWN 0"
+ print("UNKNOWN UNKNOWN 0")
diff --git a/src/filters/rclrar b/src/filters/rclrar
index b661f510..0846263c 100755
--- a/src/filters/rclrar
+++ b/src/filters/rclrar
@@ -43,7 +43,7 @@ class RarExtractor:
try:
rarinfo = self.rar.getinfo(ipath)
isdir = rarinfo.isdir()
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: getinfo failed: [%s]" % err)
return (True, docdata, ipath, false)
@@ -56,7 +56,7 @@ class RarExtractor:
else:
docdata = self.rar.read(ipath)
ok = True
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
else:
@@ -89,7 +89,7 @@ class RarExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
- except Exception, err:
+ except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):
diff --git a/src/filters/rclrtf.py b/src/filters/rclrtf.py
index c7031030..5a9a68ac 100755
--- a/src/filters/rclrtf.py
+++ b/src/filters/rclrtf.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import print_function
import rclexecm
import rclexec1
@@ -10,24 +11,24 @@ import os
class RTFProcessData:
def __init__(self, em):
self.em = em
- self.out = ""
+ self.out = b''
self.gothead = 0
- self.patendhead = re.compile('''''')
- self.patcharset = re.compile('''^''')
+ self.patcharset = re.compile(b'''^' + "\n"
- self.out += line + "\n"
+ self.out += b'' + b'\n'
+ self.out += line + b'\n'
self.gothead = 1
elif not self.patcharset.search(line):
- self.out += line + "\n"
+ self.out += line + b'\n'
else:
- self.out += line + "\n"
+ self.out += line + b'\n'
def wrapData(self):
return self.out
@@ -52,7 +53,7 @@ class RTFFilter:
if __name__ == '__main__':
if not rclexecm.which("unrtf"):
- print("RECFILTERROR HELPERNOTFOUND antiword")
+ print("RECFILTERROR HELPERNOTFOUND unrtf")
sys.exit(1)
proto = rclexecm.RclExecM()
filter = RTFFilter(proto)
diff --git a/src/filters/rcltar b/src/filters/rcltar
index 3d6508e0..7dba94d3 100755
--- a/src/filters/rcltar
+++ b/src/filters/rcltar
@@ -33,7 +33,7 @@ class TarExtractor:
else:
docdata = self.tar.extractfile(ipath).read()
ok = True
- except Exception, err:
+ except Exception as err:
ok = False
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.namen) -1:
@@ -59,7 +59,7 @@ class TarExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
- except Exception, err:
+ except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):
diff --git a/src/filters/rclwar b/src/filters/rclwar
index 8fe46638..30a95e9f 100755
--- a/src/filters/rclwar
+++ b/src/filters/rclwar
@@ -15,7 +15,7 @@ class WarExtractor:
member = self.tar.extractfile(tarinfo)
docdata = member.read()
ok = True
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof)
@@ -26,7 +26,7 @@ class WarExtractor:
try:
self.tar = tarfile.open(params["filename:"])
return True
- except Exception, err:
+ except Exception as err:
self.em.rclog(str(err))
return False
@@ -34,7 +34,7 @@ class WarExtractor:
ipath = params["ipath:"]
try:
tarinfo = self.tar.getmember(ipath)
- except Exception, err:
+ except Exception as err:
self.em.rclog(str(err))
return (False, "", ipath, rclexecm.RclExecM.noteof)
return self.extractone(tarinfo)
diff --git a/src/filters/rclzip b/src/filters/rclzip
index a3afb06e..9d88dc76 100755
--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@@ -72,7 +72,7 @@ class ZipExtractor:
else:
docdata = self.zip.read(ipath)
ok = True
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
iseof = rclexecm.RclExecM.noteof
@@ -98,7 +98,7 @@ class ZipExtractor:
try:
self.zip = ZipFile(filename)
return True
- except Exception, err:
+ except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err)
return False
@@ -111,7 +111,7 @@ class ZipExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
- except Exception, err:
+ except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):
diff --git a/src/python/recoll/recoll/rclconfig.py b/src/python/recoll/recoll/rclconfig.py
index 28cb4e5a..8fc8aaff 100755
--- a/src/python/recoll/recoll/rclconfig.py
+++ b/src/python/recoll/recoll/rclconfig.py
@@ -75,7 +75,7 @@ class ConfSimple:
def getNames(self, sk = ''):
if not sk in self.submaps:
return None
- return self.submaps[sk].keys()
+ return list(self.submaps[sk].keys())
class ConfTree(ConfSimple):
"""A ConfTree adds path-hierarchical interpretation of the section keys,
diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf
index 4e66ddb2..19f3d8d6 100644
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst
daemloglevel = 6
daemlogfilename = /tmp/rclmontrace
+systemfilecommand = xdg-mime query filetype
+
indexStripChars = 1
detectxattronly = 1