diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py
index f05a5789..0a05559f 100755
--- a/src/filters/ppt-dump.py
+++ b/src/filters/ppt-dump.py
@@ -52,7 +52,7 @@ class PPTDumper(object):
try:
dirstrm = strm.getDirectoryStreamByName(dirname)
- except Exception, err:
+ except Exception as err:
error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath))
# The previous version was killed by the exception
# here, so the equivalent is to break, but maybe there
diff --git a/src/filters/rcl7z b/src/filters/rcl7z
index c7ea935d..2af73ae6 100755
--- a/src/filters/rcl7z
+++ b/src/filters/rcl7z
@@ -15,7 +15,7 @@ try:
import pylzma
from py7zlib import Archive7z
except:
- print "RECFILTERROR HELPERNOTFOUND python:pylzma"
+ print("RECFILTERROR HELPERNOTFOUND python:pylzma")
sys.exit(1);
try:
@@ -40,19 +40,17 @@ class SevenZipExtractor:
def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath)
- docdata = ""
+ docdata = b''
try:
docdata = self.sevenzip.getmember(ipath).read()
ok = True
- except Exception, err:
+ except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.sevenzip.getnames()) -1:
iseof = rclexecm.RclExecM.eofnext
- if isinstance(ipath, unicode):
- ipath = ipath.encode("utf-8")
- return (ok, docdata, ipath, iseof)
+ return (ok, docdata, rclexecm.makebytes(ipath), iseof)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
@@ -71,7 +69,7 @@ class SevenZipExtractor:
fp = open(filename, 'rb')
self.sevenzip = Archive7z(fp)
return True
- except Exception, err:
+ except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err)
return False
@@ -84,7 +82,7 @@ class SevenZipExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
- except Exception, err:
+ except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):
diff --git a/src/filters/rclaudio b/src/filters/rclaudio
index d717adc1..03f95ad9 100755
--- a/src/filters/rclaudio
+++ b/src/filters/rclaudio
@@ -12,7 +12,7 @@ try:
from mutagen.flac import FLAC
from mutagen.oggvorbis import OggVorbis
except:
- print "RECFILTERROR HELPERNOTFOUND python:mutagen"
+ print("RECFILTERROR HELPERNOTFOUND python:mutagen")
sys.exit(1);
# prototype for the html document we're returning
@@ -42,23 +42,24 @@ class AudioTagExtractor:
#self.em.rclog("extractone %s %s" % (params["filename:"], params["mimetype:"]))
docdata = ""
ok = False
- if not params.has_key("mimetype:") or not params.has_key("filename:"):
+ if "mimetype:" not in params or "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
mimetype = params["mimetype:"]
try:
- if mimetype == "audio/mpeg":
+ if mimetype == b'audio/mpeg':
tags = MP3(filename, ID3=EasyID3)
- elif mimetype == "application/ogg":
+ elif mimetype == b'application/ogg' or \
+ mimetype == b'audio/x-vorbis+ogg':
tags = OggVorbis(filename)
- elif mimetype == "application/x-flac" or \
- mimetype == "audio/x-flac" or \
- mimetype == "audio/flac":
+ elif mimetype == b'application/x-flac' or \
+ mimetype == b'audio/x-flac' or \
+ mimetype == b'audio/flac':
tags = FLAC(filename)
else:
- raise Exception, "Bad mime type %s" % mimetype
- except Exception, err:
+ raise Exception("Bad mime type %s" % mimetype)
+ except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
@@ -66,21 +67,22 @@ class AudioTagExtractor:
artist = ""
title = ""
try:
- album = self.em.htmlescape(tags["album"][0].encode("utf-8"))
+ album = self.em.htmlescape(tags["album"][0])
except:
pass
try:
- artist = self.em.htmlescape(tags["artist"][0].encode("utf-8"))
+ artist = self.em.htmlescape(tags["artist"][0])
except:
pass
try:
- title = self.em.htmlescape(tags["title"][0].encode("utf-8"))
+ title = self.em.htmlescape(tags["title"][0])
except:
pass
self.em.setmimetype("text/html")
- alldata = self.em.htmlescape(tags.pprint().encode("utf-8"))
+ alldata = self.em.htmlescape(tags.pprint())
alldata = alldata.replace("\n", "
")
- docdata = htmltemplate % (album, artist, title, alldata)
+ docdata = (htmltemplate % (album, artist, title, alldata))\
+ .encode('UTF-8')
ok = True
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
diff --git a/src/filters/rclchm b/src/filters/rclchm
index a9c2bbc7..e9cf0291 100755
--- a/src/filters/rclchm
+++ b/src/filters/rclchm
@@ -2,6 +2,11 @@
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
+from __future__ import print_function
+
+# Note: this filter is not converted to python3 because libchm does not
+# have a python3 wrapper at this point (2015-11)
+
# Do we return individual chapters as html pages or concatenate everything?
rclchm_catenate = 0
# Use special html type to allow for mimeconf/mimeview Open magic,
@@ -23,13 +28,13 @@ import rclexecm
try:
from chm import chm,chmlib
except:
- print "RECFILTERROR HELPERNOTFOUND python:chm"
+ print("RECFILTERROR HELPERNOTFOUND python:chm")
sys.exit(1);
try:
from HTMLParser import HTMLParser
except:
- print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
+ print("RECFILTERROR HELPERNOTFOUND python:HTMLParser")
sys.exit(1);
# Small helper routines
@@ -37,11 +42,11 @@ def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
- #print "ResolveObject failed", path
+ #print("ResolveObject failed: %s" % path, file=sys.stderr)
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
- print "RetrieveObject failed", path
+ print("RetrieveObject failed: %s" % path, file=sys.stderr)
return ""
return doc
diff --git a/src/filters/rcldia b/src/filters/rcldia
index 937204f5..1d00ea76 100755
--- a/src/filters/rcldia
+++ b/src/filters/rcldia
@@ -1,5 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import print_function
+
# dia (http://live.gnome.org/Dia) file filter for recoll
# stefan.friedel@iwr.uni-heidelberg.de 2012
#
@@ -66,7 +68,7 @@ class DiaExtractor:
try:
docdata = self.ExtractDiaText()
ok = True
- except Exception, err:
+ except Exception as err:
ok = False
iseof = rclexecm.RclExecM.eofnext
self.em.setmimetype("text/plain")
@@ -76,7 +78,7 @@ class DiaExtractor:
def openfile(self, params):
try:
self.dia = GzipFile(params["filename:"], 'r')
- # Dial files are sometimes not compressed. Quite weirdly,
+ # Dia files are sometimes not compressed. Quite weirdly,
# GzipFile does not complain until we try to read. Have to do it
# here to be able to retry an uncompressed open.
data = self.dia.readline()
diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
index 75078f16..262226cb 100755
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import print_function
import rclexecm
import rclexec1
@@ -11,32 +12,32 @@ import os
class WordProcessData:
def __init__(self, em):
self.em = em
- self.out = ""
- self.cont = ""
+ self.out = b''
+ self.cont = b''
self.gotdata = False
# Line with continued word (ending in -)
# we strip the - which is not nice for actually hyphenated word.
# What to do ?
- self.patcont = re.compile('''[\w][-]$''')
+ self.patcont = re.compile(br'''[\w][-]$''')
# Pattern for breaking continuation at last word start
- self.patws = re.compile('''([\s])([\w]+)(-)$''')
+ self.patws = re.compile(br'''([\s])([\w]+)(-)$''')
def takeLine(self, line):
if not self.gotdata:
- if line == "":
+ if line == b'':
return
- self.out = '
' + self.out = b'
' self.gotdata = True if self.cont: line = self.cont + line self.cont = "" - if line == "\f": - self.out += "
" + if line == b'\f': + self.out += b'
'
return
if self.patcont.search(line):
@@ -47,16 +48,16 @@ class WordProcessData:
line = line[0:match.start(1)]
else:
self.cont = line
- line = ""
+ line = b''
if line:
- self.out += self.em.htmlescape(line) + "
"
+ self.out += self.em.htmlescape(line) + b'
'
else:
- self.out += "
"
+ self.out += b'
'
def wrapData(self):
if self.gotdata:
- self.out += "
- %s -- -''' # RclExecm interface class InfoExtractor: def __init__(self, em): self.file = "" - self.contents = [] + self.contents = [] self.em = em def extractone(self, index): @@ -43,8 +32,13 @@ class InfoExtractor: nodename, docdata = self.contents[index] nodename = self.em.htmlescape(nodename) docdata = self.em.htmlescape(docdata) - - docdata = htmltemplate % (nodename, docdata) + # strange whitespace to avoid changing the module tests (same as old) + docdata = b'\n\n \n
\n ' + \ + docdata + b'\n\n\n' iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.contents) -1: @@ -60,19 +54,18 @@ class InfoExtractor: self.em.rclog("Openfile: %s is not a file" % self.file) return False - cmd = "info --subnodes -o - -f " + self.file + cmd = b'info --subnodes -o - -f ' + self.file nullstream = open("/dev/null", 'w') try: infostream = subprocess.Popen(cmd, shell=True, bufsize=1, stderr=nullstream, stdout=subprocess.PIPE).stdout - except Exception, e: + except Exception as e: # Consider this as permanently fatal. self.em.rclog("Openfile: exec info: %s" % str(e)) - print "RECFILTERROR HELPERNOTFOUND info" + print("RECFILTERROR HELPERNOTFOUND info") sys.exit(1); - self.currentindex = -1 self.contents = InfoSimpleSplitter().splitinfo(self.file, infostream) @@ -117,9 +110,9 @@ class InfoSimpleSplitter: index = 0 listout = [] node_dict = {} - node = "" + node = b'' infofile = os.path.basename(filename) - nodename = "Unknown" + nodename = b'Unknown' for line in fin: @@ -128,41 +121,41 @@ class InfoSimpleSplitter: # beginning with spaces (it's a bug probably, only seen it once) # Maybe we'd actually be better off directly interpreting the # info files - if gotblankline and line.lstrip(" ").startswith("File: "): + if gotblankline and line.lstrip(b' ').startswith(b'File: '): prevnodename = nodename - line = line.rstrip("\n\r") - pairs = line.split(",") - up = "Top" + line = line.rstrip(b'\n\r') + pairs = line.split(b',') + up = b'Top' nodename = str(index) try: for pair in pairs: - name, value = pair.split(':') - name = name.strip(" ") - value = value.strip(" ") - if name == "Node": + name, value = pair.split(b':') + name = name.strip(b' ') + value = value.strip(b' ') + if name == b'Node': nodename = value - if name == "Up": + if name == b'Up': up = value - if name == "File": + if name == b'File': infofile = value - except: - print >> sys.stderr, "rclinfo: bad line in %s: [%s]\n" % \ - (infofile, line) + except Exception as err: + 
print("rclinfo: bad line in %s: [%s] %s\n" % \ + (infofile, line, err), file = sys.stderr) nodename = prevnodename node += line continue - if node_dict.has_key(nodename): - print >> sys.stderr, "Info file", filename, \ - "Dup node: ", nodename + if nodename in node_dict: + print("Info file %s Dup node: %s" % (filename, nodename), \ + file=sys.stderr) node_dict[nodename] = up if index != 0: listout.append((prevnodename, node)) - node = "" + node = b'' index += 1 - if line.rstrip("\n\r") == '': + if line.rstrip(b'\n\r') == b'': gotblankline = 1 else: gotblankline = 0 @@ -170,7 +163,7 @@ class InfoSimpleSplitter: node += line # File done, add last dangling node - if node != "": + if node != b'': listout.append((nodename, node)) # Compute node paths (concatenate "Up" values), to be used @@ -178,34 +171,34 @@ class InfoSimpleSplitter: # the info file tree is bad listout1 = [] for nodename, node in listout: - title = "" + title = b'' loop = 0 error = 0 - while nodename != "Top": - title = nodename + " / " + title - if node_dict.has_key(nodename): + while nodename != b'Top': + title = nodename + b' / ' + title + if nodename in node_dict: nodename = node_dict[nodename] else: - print >> sys.stderr, \ + print( "Infofile: node's Up does not exist: file %s, path %s, up [%s]" % \ - (infofile, title, nodename) + (infofile, title, nodename), file=sys.stderr) error = 1 break loop += 1 if loop > 50: - print >> sys.stderr, "Infofile: bad tree (looping)", \ - infofile + print("Infofile: bad tree (looping) %s" % infofile, \ + file = sys.stderr) error = 1 break if error: continue - if title == "": + if title == b'': title = infofile else: - title = infofile + " / " + title - title = title.rstrip(" / ") + title = infofile + b' / ' + title + title = title.rstrip(b' / ') listout1.append((title, node)) return listout1 diff --git a/src/filters/rclkar b/src/filters/rclkar index 83c0207c..00432b15 100755 --- a/src/filters/rclkar +++ b/src/filters/rclkar @@ -1,6 +1,8 @@ #!/usr/bin/env python # Read 
a .kar midi karaoke file and translate to recoll indexable format +# This does not work with Python3 yet because python:midi doesn't +from __future__ import print_function import rclexecm import sys @@ -15,9 +17,9 @@ except: pass try: - import midi + from midi import midi except: - print "RECFILTERROR HELPERNOTFOUND python:midi" + print("RECFILTERROR HELPERNOTFOUND python:midi") sys.exit(1); try: @@ -106,12 +108,12 @@ class KarTextExtractor: if data: try: data = data.decode(self.encoding, 'ignore') - except Exception, err: + except Exception as err: self.em.rclog("Decode failed: " + str(err)) return "" try: data = data.encode('utf-8') - except Exception, err: + except Exception as err: self.em.rclog("Encode failed: " + str(err)) return "" @@ -127,7 +129,7 @@ class KarTextExtractor: just one our users could use if there is trouble with guessing encodings''' - rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$' + rexp = b'''\(([^\)]+)\)\.[a-zA-Z]+$''' m = re.search(rexp, fn) if m: return m.group(1) @@ -165,7 +167,7 @@ class KarTextExtractor: if count > 0: confidence = 1.0 encoding = code - except Exception, err: + except Exception as err: self.em.rclog("stopwords-based classifier failed: %s" % err) return (encoding, confidence) @@ -177,7 +179,7 @@ class KarTextExtractor: docdata = "" ok = False - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no mime or file name") return (ok, docdata, "", rclexecm.RclExecM.eofnow) filename = params["filename:"] @@ -191,7 +193,7 @@ class KarTextExtractor: self.encoding = "" # Mimetype not used for now - if not params.has_key("mimetype:"): + if "mimetype:" not in params: mimetype = 'audio/x-midi' else: mimetype = params["mimetype:"] @@ -199,8 +201,8 @@ class KarTextExtractor: # Read in and midi-decode the file try: stream = midi.read_midifile(filename) - except Exception, err: - self.em.rclog("extractone: midi extract failed: [%s]" % err) + except Exception as err: + self.em.rclog("extractone: 
read_midifile failed: [%s]" % err) return (ok, docdata, "", rclexecm.RclExecM.eofnow) title = None diff --git a/src/filters/rcllatinclass.py b/src/filters/rcllatinclass.py index 529aadab..ad5d3efe 100755 --- a/src/filters/rcllatinclass.py +++ b/src/filters/rcllatinclass.py @@ -13,6 +13,8 @@ epsilon with dasia (in unicode but not iso). Can this be replaced by either epsi with acute accent ? """ +from __future__ import print_function + import sys import string import glob @@ -117,7 +119,7 @@ if __name__ == "__main__": lang,code,count = classifier.classify(rawtext) if count > 0: - print "%s %s %d" % (code, lang, count) + print("%s %s %d" % (code, lang, count)) else: - print "UNKNOWN UNKNOWN 0" + print("UNKNOWN UNKNOWN 0") diff --git a/src/filters/rclrar b/src/filters/rclrar index b661f510..0846263c 100755 --- a/src/filters/rclrar +++ b/src/filters/rclrar @@ -43,7 +43,7 @@ class RarExtractor: try: rarinfo = self.rar.getinfo(ipath) isdir = rarinfo.isdir() - except Exception, err: + except Exception as err: self.em.rclog("extractone: getinfo failed: [%s]" % err) return (True, docdata, ipath, false) @@ -56,7 +56,7 @@ class RarExtractor: else: docdata = self.rar.read(ipath) ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False else: @@ -89,7 +89,7 @@ class RarExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/filters/rclrtf.py b/src/filters/rclrtf.py index c7031030..5a9a68ac 100755 --- a/src/filters/rclrtf.py +++ b/src/filters/rclrtf.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function import rclexecm import rclexec1 @@ -10,24 +11,24 @@ import os class RTFProcessData: def __init__(self, em): self.em = em - self.out = "" + self.out = b'' self.gothead = 0 - self.patendhead = re.compile('''''') - self.patcharset = re.compile('''^''') + 
self.patcharset = re.compile(b'''^' + "\n" - self.out += line + "\n" + self.out += b'' + b'\n' + self.out += line + b'\n' self.gothead = 1 elif not self.patcharset.search(line): - self.out += line + "\n" + self.out += line + b'\n' else: - self.out += line + "\n" + self.out += line + b'\n' def wrapData(self): return self.out @@ -52,7 +53,7 @@ class RTFFilter: if __name__ == '__main__': if not rclexecm.which("unrtf"): - print("RECFILTERROR HELPERNOTFOUND antiword") + print("RECFILTERROR HELPERNOTFOUND unrtf") sys.exit(1) proto = rclexecm.RclExecM() filter = RTFFilter(proto) diff --git a/src/filters/rcltar b/src/filters/rcltar index 3d6508e0..7dba94d3 100755 --- a/src/filters/rcltar +++ b/src/filters/rcltar @@ -33,7 +33,7 @@ class TarExtractor: else: docdata = self.tar.extractfile(ipath).read() ok = True - except Exception, err: + except Exception as err: ok = False iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.namen) -1: @@ -59,7 +59,7 @@ class TarExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/filters/rclwar b/src/filters/rclwar index 8fe46638..30a95e9f 100755 --- a/src/filters/rclwar +++ b/src/filters/rclwar @@ -15,7 +15,7 @@ class WarExtractor: member = self.tar.extractfile(tarinfo) docdata = member.read() ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof) @@ -26,7 +26,7 @@ class WarExtractor: try: self.tar = tarfile.open(params["filename:"]) return True - except Exception, err: + except Exception as err: self.em.rclog(str(err)) return False @@ -34,7 +34,7 @@ class WarExtractor: ipath = params["ipath:"] try: tarinfo = self.tar.getmember(ipath) - except Exception, err: + except Exception as err: self.em.rclog(str(err)) return (False, "", ipath, 
rclexecm.RclExecM.noteof) return self.extractone(tarinfo) diff --git a/src/filters/rclzip b/src/filters/rclzip index a3afb06e..9d88dc76 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -72,7 +72,7 @@ class ZipExtractor: else: docdata = self.zip.read(ipath) ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False iseof = rclexecm.RclExecM.noteof @@ -98,7 +98,7 @@ class ZipExtractor: try: self.zip = ZipFile(filename) return True - except Exception, err: + except Exception as err: self.em.rclog("openfile: failed: [%s]" % err) return False @@ -111,7 +111,7 @@ class ZipExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/python/recoll/recoll/rclconfig.py b/src/python/recoll/recoll/rclconfig.py index 28cb4e5a..8fc8aaff 100755 --- a/src/python/recoll/recoll/rclconfig.py +++ b/src/python/recoll/recoll/rclconfig.py @@ -75,7 +75,7 @@ class ConfSimple: def getNames(self, sk = ''): if not sk in self.submaps: return None - return self.submaps[sk].keys() + return list(self.submaps[sk].keys()) class ConfTree(ConfSimple): """A ConfTree adds path-hierarchical interpretation of the section keys, diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf index 4e66ddb2..19f3d8d6 100644 --- a/tests/config/recoll.conf +++ b/tests/config/recoll.conf @@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst daemloglevel = 6 daemlogfilename = /tmp/rclmontrace +systemfilecommand = xdg-mime query filetype + indexStripChars = 1 detectxattronly = 1