diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py index f05a5789..0a05559f 100755 --- a/src/filters/ppt-dump.py +++ b/src/filters/ppt-dump.py @@ -52,7 +52,7 @@ class PPTDumper(object): try: dirstrm = strm.getDirectoryStreamByName(dirname) - except Exception, err: + except Exception as err: error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath)) # The previous version was killed by the exception # here, so the equivalent is to break, but maybe there diff --git a/src/filters/rcl7z b/src/filters/rcl7z index c7ea935d..2af73ae6 100755 --- a/src/filters/rcl7z +++ b/src/filters/rcl7z @@ -15,7 +15,7 @@ try: import pylzma from py7zlib import Archive7z except: - print "RECFILTERROR HELPERNOTFOUND python:pylzma" + print("RECFILTERROR HELPERNOTFOUND python:pylzma") sys.exit(1); try: @@ -40,19 +40,17 @@ class SevenZipExtractor: def extractone(self, ipath): #self.em.rclog("extractone: [%s]" % ipath) - docdata = "" + docdata = b'' try: docdata = self.sevenzip.getmember(ipath).read() ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.sevenzip.getnames()) -1: iseof = rclexecm.RclExecM.eofnext - if isinstance(ipath, unicode): - ipath = ipath.encode("utf-8") - return (ok, docdata, ipath, iseof) + return (ok, docdata, rclexecm.makebytes(ipath), iseof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): @@ -71,7 +69,7 @@ class SevenZipExtractor: fp = open(filename, 'rb') self.sevenzip = Archive7z(fp) return True - except Exception, err: + except Exception as err: self.em.rclog("openfile: failed: [%s]" % err) return False @@ -84,7 +82,7 @@ class SevenZipExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/filters/rclaudio b/src/filters/rclaudio index d717adc1..03f95ad9 100755 --- a/src/filters/rclaudio +++ b/src/filters/rclaudio @@ -12,7 +12,7 @@ try: from mutagen.flac import FLAC from mutagen.oggvorbis import OggVorbis except: - print "RECFILTERROR HELPERNOTFOUND python:mutagen" + print("RECFILTERROR HELPERNOTFOUND python:mutagen") sys.exit(1); # prototype for the html document we're returning @@ -42,23 +42,24 @@ class AudioTagExtractor: #self.em.rclog("extractone %s %s" % (params["filename:"], params["mimetype:"])) docdata = "" ok = False - if not params.has_key("mimetype:") or not params.has_key("filename:"): + if not "mimetype:" in params or not "filename:" in params: self.em.rclog("extractone: no mime or file name") return (ok, docdata, "", rclexecm.RclExecM.eofnow) filename = params["filename:"] mimetype = params["mimetype:"] try: - if mimetype == "audio/mpeg": + if mimetype == b'audio/mpeg': tags = MP3(filename, ID3=EasyID3) - elif mimetype == "application/ogg": + elif mimetype == b'application/ogg' or \ + mimetype == b'audio/x-vorbis+ogg': tags = OggVorbis(filename) - elif mimetype == "application/x-flac" or \ - mimetype == "audio/x-flac" or \ - mimetype == "audio/flac": + elif mimetype == b'application/x-flac' or \ + mimetype == 'audio/x-flac' or \ + mimetype == b'audio/flac': tags = FLAC(filename) else: - raise Exception, "Bad mime type %s" % mimetype - except Exception, err: + raise Exception("Bad mime type %s" % mimetype) + except Exception as err: self.em.rclog("extractone: extract failed: [%s]" % err) return (ok, docdata, "", rclexecm.RclExecM.eofnow) @@ -66,21 +67,22 @@ class AudioTagExtractor: artist = "" title = "" try: - album = self.em.htmlescape(tags["album"][0].encode("utf-8")) + album = self.em.htmlescape(tags["album"][0]) except: pass try: - artist = self.em.htmlescape(tags["artist"][0].encode("utf-8")) + artist = self.em.htmlescape(tags["artist"][0]) except: pass try: - title = self.em.htmlescape(tags["title"][0].encode("utf-8")) + title = self.em.htmlescape(tags["title"][0]) except: pass self.em.setmimetype("text/html") - alldata = self.em.htmlescape(tags.pprint().encode("utf-8")) + alldata = self.em.htmlescape(tags.pprint()) alldata = alldata.replace("\n", "
") - docdata = htmltemplate % (album, artist, title, alldata) + docdata = (htmltemplate % (album, artist, title, alldata))\ + .encode('UTF-8') ok = True return (ok, docdata, "", rclexecm.RclExecM.eofnext) diff --git a/src/filters/rclchm b/src/filters/rclchm index a9c2bbc7..e9cf0291 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -2,6 +2,11 @@ """Extract Html files from a Microsoft Compiled Html Help file (.chm) Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)""" +from __future__ import print_function + +# Note: this is not converted to python3, libchm does not have a +# python3 wrapper at this point (2015-11) + # Do we return individual chapters as html pages or concatenate everything? rclchm_catenate = 0 # Use special html type to allow for mimeconf/mimeview Open magic, @@ -23,13 +28,13 @@ import rclexecm try: from chm import chm,chmlib except: - print "RECFILTERROR HELPERNOTFOUND python:chm" + print("RECFILTERROR HELPERNOTFOUND python:chm") sys.exit(1); try: from HTMLParser import HTMLParser except: - print "RECFILTERROR HELPERNOTFOUND python:HTMLParser" + print("RECFILTERROR HELPERNOTFOUND python:HTMLParser") sys.exit(1); # Small helper routines @@ -37,11 +42,11 @@ def getfile(chmfile, path): """Extract internal file text from chm object, given path""" res, ui = chmfile.ResolveObject(path) if res != chmlib.CHM_RESOLVE_SUCCESS: - #print "ResolveObject failed", path + #print("ResolveObject failed: %s" % path, file=sys.stderr) return "" res, doc = chmfile.RetrieveObject(ui) if not res: - print "RetrieveObject failed", path + print("RetrieveObject failed: %s" % path, file=sys.stderr) return "" return doc diff --git a/src/filters/rcldia b/src/filters/rcldia index 937204f5..1d00ea76 100755 --- a/src/filters/rcldia +++ b/src/filters/rcldia @@ -1,5 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from __future__ import print_function + # dia (http://live.gnome.org/Dia) file filter for recoll # stefan.friedel@iwr.uni-heidelberg.de 2012 # @@ -66,7 +68,7 @@ class DiaExtractor: try: docdata = self.ExtractDiaText() ok = True - except Exception, err: + except Exception as err: ok = False iseof = rclexecm.RclExecM.eofnext self.em.setmimetype("text/plain") @@ -76,7 +78,7 @@ class DiaExtractor: def openfile(self, params): try: self.dia = GzipFile(params["filename:"], 'r') - # Dial files are sometimes not compressed. Quite weirdly, + # Dia files are sometimes not compressed. Quite weirdly, # GzipFile does not complain until we try to read. Have to do it # here to be able to retry an uncompressed open. data = self.dia.readline() diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py index 75078f16..262226cb 100755 --- a/src/filters/rcldoc.py +++ b/src/filters/rcldoc.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function import rclexecm import rclexec1 @@ -11,32 +12,32 @@ import os class WordProcessData: def __init__(self, em): self.em = em - self.out = "" - self.cont = "" + self.out = b'' + self.cont = b'' self.gotdata = False # Line with continued word (ending in -) # we strip the - which is not nice for actually hyphenated word. # What to do ? - self.patcont = re.compile('''[\w][-]$''') + self.patcont = re.compile(b'''[\w][-]$''') # Pattern for breaking continuation at last word start - self.patws = re.compile('''([\s])([\w]+)(-)$''') + self.patws = re.compile(b'''([\s])([\w]+)(-)$''') def takeLine(self, line): if not self.gotdata: - if line == "": + if line == b'': return - self.out = '' + \ - '' + \ - '

' + self.out = b'' + \ + b'' + \ + b'

' self.gotdata = True if self.cont: line = self.cont + line self.cont = "" - if line == "\f": - self.out += "


" + if line == b'\f': + self.out += '


' return if self.patcont.search(line): @@ -47,16 +48,16 @@ class WordProcessData: line = line[0:match.start(1)] else: self.cont = line - line = "" + line = b'' if line: - self.out += self.em.htmlescape(line) + "
" + self.out += self.em.htmlescape(line) + b'
' else: - self.out += "
" + self.out += b'
' def wrapData(self): if self.gotdata: - self.out += "

" + self.out += b'

' self.em.setmimetype("text/html") return self.out @@ -65,7 +66,7 @@ class WordProcessData: # output HTML class WordPassData: def __init__(self, em): - self.out = "" + self.out = b'' self.em = em def takeLine(self, line): @@ -96,8 +97,8 @@ class WordFilter: return False def mimetype(self, fn): - rtfprolog ="{\\rtf1" - docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" + rtfprolog = b'{\\rtf1' + docprolog = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1' try: f = open(fn, "rb") except: @@ -132,7 +133,7 @@ class WordFilter: mt = self.mimetype(fn) self.em.rclog("rcldoc.py: actual MIME type %s" % mt) if mt == "text/plain": - return ([python, os.path.join(self.execdir, "rcltext.py")], + return (["python", os.path.join(self.execdir, "rcltext.py")], WordPassData(self.em)) elif mt == "text/rtf": cmd = ["python", os.path.join(self.execdir, "rclrtf.py"), diff --git a/src/filters/rclepub b/src/filters/rclepub index 1c50592f..c4868d26 100755 --- a/src/filters/rclepub +++ b/src/filters/rclepub @@ -1,5 +1,6 @@ #!/usr/bin/env python """Extract Html content from an EPUB file (.chm)""" +from __future__ import print_function rclepub_html_mtype = "text/html" @@ -12,7 +13,7 @@ import rclexecm try: import epub except: - print "RECFILTERROR HELPERNOTFOUND python:epub" + print("RECFILTERROR HELPERNOTFOUND python:epub") sys.exit(1); class rclEPUB: @@ -63,11 +64,11 @@ class rclEPUB: if item is None: raise Exception("Item not found for id %s" % (id,)) doc = self.book.read_item(item) - doc = re.sub('''''', - '''''', doc) + doc = re.sub(b'''''', + b'''''', doc) self.em.setmimetype(rclepub_html_mtype) return (True, doc, id, iseof) - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) return (False, "", id, iseof) @@ -76,11 +77,11 @@ class rclEPUB: self.currentindex = -1 self.contents = [] try: - self.book = epub.open(params["filename:"]) - except Exception, err: + self.book = epub.open_epub(params["filename:"].decode('UTF-8')) + except Exception as err: self.em.rclog("openfile: epub.open failed: [%s]" % err) return False - for id, item in self.book.opf.manifest.iteritems(): + for id, item in self.book.opf.manifest.items(): if item.media_type == 'application/xhtml+xml': self.contents.append(id) return True diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py index ffa68c53..d26d9b60 100644 --- a/src/filters/rclexec1.py +++ b/src/filters/rclexec1.py @@ -26,6 +26,8 @@ # this would be to slow. So this helps implementing a permanent script # to repeatedly execute single commands. +from __future__ import print_function + import subprocess import rclexecm @@ -74,8 +76,8 @@ class Executor: # params["mimetype:"])) self.flt.reset() ok = False - if not params.has_key("filename:"): - self.em.rclog("extractone: no mime or file name") + if not "filename:" in params: + self.em.rclog("extractone: no file name") return (ok, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index adcb54e5..26c9764e 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -16,6 +16,9 @@ # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ######################################################## ## Recoll multifilter communication module and utilities +# +# All data is binary. This is important for Python3 +# All parameter names are converted to and processed as str/unicode from __future__ import print_function @@ -26,6 +29,21 @@ import shutil import getopt import rclconfig +PY3 = sys.version > '3' + +if PY3: + def makebytes(data): + if isinstance(data, bytes): + return data + else: + return data.encode("UTF-8") +else: + def makebytes(data): + if isinstance(data, unicode): + return data.encode("UTF-8") + else: + return data + my_config = rclconfig.RclConfig() ############################################ @@ -33,7 +51,7 @@ my_config = rclconfig.RclConfig() # communication protocol with the recollindex process. It calls the # object specific of the document type to actually get the data. class RclExecM: - noteof = 0 + noteof = 0 eofnext = 1 eofnow = 2 @@ -46,7 +64,7 @@ class RclExecM: self.myname = os.path.basename(sys.argv[0]) except: self.myname = "???" - self.mimetype = "" + self.mimetype = b"" if os.environ.get("RECOLL_FILTER_MAXMEMBERKB"): self.maxmembersize = \ @@ -60,7 +78,7 @@ class RclExecM: msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) self.debugfile = None if self.debugfile: - self.errfout = open(self.debugfile, "ab") + self.errfout = open(self.debugfile, "a") else: self.errfout = sys.stderr @@ -93,77 +111,84 @@ class RclExecM: # Note: tried replacing this with a multiple replacer according to # http://stackoverflow.com/a/15221068, which was **10 times** slower def htmlescape(self, txt): - # This must stay first (it somehow had managed to skip after - # the next line, with rather interesting results) - txt = txt.replace("&", "&") - - txt = txt.replace("<", "<") - txt = txt.replace(">", ">") - txt = txt.replace('"', """) + # & must stay first (it somehow had managed to skip + # after the next replace, with rather interesting results) + try: + txt = txt.replace(b'&', b'&').replace(b'<', b'<').\ + replace(b'>', b'>').replace(b'"', b'"') + except: + txt = txt.replace("&", "&").replace("<", "<").\ + replace(">", ">").replace("\"", """) return txt # Our worker sometimes knows the mime types of the data it sends def setmimetype(self, mt): - self.mimetype = mt + self.mimetype = makebytes(mt) # Read single parameter from process input: line with param name and size - # followed by data. + # followed by data. The param name is returned as str/unicode, the data + # as bytes def readparam(self): - s = sys.stdin.readline() - if s == '': + if PY3: + inf = sys.stdin.buffer + else: + inf = sys.stdin + s = inf.readline() + if s == b'': sys.exit(0) -# self.rclog(": EOF on input", 1, 0) - s = s.rstrip("\n") + s = s.rstrip(b'\n') - if s == "": - return ("","") + if s == b'': + return ('', b'') l = s.split() if len(l) != 2: - self.rclog("bad line: [" + s + "]", 1, 1) + self.rclog(b'bad line: [' + s + b']', 1, 1) - paramname = l[0].lower() + paramname = l[0].decode('ASCII').lower() paramsize = int(l[1]) if paramsize > 0: - paramdata = sys.stdin.read(paramsize) + paramdata = inf.read(paramsize) if len(paramdata) != paramsize: self.rclog("Bad read: wanted %d, got %d" % - (paramsize, len(paramdata)), 1,1) + (paramsize, len(paramdata)), 1, 1) else: - paramdata = "" + paramdata = b'' #self.rclog("paramname [%s] paramsize %d value [%s]" % # (paramname, paramsize, paramdata)) return (paramname, paramdata) + if PY3: + def senditem(self, nm, len, data): + sys.stdout.buffer.write(makebytes("%s: %d\n" % (nm, len))) + self.breakwrite(sys.stdout.buffer, makebytes(data)) + else: + def senditem(self, nm, len, data): + sys.stdout.write(makebytes("%s: %d\n" % (nm, len))) + self.breakwrite(sys.stdout, makebytes(data)) + # Send answer: document, ipath, possible eof. def answer(self, docdata, ipath, iseof = noteof, iserror = noerror): if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow: - if isinstance(docdata, unicode): - self.rclog("GOT UNICODE for ipath [%s]" % (ipath,)) - docdata = docdata.encode("UTF-8") - - print("Document: %d" % len(docdata)) - self.breakwrite(sys.stdout, docdata) + self.senditem("Document", len(docdata), docdata) if len(ipath): - print("Ipath: %d" % len(ipath)) - sys.stdout.write(ipath) + self.senditem("Ipath", len(ipath), ipath) if len(self.mimetype): - print("Mimetype: %d" % len(self.mimetype)) - sys.stdout.write(self.mimetype) + self.senditem("Mimetype", len(self.mimetype), self.mimetype) # If we're at the end of the contents, say so if iseof == RclExecM.eofnow: - print("Eofnow: 0") + self.senditem("Eofnow", 0, b'') elif iseof == RclExecM.eofnext: - print("Eofnext: 0") + self.senditem("Eofnext", 0, b'') if iserror == RclExecM.subdocerror: - print("Subdocerror: 0") + self.senditem("Subdocerror", 0, b'') elif iserror == RclExecM.fileerror: - print("Fileerror: 0") + self.senditem("Fileerror", 0, b'') # End of message print() @@ -173,7 +198,8 @@ class RclExecM: def processmessage(self, processor, params): # We must have a filename entry (even empty). Else exit - if not params.has_key("filename:"): + if "filename:" not in params: + print("%s" % params, file=sys.stderr) self.rclog("no filename ??", 1, 1) # If we're given a file name, open it. @@ -182,7 +208,7 @@ class RclExecM: if not processor.openfile(params): self.answer("", "", iserror = RclExecM.fileerror) return - except Exception, err: + except Exception as err: self.rclog("processmessage: openfile raised: [%s]" % err) self.answer("", "", iserror = RclExecM.fileerror) return @@ -192,11 +218,11 @@ class RclExecM: eof = True self.mimetype = "" try: - if params.has_key("ipath:") and len(params["ipath:"]): + if "ipath:" in params and len(params["ipath:"]): ok, data, ipath, eof = processor.getipath(params) else: ok, data, ipath, eof = processor.getnext(params) - except Exception, err: + except Exception as err: self.answer("", "", eof, RclExecM.fileerror) return @@ -311,7 +337,7 @@ def main(proto, extract): actAsSingle = False debugDumpData = False - ipath = "" + ipath = b"" args = sys.argv[1:] opts, args = getopt.getopt(args, "hdsi:w:") @@ -321,7 +347,7 @@ def main(proto, extract): elif opt in ['-s']: actAsSingle = True elif opt in ['-i']: - ipath = arg + ipath = makebytes(arg) elif opt in ['-w']: ret = which(arg) if ret: @@ -344,17 +370,17 @@ def main(proto, extract): lst = fileout.split(':') mimetype = lst[len(lst)-1].strip() lst = mimetype.split(';') - return lst[0].strip() + return makebytes(lst[0].strip()) def mimetype_with_xdg(f): cmd = 'xdg-mime query filetype "' + f + '"' - return os.popen(cmd).read().strip() + return makebytes(os.popen(cmd).read().strip()) - def debprint(s): + def debprint(out, s): if not actAsSingle: - print(s) + proto.breakwrite(out, makebytes(s+'\n')) - params = {'filename:': args[0]} + params = {'filename:': makebytes(args[0])} # Some filters (e.g. rclaudio) need/get a MIME type from the indexer mimetype = mimetype_with_xdg(args[0]) params['mimetype:'] = mimetype @@ -363,19 +389,20 @@ def main(proto, extract): print("Open error", file=sys.stderr) sys.exit(1) - if ipath != "" or actAsSingle: + if PY3: + ioout = sys.stdout.buffer + else: + ioout = sys.stdout + if ipath != b"" or actAsSingle: params['ipath:'] = ipath ok, data, ipath, eof = extract.getipath(params) if ok: - debprint("== Found entry for ipath %s (mimetype [%s]):" % \ + debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \ (ipath, proto.mimetype)) - if isinstance(data, unicode): - bdata = data.encode("UTF-8") - else: - bdata = data + bdata = makebytes(data) if debugDumpData or actAsSingle: - proto.breakwrite(sys.stdout, bdata) - print() + proto.breakwrite(ioout, bdata) + ioout.write(b'\n') sys.exit(0) else: print("Got error, eof %d"%eof, file=sys.stderr) @@ -386,15 +413,12 @@ def main(proto, extract): ok, data, ipath, eof = extract.getnext(params) if ok: ecnt = ecnt + 1 - debprint("== Entry %d ipath %s (mimetype [%s]):" % \ - (ecnt, ipath, proto.mimetype)) - if isinstance(data, unicode): - bdata = data.encode("UTF-8") - else: - bdata = data + bdata = makebytes(data) + debprint(ioout, "== Entry %d dlen %d ipath %s (mimetype [%s]):" % \ + (ecnt, len(data), ipath, proto.mimetype)) if debugDumpData: - proto.breakwrite(sys.stdout, bdata) - print() + proto.breakwrite(ioout, bdata) + ioout.write(b'\n') if eof != RclExecM.noteof: sys.exit(0) else: diff --git a/src/filters/rclics b/src/filters/rclics index 6ad3f632..3f28a057 100755 --- a/src/filters/rclics +++ b/src/filters/rclics @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function # Read an ICS file, break it into "documents" which are events, todos, # or journal entries, and interface with recoll execm @@ -13,36 +14,36 @@ import rclexecm import sys # Decide how we'll process the file. -modules = ('internal', 'icalendar', 'vobject') -usemodule = 'internal' +modules = ("internal", "icalendar", "vobject") +usemodule = "internal" forcevobject = 0 -if usemodule != 'internal': +if usemodule != "internal": try: if forcevobject: raise Exception from icalendar import Calendar, Event - usemodule = 'icalendar' + usemodule = "icalendar" except: try: import vobject - usemodule = 'vobject' + usemodule = "vobject" except: - print "RECFILTERROR HELPERNOTFOUND python:icalendar" - print "RECFILTERROR HELPERNOTFOUND python:vobject" + print("RECFILTERROR HELPERNOTFOUND python:icalendar") + print("RECFILTERROR HELPERNOTFOUND python:vobject") sys.exit(1); class IcalExtractor: def __init__(self, em): self.file = "" - self.contents = [] + self.contents = [] self.em = em def extractone(self, index): if index >= len(self.contents): return(False, "", "", True) docdata = self.contents[index] - #self.em.rclog(docdata) + #self.em.rclog(docdata) iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.contents) -1: @@ -55,32 +56,32 @@ class IcalExtractor: self.file = params["filename:"] try: - calstr = open(self.file, 'rb') - except Exception, e: + calstr = open(self.file, "rb") + except Exception as e: self.em.rclog("Openfile: open: %s" % str(e)) return False self.currentindex = -1 - if usemodule == 'internal': + if usemodule == "internal": self.contents = ICalSimpleSplitter().splitcalendar(calstr) - elif usemodule == 'icalendar': + elif usemodule == "icalendar": try: cal = Calendar.from_string(calstr.read()) - except Exception, e: + except Exception as e: self.em.rclog("Openfile: read or parse error: %s" % str(e)) return False self.contents = cal.walk() self.contents = [item.as_string() for item in self.contents - if (item.name == 'VEVENT' or item.name == 'VTODO' - or item.name == 'VJOURNAL')] + if (item.name == "VEVENT" or item.name == "VTODO" + or item.name == "VJOURNAL")] else: try: cal = vobject.readOne(calstr) - except Exception, e: + except Exception as e: self.em.rclog("Openfile: cant parse object: %s" % str(e)) return False - for lstnm in ('vevent_list', 'vtodo_list', 'vjournal_list'): + for lstnm in ("vevent_list", "vtodo_list", "vjournal_list"): lst = getattr(cal, lstnm, []) for ev in lst: self.contents.append(ev.serialize()) @@ -90,7 +91,10 @@ class IcalExtractor: def getipath(self, params): try: - index = int(params["ipath:"]) + if params["ipath:"] == b'': + index = 0 + else: + index = int(params["ipath:"]) except: return (False, "", "", True) return self.extractone(index) @@ -100,7 +104,7 @@ class IcalExtractor: if self.currentindex == -1: # Return "self" doc self.currentindex = 0 - self.em.setmimetype('text/plain') + self.em.setmimetype(b'text/plain') if len(self.contents) == 0: eof = rclexecm.RclExecM.eofnext else: @@ -121,44 +125,44 @@ class ICalSimpleSplitter: # Note that if an 'interesting' element is nested inside another one, # it will not be extracted (stay as text in external event). This is # not an issue and I don't think it can happen with the current list - interesting = ('VTODO', 'VEVENT', 'VJOURNAL') + interesting = (b'VTODO', b'VEVENT', b'VJOURNAL') def splitcalendar(self, fin): - curblkname = '' - curblk = '' + curblkname = b'' + curblk = b'' lo = [] for line in fin: line = line.rstrip() - if line == '': + if line == b'': continue if curblkname: - curblk = curblk + line + "\n" + curblk = curblk + line + b'\n' - l = line.split(":") + l = line.split(b':') if len(l) < 2: continue # If not currently inside a block and we see an # 'interesting' BEGIN, start block - if curblkname == '' and l[0].upper() == "BEGIN" : + if curblkname == b'' and l[0].upper() == b'BEGIN': name = l[1].upper() if name in ICalSimpleSplitter.interesting: curblkname = name - curblk = curblk + line + "\n" + curblk = curblk + line + b'\n' # If currently accumulating block lines, check for end - if curblkname and l[0].upper() == "END" and \ + if curblkname and l[0].upper() == b'END' and \ l[1].upper() == curblkname: lo.append(curblk) - curblkname = '' - curblk = '' + curblkname = b'' + curblk = b'' if curblk: lo.append(curblk) - curblkname = '' - curblk = '' + curblkname = b'' + curblk = b'' return lo diff --git a/src/filters/rclimg.py b/src/filters/rclimg.py index ac21d130..8892a9ae 100755 --- a/src/filters/rclimg.py +++ b/src/filters/rclimg.py @@ -1,11 +1,12 @@ #!/usr/bin/env python -# Python-based Image Tag extractor for Recoll. This is less thorough than the -# Perl-based rclimg script, but useful if you don't want to have to install Perl -# (e.g. on Windows). +# Python-based Image Tag extractor for Recoll. This is less thorough +# than the Perl-based rclimg script, but useful if you don't want to +# have to install Perl (e.g. on Windows). # # Uses pyexiv2. Also tried Pillow, found it useless for tags. # +from __future__ import print_function import sys import os @@ -15,7 +16,7 @@ import re try: import pyexiv2 except: - print "RECFILTERROR HELPERNOTFOUND python:pyexiv2" + print("RECFILTERROR HELPERNOTFOUND python:pyexiv2") sys.exit(1); khexre = re.compile('.*\.0[xX][0-9a-fA-F]+$') @@ -48,7 +49,7 @@ class ImgTagExtractor: def extractone(self, params): #self.em.rclog("extractone %s" % params["filename:"]) ok = False - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no file name") return (ok, docdata, "", rclexecm.RclExecM.eofnow) filename = params["filename:"] @@ -62,11 +63,11 @@ class ImgTagExtractor: # we skip numeric keys and undecoded makernote data if k != 'Exif.Photo.MakerNote' and not khexre.match(k): mdic[k] = str(metadata[k].raw_value) - except Exception, err: + except Exception as err: self.em.rclog("extractone: extract failed: [%s]" % err) return (ok, "", "", rclexecm.RclExecM.eofnow) - docdata = "\n" + docdata = b'\n' ttdata = set() for k in pyexiv2_titles: @@ -77,25 +78,28 @@ class ImgTagExtractor: for v in ttdata: v = v.replace('[', '').replace(']', '').replace("'", "") title += v + " " - docdata += '' + title + '\n' + docdata += rclexecm.makebytes("" + title + "\n") for k in exiv2_dates: if k in mdic: # Recoll wants: %Y-%m-%d %H:%M:%S. # We get 2014:06:27 14:58:47 - dt = mdic[k].replace(':', '-', 2) - docdata += '\n' + dt = mdic[k].replace(":", "-", 2) + docdata += b'\n' break - for k,v in mdic.iteritems(): + for k,v in mdic.items(): if k == 'Xmp.digiKam.TagsList': - docdata += '\n' + docdata += b'\n' - docdata += "\n" - for k,v in mdic.iteritems(): - docdata += k + " : " + self.em.htmlescape(mdic[k]) + "
\n" - docdata += "" + docdata += b'\n' + for k,v in mdic.items(): + docdata += rclexecm.makebytes(k + " : " + \ + self.em.htmlescape(mdic[k]) + "
\n") + docdata += b'' self.em.setmimetype("text/html") diff --git a/src/filters/rclinfo b/src/filters/rclinfo index c6b8a8b1..575121cc 100755 --- a/src/filters/rclinfo +++ b/src/filters/rclinfo @@ -3,6 +3,7 @@ # Read a file in GNU info format and output its nodes as subdocs, # interfacing with recoll execm +from __future__ import print_function import rclexecm import sys @@ -16,24 +17,12 @@ import subprocess # Some info source docs contain charset info like: # @documentencoding ISO-2022-JP # But this seems to be absent from outputs. -htmltemplate = ''' - - - %s - - - -
-   %s
-   
- -''' # RclExecm interface class InfoExtractor: def __init__(self, em): self.file = "" - self.contents = [] + self.contents = [] self.em = em def extractone(self, index): @@ -43,8 +32,13 @@ class InfoExtractor: nodename, docdata = self.contents[index] nodename = self.em.htmlescape(nodename) docdata = self.em.htmlescape(docdata) - - docdata = htmltemplate % (nodename, docdata) + # strange whitespace to avoid changing the module tests (same as old) + docdata = b'\n\n \n ' + nodename + \ + b'\n' + \ + ' \n' + \ + b' \n \n' + \ + b'
\n   ' + \
+                  docdata + b'\n   
\n\n' iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.contents) -1: @@ -60,19 +54,18 @@ class InfoExtractor: self.em.rclog("Openfile: %s is not a file" % self.file) return False - cmd = "info --subnodes -o - -f " + self.file + cmd = b'info --subnodes -o - -f ' + self.file nullstream = open("/dev/null", 'w') try: infostream = subprocess.Popen(cmd, shell=True, bufsize=1, stderr=nullstream, stdout=subprocess.PIPE).stdout - except Exception, e: + except Exception as e: # Consider this as permanently fatal. self.em.rclog("Openfile: exec info: %s" % str(e)) - print "RECFILTERROR HELPERNOTFOUND info" + print("RECFILTERROR HELPERNOTFOUND info") sys.exit(1); - self.currentindex = -1 self.contents = InfoSimpleSplitter().splitinfo(self.file, infostream) @@ -117,9 +110,9 @@ class InfoSimpleSplitter: index = 0 listout = [] node_dict = {} - node = "" + node = b'' infofile = os.path.basename(filename) - nodename = "Unknown" + nodename = b'Unknown' for line in fin: @@ -128,41 +121,41 @@ class InfoSimpleSplitter: # beginning with spaces (it's a bug probably, only seen it once) # Maybe we'd actually be better off directly interpreting the # info files - if gotblankline and line.lstrip(" ").startswith("File: "): + if gotblankline and line.lstrip(b' ').startswith(b'File: '): prevnodename = nodename - line = line.rstrip("\n\r") - pairs = line.split(",") - up = "Top" + line = line.rstrip(b'\n\r') + pairs = line.split(b',') + up = b'Top' nodename = str(index) try: for pair in pairs: - name, value = pair.split(':') - name = name.strip(" ") - value = value.strip(" ") - if name == "Node": + name, value = pair.split(b':') + name = name.strip(b' ') + value = value.strip(b' ') + if name == b'Node': nodename = value - if name == "Up": + if name == b'Up': up = value - if name == "File": + if name == b'File': infofile = value - except: - print >> sys.stderr, "rclinfo: bad line in %s: [%s]\n" % \ - (infofile, line) + except Exception as err: + print("rclinfo: bad line in %s: [%s] %s\n" % \ + (infofile, line, err), file = sys.stderr) nodename = prevnodename node += line continue - if node_dict.has_key(nodename): - print >> sys.stderr, "Info file", filename, \ - "Dup node: ", nodename + if nodename in node_dict: + print("Info file %s Dup node: %s" % (filename, nodename), \ + file=sys.stderr) node_dict[nodename] = up if index != 0: listout.append((prevnodename, node)) - node = "" + node = b'' index += 1 - if line.rstrip("\n\r") == '': + if line.rstrip(b'\n\r') == b'': gotblankline = 1 else: gotblankline = 0 @@ -170,7 +163,7 @@ class InfoSimpleSplitter: node += line # File done, add last dangling node - if node != "": + if node != b'': listout.append((nodename, node)) # Compute node paths (concatenate "Up" values), to be used @@ -178,34 +171,34 @@ class InfoSimpleSplitter: # the info file tree is bad listout1 = [] for nodename, node in listout: - title = "" + title = b'' loop = 0 error = 0 - while nodename != "Top": - title = nodename + " / " + title - if node_dict.has_key(nodename): + while nodename != b'Top': + title = nodename + b' / ' + title + if nodename in node_dict: nodename = node_dict[nodename] else: - print >> sys.stderr, \ + print( "Infofile: node's Up does not exist: file %s, path %s, up [%s]" % \ - (infofile, title, nodename) + (infofile, title, nodename), sys.stderr) error = 1 break loop += 1 if loop > 50: - print >> sys.stderr, "Infofile: bad tree (looping)", \ - infofile + print("Infofile: bad tree (looping) %s" % infofile, \ + file = sys.stderr) error = 1 break if error: continue - if title == "": + if title == b'': title = infofile else: - title = infofile + " / " + title - title = title.rstrip(" / ") + title = infofile + b' / ' + title + title = title.rstrip(b' / ') listout1.append((title, node)) return listout1 diff --git a/src/filters/rclkar b/src/filters/rclkar index 83c0207c..00432b15 100755 --- a/src/filters/rclkar +++ b/src/filters/rclkar @@ -1,6 +1,8 @@ #!/usr/bin/env python # Read a .kar midi karaoke file and translate to recoll indexable format +# This does not work with Python3 yet because python:midi doesn't +from __future__ import print_function import rclexecm import sys @@ -15,9 +17,9 @@ except: pass try: - import midi + from midi import midi except: - print "RECFILTERROR HELPERNOTFOUND python:midi" + print("RECFILTERROR HELPERNOTFOUND python:midi") sys.exit(1); try: @@ -106,12 +108,12 @@ class KarTextExtractor: if data: try: data = data.decode(self.encoding, 'ignore') - except Exception, err: + except Exception as err: self.em.rclog("Decode failed: " + str(err)) return "" try: data = data.encode('utf-8') - except Exception, err: + except Exception as err: self.em.rclog("Encode failed: " + str(err)) return "" @@ -127,7 +129,7 @@ class KarTextExtractor: just one our users could use if there is trouble with guessing encodings''' - rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$' + rexp = b'''\(([^\)]+)\)\.[a-zA-Z]+$''' m = re.search(rexp, fn) if m: return m.group(1) @@ -165,7 +167,7 @@ class KarTextExtractor: if count > 0: confidence = 1.0 encoding = code - except Exception, err: + except Exception as err: self.em.rclog("stopwords-based classifier failed: %s" % err) return (encoding, confidence) @@ -177,7 +179,7 @@ class KarTextExtractor: docdata = "" ok = False - if not params.has_key("filename:"): + if "filename:" not in params: self.em.rclog("extractone: no mime or file name") return (ok, docdata, "", rclexecm.RclExecM.eofnow) filename = params["filename:"] @@ -191,7 +193,7 @@ class KarTextExtractor: self.encoding = "" # Mimetype not used for now - if not params.has_key("mimetype:"): + if "mimetype:" not in params: mimetype = 'audio/x-midi' else: mimetype = params["mimetype:"] @@ -199,8 +201,8 @@ class KarTextExtractor: # Read in and midi-decode the file try: stream = midi.read_midifile(filename) - except Exception, err: - self.em.rclog("extractone: midi extract failed: [%s]" % err) + except Exception as err: + self.em.rclog("extractone: read_midifile failed: [%s]" % err) return (ok, docdata, "", rclexecm.RclExecM.eofnow) title = None diff --git a/src/filters/rcllatinclass.py b/src/filters/rcllatinclass.py index 529aadab..ad5d3efe 100755 --- a/src/filters/rcllatinclass.py +++ b/src/filters/rcllatinclass.py @@ -13,6 +13,8 @@ epsilon with dasia (in unicode but not iso). Can this be replaced by either epsi with acute accent ? """ +from __future__ import print_function + import sys import string import glob @@ -117,7 +119,7 @@ if __name__ == "__main__": lang,code,count = classifier.classify(rawtext) if count > 0: - print "%s %s %d" % (code, lang, count) + print("%s %s %d" % (code, lang, count)) else: - print "UNKNOWN UNKNOWN 0" + print("UNKNOWN UNKNOWN 0") diff --git a/src/filters/rclrar b/src/filters/rclrar index b661f510..0846263c 100755 --- a/src/filters/rclrar +++ b/src/filters/rclrar @@ -43,7 +43,7 @@ class RarExtractor: try: rarinfo = self.rar.getinfo(ipath) isdir = rarinfo.isdir() - except Exception, err: + except Exception as err: self.em.rclog("extractone: getinfo failed: [%s]" % err) return (True, docdata, ipath, false) @@ -56,7 +56,7 @@ class RarExtractor: else: docdata = self.rar.read(ipath) ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False else: @@ -89,7 +89,7 @@ class RarExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/filters/rclrtf.py b/src/filters/rclrtf.py index c7031030..5a9a68ac 100755 --- a/src/filters/rclrtf.py +++ b/src/filters/rclrtf.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function import rclexecm import rclexec1 @@ -10,24 +11,24 @@ import os class RTFProcessData: def __init__(self, em): self.em = em - self.out = "" + self.out = b'' self.gothead = 0 - self.patendhead = re.compile('''''') - self.patcharset = re.compile('''^''') + self.patcharset = re.compile(b'''^' + "\n" - self.out += line + "\n" + self.out += b'' + b'\n' + self.out += line + b'\n' self.gothead = 1 elif not self.patcharset.search(line): - self.out += line + "\n" + self.out += line + b'\n' else: - self.out += line + "\n" + self.out += line + b'\n' def wrapData(self): return self.out @@ -52,7 +53,7 @@ class RTFFilter: if __name__ == '__main__': if not rclexecm.which("unrtf"): - print("RECFILTERROR HELPERNOTFOUND antiword") + print("RECFILTERROR HELPERNOTFOUND unrtf") sys.exit(1) proto = rclexecm.RclExecM() filter = RTFFilter(proto) diff --git a/src/filters/rcltar b/src/filters/rcltar index 3d6508e0..7dba94d3 100755 --- a/src/filters/rcltar +++ b/src/filters/rcltar @@ -33,7 +33,7 @@ class TarExtractor: else: docdata = self.tar.extractfile(ipath).read() ok = True - except Exception, err: + except Exception as err: ok = False iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.namen) -1: @@ -59,7 +59,7 @@ class TarExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/filters/rclwar b/src/filters/rclwar index 8fe46638..30a95e9f 100755 --- a/src/filters/rclwar +++ b/src/filters/rclwar @@ -15,7 +15,7 @@ class WarExtractor: member = self.tar.extractfile(tarinfo) docdata = member.read() ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof) @@ -26,7 +26,7 @@ class WarExtractor: try: self.tar = tarfile.open(params["filename:"]) return True - except Exception, err: + except Exception as err: self.em.rclog(str(err)) return False @@ -34,7 +34,7 @@ class WarExtractor: ipath = params["ipath:"] try: tarinfo = self.tar.getmember(ipath) - except Exception, err: + except Exception as err: self.em.rclog(str(err)) return (False, "", ipath, rclexecm.RclExecM.noteof) return self.extractone(tarinfo) diff --git a/src/filters/rclzip b/src/filters/rclzip index a3afb06e..9d88dc76 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -72,7 +72,7 @@ class ZipExtractor: else: docdata = self.zip.read(ipath) ok = True - except Exception, err: + except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False iseof = rclexecm.RclExecM.noteof @@ -98,7 +98,7 @@ class ZipExtractor: try: self.zip = ZipFile(filename) return True - except Exception, err: + except Exception as err: self.em.rclog("openfile: failed: [%s]" % err) return False @@ -111,7 +111,7 @@ class ZipExtractor: try: ipath = ipath.decode("utf-8") return self.extractone(ipath) - except Exception, err: + except Exception as err: return (ok, data, ipath, eof) def getnext(self, params): diff --git a/src/python/recoll/recoll/rclconfig.py b/src/python/recoll/recoll/rclconfig.py index 28cb4e5a..8fc8aaff 100755 --- a/src/python/recoll/recoll/rclconfig.py +++ b/src/python/recoll/recoll/rclconfig.py @@ -75,7 +75,7 @@ class ConfSimple: def getNames(self, sk = ''): if not sk in self.submaps: return None - return self.submaps[sk].keys() + return list(self.submaps[sk].keys()) class ConfTree(ConfSimple): """A ConfTree adds path-hierarchical interpretation of the section keys, diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf index 4e66ddb2..19f3d8d6 100644 --- a/tests/config/recoll.conf +++ b/tests/config/recoll.conf @@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst daemloglevel = 6 daemlogfilename = /tmp/rclmontrace +systemfilecommand = xdg-mime query filetype + indexStripChars = 1 detectxattronly = 1