first pass at converting the filters for python 2/3 compat

This commit is contained in:
Jean-Francois Dockes 2015-11-06 16:49:03 +01:00
parent cc68331f3d
commit f344e8fedd
21 changed files with 298 additions and 255 deletions

View File

@ -52,7 +52,7 @@ class PPTDumper(object):
try:
dirstrm = strm.getDirectoryStreamByName(dirname)
except Exception, err:
except Exception as err:
error("getDirectoryStreamByName(%s): %s - %s\n" % (dirname,str(err),self.filepath))
# The previous version was killed by the exception
# here, so the equivalent is to break, but maybe there

View File

@ -15,7 +15,7 @@ try:
import pylzma
from py7zlib import Archive7z
except:
print "RECFILTERROR HELPERNOTFOUND python:pylzma"
print("RECFILTERROR HELPERNOTFOUND python:pylzma")
sys.exit(1);
try:
@ -40,19 +40,17 @@ class SevenZipExtractor:
def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath)
docdata = ""
docdata = b''
try:
docdata = self.sevenzip.getmember(ipath).read()
ok = True
except Exception, err:
except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.sevenzip.getnames()) -1:
iseof = rclexecm.RclExecM.eofnext
if isinstance(ipath, unicode):
ipath = ipath.encode("utf-8")
return (ok, docdata, ipath, iseof)
return (ok, docdata, rclexecm.makebytes(ipath), iseof)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
@ -71,7 +69,7 @@ class SevenZipExtractor:
fp = open(filename, 'rb')
self.sevenzip = Archive7z(fp)
return True
except Exception, err:
except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err)
return False
@ -84,7 +82,7 @@ class SevenZipExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
except Exception, err:
except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):

View File

@ -12,7 +12,7 @@ try:
from mutagen.flac import FLAC
from mutagen.oggvorbis import OggVorbis
except:
print "RECFILTERROR HELPERNOTFOUND python:mutagen"
print("RECFILTERROR HELPERNOTFOUND python:mutagen")
sys.exit(1);
# prototype for the html document we're returning
@ -42,23 +42,24 @@ class AudioTagExtractor:
#self.em.rclog("extractone %s %s" % (params["filename:"], params["mimetype:"]))
docdata = ""
ok = False
if not params.has_key("mimetype:") or not params.has_key("filename:"):
if not "mimetype:" in params or not "filename:" in params:
self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
mimetype = params["mimetype:"]
try:
if mimetype == "audio/mpeg":
if mimetype == b'audio/mpeg':
tags = MP3(filename, ID3=EasyID3)
elif mimetype == "application/ogg":
elif mimetype == b'application/ogg' or \
mimetype == b'audio/x-vorbis+ogg':
tags = OggVorbis(filename)
elif mimetype == "application/x-flac" or \
mimetype == "audio/x-flac" or \
mimetype == "audio/flac":
elif mimetype == b'application/x-flac' or \
mimetype == 'audio/x-flac' or \
mimetype == b'audio/flac':
tags = FLAC(filename)
else:
raise Exception, "Bad mime type %s" % mimetype
except Exception, err:
raise Exception("Bad mime type %s" % mimetype)
except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
@ -66,21 +67,22 @@ class AudioTagExtractor:
artist = ""
title = ""
try:
album = self.em.htmlescape(tags["album"][0].encode("utf-8"))
album = self.em.htmlescape(tags["album"][0])
except:
pass
try:
artist = self.em.htmlescape(tags["artist"][0].encode("utf-8"))
artist = self.em.htmlescape(tags["artist"][0])
except:
pass
try:
title = self.em.htmlescape(tags["title"][0].encode("utf-8"))
title = self.em.htmlescape(tags["title"][0])
except:
pass
self.em.setmimetype("text/html")
alldata = self.em.htmlescape(tags.pprint().encode("utf-8"))
alldata = self.em.htmlescape(tags.pprint())
alldata = alldata.replace("\n", "<br>")
docdata = htmltemplate % (album, artist, title, alldata)
docdata = (htmltemplate % (album, artist, title, alldata))\
.encode('UTF-8')
ok = True
return (ok, docdata, "", rclexecm.RclExecM.eofnext)

View File

@ -2,6 +2,11 @@
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
from __future__ import print_function
# Note: this is not converted to python3, libchm does not have a
# python3 wrapper at this point (2015-11)
# Do we return individual chapters as html pages or concatenate everything?
rclchm_catenate = 0
# Use special html type to allow for mimeconf/mimeview Open magic,
@ -23,13 +28,13 @@ import rclexecm
try:
from chm import chm,chmlib
except:
print "RECFILTERROR HELPERNOTFOUND python:chm"
print("RECFILTERROR HELPERNOTFOUND python:chm")
sys.exit(1);
try:
from HTMLParser import HTMLParser
except:
print "RECFILTERROR HELPERNOTFOUND python:HTMLParser"
print("RECFILTERROR HELPERNOTFOUND python:HTMLParser")
sys.exit(1);
# Small helper routines
@ -37,11 +42,11 @@ def getfile(chmfile, path):
"""Extract internal file text from chm object, given path"""
res, ui = chmfile.ResolveObject(path)
if res != chmlib.CHM_RESOLVE_SUCCESS:
#print "ResolveObject failed", path
#print("ResolveObject failed: %s" % path, file=sys.stderr)
return ""
res, doc = chmfile.RetrieveObject(ui)
if not res:
print "RetrieveObject failed", path
print("RetrieveObject failed: %s" % path, file=sys.stderr)
return ""
return doc

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
# dia (http://live.gnome.org/Dia) file filter for recoll
# stefan.friedel@iwr.uni-heidelberg.de 2012
#
@ -66,7 +68,7 @@ class DiaExtractor:
try:
docdata = self.ExtractDiaText()
ok = True
except Exception, err:
except Exception as err:
ok = False
iseof = rclexecm.RclExecM.eofnext
self.em.setmimetype("text/plain")
@ -76,7 +78,7 @@ class DiaExtractor:
def openfile(self, params):
try:
self.dia = GzipFile(params["filename:"], 'r')
# Dial files are sometimes not compressed. Quite weirdly,
# Dia files are sometimes not compressed. Quite weirdly,
# GzipFile does not complain until we try to read. Have to do it
# here to be able to retry an uncompressed open.
data = self.dia.readline()

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
import rclexecm
import rclexec1
@ -11,32 +12,32 @@ import os
class WordProcessData:
def __init__(self, em):
self.em = em
self.out = ""
self.cont = ""
self.out = b''
self.cont = b''
self.gotdata = False
# Line with continued word (ending in -)
# we strip the - which is not nice for actually hyphenated word.
# What to do ?
self.patcont = re.compile('''[\w][-]$''')
self.patcont = re.compile(b'''[\w][-]$''')
# Pattern for breaking continuation at last word start
self.patws = re.compile('''([\s])([\w]+)(-)$''')
self.patws = re.compile(b'''([\s])([\w]+)(-)$''')
def takeLine(self, line):
if not self.gotdata:
if line == "":
if line == b'':
return
self.out = '<html><head><title></title>' + \
'<meta http-equiv="Content-Type"' + \
'content="text/html;charset=UTF-8">' + \
'</head><body><p>'
self.out = b'<html><head><title></title>' + \
b'<meta http-equiv="Content-Type"' + \
b'content="text/html;charset=UTF-8">' + \
b'</head><body><p>'
self.gotdata = True
if self.cont:
line = self.cont + line
self.cont = ""
if line == "\f":
self.out += "</p><hr><p>"
if line == b'\f':
self.out += '</p><hr><p>'
return
if self.patcont.search(line):
@ -47,16 +48,16 @@ class WordProcessData:
line = line[0:match.start(1)]
else:
self.cont = line
line = ""
line = b''
if line:
self.out += self.em.htmlescape(line) + "<br>"
self.out += self.em.htmlescape(line) + b'<br>'
else:
self.out += "<br>"
self.out += b'<br>'
def wrapData(self):
if self.gotdata:
self.out += "</p></body></html>"
self.out += b'</p></body></html>'
self.em.setmimetype("text/html")
return self.out
@ -65,7 +66,7 @@ class WordProcessData:
# output HTML
class WordPassData:
def __init__(self, em):
self.out = ""
self.out = b''
self.em = em
def takeLine(self, line):
@ -96,8 +97,8 @@ class WordFilter:
return False
def mimetype(self, fn):
rtfprolog ="{\\rtf1"
docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
rtfprolog = b'{\\rtf1'
docprolog = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
try:
f = open(fn, "rb")
except:
@ -132,7 +133,7 @@ class WordFilter:
mt = self.mimetype(fn)
self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
if mt == "text/plain":
return ([python, os.path.join(self.execdir, "rcltext.py")],
return (["python", os.path.join(self.execdir, "rcltext.py")],
WordPassData(self.em))
elif mt == "text/rtf":
cmd = ["python", os.path.join(self.execdir, "rclrtf.py"),

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python
"""Extract Html content from an EPUB file (.chm)"""
from __future__ import print_function
rclepub_html_mtype = "text/html"
@ -12,7 +13,7 @@ import rclexecm
try:
import epub
except:
print "RECFILTERROR HELPERNOTFOUND python:epub"
print("RECFILTERROR HELPERNOTFOUND python:epub")
sys.exit(1);
class rclEPUB:
@ -63,11 +64,11 @@ class rclEPUB:
if item is None:
raise Exception("Item not found for id %s" % (id,))
doc = self.book.read_item(item)
doc = re.sub('''</[hH][eE][aA][dD]>''',
'''<meta name="rclaptg" content="epub"></head>''', doc)
doc = re.sub(b'''</[hH][eE][aA][dD]>''',
b'''<meta name="rclaptg" content="epub"></head>''', doc)
self.em.setmimetype(rclepub_html_mtype)
return (True, doc, id, iseof)
except Exception, err:
except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
return (False, "", id, iseof)
@ -76,11 +77,11 @@ class rclEPUB:
self.currentindex = -1
self.contents = []
try:
self.book = epub.open(params["filename:"])
except Exception, err:
self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
except Exception as err:
self.em.rclog("openfile: epub.open failed: [%s]" % err)
return False
for id, item in self.book.opf.manifest.iteritems():
for id, item in self.book.opf.manifest.items():
if item.media_type == 'application/xhtml+xml':
self.contents.append(id)
return True

View File

@ -26,6 +26,8 @@
# this would be too slow. So this helps implementing a permanent script
# to repeatedly execute single commands.
from __future__ import print_function
import subprocess
import rclexecm
@ -74,8 +76,8 @@ class Executor:
# params["mimetype:"]))
self.flt.reset()
ok = False
if not params.has_key("filename:"):
self.em.rclog("extractone: no mime or file name")
if not "filename:" in params:
self.em.rclog("extractone: no file name")
return (ok, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]

View File

@ -16,6 +16,9 @@
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
########################################################
## Recoll multifilter communication module and utilities
#
# All data is binary. This is important for Python3
# All parameter names are converted to and processed as str/unicode
from __future__ import print_function
@ -26,6 +29,21 @@ import shutil
import getopt
import rclconfig
# True when running under Python 3 or later. The numeric version tuple is
# used because comparing the sys.version string lexically would misorder
# a two-digit major version.
PY3 = sys.version_info[0] >= 3

if PY3:
    def makebytes(data):
        """Return data as bytes, encoding text as UTF-8.

        A bytes value is returned unchanged; any other value is expected
        to be a str and is encoded.
        """
        if isinstance(data, bytes):
            return data
        return data.encode("UTF-8")
else:
    def makebytes(data):
        """Return data as a byte string, encoding unicode text as UTF-8.

        Any value which is not a unicode object is returned unchanged.
        """
        if isinstance(data, unicode):
            return data.encode("UTF-8")
        return data
my_config = rclconfig.RclConfig()
############################################
@ -46,7 +64,7 @@ class RclExecM:
self.myname = os.path.basename(sys.argv[0])
except:
self.myname = "???"
self.mimetype = ""
self.mimetype = b""
if os.environ.get("RECOLL_FILTER_MAXMEMBERKB"):
self.maxmembersize = \
@ -60,7 +78,7 @@ class RclExecM:
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
self.debugfile = None
if self.debugfile:
self.errfout = open(self.debugfile, "ab")
self.errfout = open(self.debugfile, "a")
else:
self.errfout = sys.stderr
@ -93,77 +111,84 @@ class RclExecM:
# Note: tried replacing this with a multiple replacer according to
# http://stackoverflow.com/a/15221068, which was **10 times** slower
def htmlescape(self, txt):
    """Escape the HTML special characters &, <, > and " in txt.

    txt may be bytes or text; the result has the same type as the input.
    """
    # '&' must be replaced first: doing it later would re-escape the
    # ampersands introduced by the other entity substitutions.
    try:
        return txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
            replace(b'>', b'&gt;').replace(b'"', b'&quot;')
    except TypeError:
        # Python 3 text rejects bytes arguments: redo the substitutions
        # with text literals. Only TypeError is caught so that unrelated
        # errors are not silently retried.
        return txt.replace("&", "&amp;").replace("<", "&lt;").\
            replace(">", "&gt;").replace("\"", "&quot;")
# Our worker sometimes knows the mime types of the data it sends
def setmimetype(self, mt):
self.mimetype = mt
self.mimetype = makebytes(mt)
# Read single parameter from process input: line with param name and size
# followed by data.
# followed by data. The param name is returned as str/unicode, the data
# as bytes
def readparam(self):
s = sys.stdin.readline()
if s == '':
if PY3:
inf = sys.stdin.buffer
else:
inf = sys.stdin
s = inf.readline()
if s == b'':
sys.exit(0)
# self.rclog(": EOF on input", 1, 0)
s = s.rstrip("\n")
s = s.rstrip(b'\n')
if s == "":
return ("","")
if s == b'':
return ('', b'')
l = s.split()
if len(l) != 2:
self.rclog("bad line: [" + s + "]", 1, 1)
self.rclog(b'bad line: [' + s + b']', 1, 1)
paramname = l[0].lower()
paramname = l[0].decode('ASCII').lower()
paramsize = int(l[1])
if paramsize > 0:
paramdata = sys.stdin.read(paramsize)
paramdata = inf.read(paramsize)
if len(paramdata) != paramsize:
self.rclog("Bad read: wanted %d, got %d" %
(paramsize, len(paramdata)), 1, 1)
else:
paramdata = ""
paramdata = b''
#self.rclog("paramname [%s] paramsize %d value [%s]" %
# (paramname, paramsize, paramdata))
return (paramname, paramdata)
if PY3:
def senditem(self, nm, len, data):
sys.stdout.buffer.write(makebytes("%s: %d\n" % (nm, len)))
self.breakwrite(sys.stdout.buffer, makebytes(data))
else:
def senditem(self, nm, len, data):
sys.stdout.write(makebytes("%s: %d\n" % (nm, len)))
self.breakwrite(sys.stdout, makebytes(data))
# Send answer: document, ipath, possible eof.
def answer(self, docdata, ipath, iseof = noteof, iserror = noerror):
if iserror != RclExecM.fileerror and iseof != RclExecM.eofnow:
if isinstance(docdata, unicode):
self.rclog("GOT UNICODE for ipath [%s]" % (ipath,))
docdata = docdata.encode("UTF-8")
print("Document: %d" % len(docdata))
self.breakwrite(sys.stdout, docdata)
self.senditem("Document", len(docdata), docdata)
if len(ipath):
print("Ipath: %d" % len(ipath))
sys.stdout.write(ipath)
self.senditem("Ipath", len(ipath), ipath)
if len(self.mimetype):
print("Mimetype: %d" % len(self.mimetype))
sys.stdout.write(self.mimetype)
self.senditem("Mimetype", len(self.mimetype), self.mimetype)
# If we're at the end of the contents, say so
if iseof == RclExecM.eofnow:
print("Eofnow: 0")
self.senditem("Eofnow", 0, b'')
elif iseof == RclExecM.eofnext:
print("Eofnext: 0")
self.senditem("Eofnext", 0, b'')
if iserror == RclExecM.subdocerror:
print("Subdocerror: 0")
self.senditem("Subdocerror", 0, b'')
elif iserror == RclExecM.fileerror:
print("Fileerror: 0")
self.senditem("Fileerror", 0, b'')
# End of message
print()
@ -173,7 +198,8 @@ class RclExecM:
def processmessage(self, processor, params):
# We must have a filename entry (even empty). Else exit
if not params.has_key("filename:"):
if "filename:" not in params:
print("%s" % params, file=sys.stderr)
self.rclog("no filename ??", 1, 1)
# If we're given a file name, open it.
@ -182,7 +208,7 @@ class RclExecM:
if not processor.openfile(params):
self.answer("", "", iserror = RclExecM.fileerror)
return
except Exception, err:
except Exception as err:
self.rclog("processmessage: openfile raised: [%s]" % err)
self.answer("", "", iserror = RclExecM.fileerror)
return
@ -192,11 +218,11 @@ class RclExecM:
eof = True
self.mimetype = ""
try:
if params.has_key("ipath:") and len(params["ipath:"]):
if "ipath:" in params and len(params["ipath:"]):
ok, data, ipath, eof = processor.getipath(params)
else:
ok, data, ipath, eof = processor.getnext(params)
except Exception, err:
except Exception as err:
self.answer("", "", eof, RclExecM.fileerror)
return
@ -311,7 +337,7 @@ def main(proto, extract):
actAsSingle = False
debugDumpData = False
ipath = ""
ipath = b""
args = sys.argv[1:]
opts, args = getopt.getopt(args, "hdsi:w:")
@ -321,7 +347,7 @@ def main(proto, extract):
elif opt in ['-s']:
actAsSingle = True
elif opt in ['-i']:
ipath = arg
ipath = makebytes(arg)
elif opt in ['-w']:
ret = which(arg)
if ret:
@ -344,17 +370,17 @@ def main(proto, extract):
lst = fileout.split(':')
mimetype = lst[len(lst)-1].strip()
lst = mimetype.split(';')
return lst[0].strip()
return makebytes(lst[0].strip())
def mimetype_with_xdg(f):
cmd = 'xdg-mime query filetype "' + f + '"'
return os.popen(cmd).read().strip()
return makebytes(os.popen(cmd).read().strip())
def debprint(s):
def debprint(out, s):
if not actAsSingle:
print(s)
proto.breakwrite(out, makebytes(s+'\n'))
params = {'filename:': args[0]}
params = {'filename:': makebytes(args[0])}
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
mimetype = mimetype_with_xdg(args[0])
params['mimetype:'] = mimetype
@ -363,19 +389,20 @@ def main(proto, extract):
print("Open error", file=sys.stderr)
sys.exit(1)
if ipath != "" or actAsSingle:
if PY3:
ioout = sys.stdout.buffer
else:
ioout = sys.stdout
if ipath != b"" or actAsSingle:
params['ipath:'] = ipath
ok, data, ipath, eof = extract.getipath(params)
if ok:
debprint("== Found entry for ipath %s (mimetype [%s]):" % \
debprint(ioout, "== Found entry for ipath %s (mimetype [%s]):" % \
(ipath, proto.mimetype))
if isinstance(data, unicode):
bdata = data.encode("UTF-8")
else:
bdata = data
bdata = makebytes(data)
if debugDumpData or actAsSingle:
proto.breakwrite(sys.stdout, bdata)
print()
proto.breakwrite(ioout, bdata)
ioout.write(b'\n')
sys.exit(0)
else:
print("Got error, eof %d"%eof, file=sys.stderr)
@ -386,15 +413,12 @@ def main(proto, extract):
ok, data, ipath, eof = extract.getnext(params)
if ok:
ecnt = ecnt + 1
debprint("== Entry %d ipath %s (mimetype [%s]):" % \
(ecnt, ipath, proto.mimetype))
if isinstance(data, unicode):
bdata = data.encode("UTF-8")
else:
bdata = data
bdata = makebytes(data)
debprint(ioout, "== Entry %d dlen %d ipath %s (mimetype [%s]):" % \
(ecnt, len(data), ipath, proto.mimetype))
if debugDumpData:
proto.breakwrite(sys.stdout, bdata)
print()
proto.breakwrite(ioout, bdata)
ioout.write(b'\n')
if eof != RclExecM.noteof:
sys.exit(0)
else:

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
# Read an ICS file, break it into "documents" which are events, todos,
# or journal entries, and interface with recoll execm
@ -13,22 +14,22 @@ import rclexecm
import sys
# Decide how we'll process the file.
modules = ('internal', 'icalendar', 'vobject')
usemodule = 'internal'
modules = ("internal", "icalendar", "vobject")
usemodule = "internal"
forcevobject = 0
if usemodule != 'internal':
if usemodule != "internal":
try:
if forcevobject:
raise Exception
from icalendar import Calendar, Event
usemodule = 'icalendar'
usemodule = "icalendar"
except:
try:
import vobject
usemodule = 'vobject'
usemodule = "vobject"
except:
print "RECFILTERROR HELPERNOTFOUND python:icalendar"
print "RECFILTERROR HELPERNOTFOUND python:vobject"
print("RECFILTERROR HELPERNOTFOUND python:icalendar")
print("RECFILTERROR HELPERNOTFOUND python:vobject")
sys.exit(1);
@ -55,32 +56,32 @@ class IcalExtractor:
self.file = params["filename:"]
try:
calstr = open(self.file, 'rb')
except Exception, e:
calstr = open(self.file, "rb")
except Exception as e:
self.em.rclog("Openfile: open: %s" % str(e))
return False
self.currentindex = -1
if usemodule == 'internal':
if usemodule == "internal":
self.contents = ICalSimpleSplitter().splitcalendar(calstr)
elif usemodule == 'icalendar':
elif usemodule == "icalendar":
try:
cal = Calendar.from_string(calstr.read())
except Exception, e:
except Exception as e:
self.em.rclog("Openfile: read or parse error: %s" % str(e))
return False
self.contents = cal.walk()
self.contents = [item.as_string() for item in self.contents
if (item.name == 'VEVENT' or item.name == 'VTODO'
or item.name == 'VJOURNAL')]
if (item.name == "VEVENT" or item.name == "VTODO"
or item.name == "VJOURNAL")]
else:
try:
cal = vobject.readOne(calstr)
except Exception, e:
except Exception as e:
self.em.rclog("Openfile: cant parse object: %s" % str(e))
return False
for lstnm in ('vevent_list', 'vtodo_list', 'vjournal_list'):
for lstnm in ("vevent_list", "vtodo_list", "vjournal_list"):
lst = getattr(cal, lstnm, [])
for ev in lst:
self.contents.append(ev.serialize())
@ -90,6 +91,9 @@ class IcalExtractor:
def getipath(self, params):
try:
if params["ipath:"] == b'':
index = 0
else:
index = int(params["ipath:"])
except:
return (False, "", "", True)
@ -100,7 +104,7 @@ class IcalExtractor:
if self.currentindex == -1:
# Return "self" doc
self.currentindex = 0
self.em.setmimetype('text/plain')
self.em.setmimetype(b'text/plain')
if len(self.contents) == 0:
eof = rclexecm.RclExecM.eofnext
else:
@ -121,44 +125,44 @@ class ICalSimpleSplitter:
# Note that if an 'interesting' element is nested inside another one,
# it will not be extracted (stay as text in external event). This is
# not an issue and I don't think it can happen with the current list
interesting = ('VTODO', 'VEVENT', 'VJOURNAL')
interesting = (b'VTODO', b'VEVENT', b'VJOURNAL')
def splitcalendar(self, fin):
    """Split an iCalendar byte stream into its interesting blocks.

    fin is iterated line by line and must yield bytes. Each returned
    element is the text of one block whose BEGIN name is listed in
    self.interesting, from its BEGIN line to the matching END line, with
    blank lines dropped and every line terminated by a single newline.
    A block still open at end of input is returned as well.
    """
    blocks = []
    curname = b''
    # Lines of the block being accumulated; joined once per block so the
    # cost stays linear in the block size.
    curlines = []

    for line in fin:
        line = line.rstrip()
        if not line:
            continue
        if curname:
            curlines.append(line)

        fields = line.split(b':')
        if len(fields) < 2:
            continue
        keyword = fields[0].upper()
        value = fields[1].upper()

        if not curname:
            # Not inside a block: only an interesting BEGIN starts one.
            # A BEGIN seen while a block is open is kept as plain text.
            if keyword == b'BEGIN' and value in self.interesting:
                curname = value
                curlines.append(line)
        elif keyword == b'END' and value == curname:
            blocks.append(b'\n'.join(curlines) + b'\n')
            curname = b''
            curlines = []

    # Unterminated trailing block.
    if curlines:
        blocks.append(b'\n'.join(curlines) + b'\n')

    return blocks

View File

@ -1,11 +1,12 @@
#!/usr/bin/env python
# Python-based Image Tag extractor for Recoll. This is less thorough than the
# Perl-based rclimg script, but useful if you don't want to have to install Perl
# (e.g. on Windows).
# Python-based Image Tag extractor for Recoll. This is less thorough
# than the Perl-based rclimg script, but useful if you don't want to
# have to install Perl (e.g. on Windows).
#
# Uses pyexiv2. Also tried Pillow, found it useless for tags.
#
from __future__ import print_function
import sys
import os
@ -15,7 +16,7 @@ import re
try:
import pyexiv2
except:
print "RECFILTERROR HELPERNOTFOUND python:pyexiv2"
print("RECFILTERROR HELPERNOTFOUND python:pyexiv2")
sys.exit(1);
khexre = re.compile('.*\.0[xX][0-9a-fA-F]+$')
@ -48,7 +49,7 @@ class ImgTagExtractor:
def extractone(self, params):
#self.em.rclog("extractone %s" % params["filename:"])
ok = False
if not params.has_key("filename:"):
if "filename:" not in params:
self.em.rclog("extractone: no file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
@ -62,11 +63,11 @@ class ImgTagExtractor:
# we skip numeric keys and undecoded makernote data
if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
mdic[k] = str(metadata[k].raw_value)
except Exception, err:
except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, "", "", rclexecm.RclExecM.eofnow)
docdata = "<html><head>\n"
docdata = b'<html><head>\n'
ttdata = set()
for k in pyexiv2_titles:
@ -77,25 +78,28 @@ class ImgTagExtractor:
for v in ttdata:
v = v.replace('[', '').replace(']', '').replace("'", "")
title += v + " "
docdata += '<title>' + title + '</title>\n'
docdata += rclexecm.makebytes("<title>" + title + "</title>\n")
for k in exiv2_dates:
if k in mdic:
# Recoll wants: %Y-%m-%d %H:%M:%S.
# We get 2014:06:27 14:58:47
dt = mdic[k].replace(':', '-', 2)
docdata += '<meta name="date" content="' + dt + '">\n'
dt = mdic[k].replace(":", "-", 2)
docdata += b'<meta name="date" content="' + \
rclexecm.makebytes(dt) + b'">\n'
break
for k,v in mdic.iteritems():
for k,v in mdic.items():
if k == 'Xmp.digiKam.TagsList':
docdata += '<meta name="keywords" content="' + \
self.em.htmlescape(mdic[k]) + '">\n'
docdata += b'<meta name="keywords" content="' + \
rclexecm.makebytes(self.em.htmlescape(mdic[k])) + \
b'">\n'
docdata += "</head><body>\n"
for k,v in mdic.iteritems():
docdata += k + " : " + self.em.htmlescape(mdic[k]) + "<br />\n"
docdata += "</body></html>"
docdata += b'</head><body>\n'
for k,v in mdic.items():
docdata += rclexecm.makebytes(k + " : " + \
self.em.htmlescape(mdic[k]) + "<br />\n")
docdata += b'</body></html>'
self.em.setmimetype("text/html")

View File

@ -3,6 +3,7 @@
# Read a file in GNU info format and output its nodes as subdocs,
# interfacing with recoll execm
from __future__ import print_function
import rclexecm
import sys
@ -16,18 +17,6 @@ import subprocess
# Some info source docs contain charset info like:
# @documentencoding ISO-2022-JP
# But this seems to be absent from outputs.
htmltemplate = '''
<html>
<head>
<title>%s</title>
<meta name="rclaptg" content="gnuinfo">
</head>
<body>
<pre style="white-space: pre-wrap">
%s
</pre></body>
</html>
'''
# RclExecm interface
class InfoExtractor:
@ -43,8 +32,13 @@ class InfoExtractor:
nodename, docdata = self.contents[index]
nodename = self.em.htmlescape(nodename)
docdata = self.em.htmlescape(docdata)
docdata = htmltemplate % (nodename, docdata)
# strange whitespace to avoid changing the module tests (same as old)
docdata = b'\n<html>\n <head>\n <title>' + nodename + \
b'</title>\n' + \
' <meta name="rclaptg" content="gnuinfo">\n' + \
b' </head>\n <body>\n' + \
b' <pre style="white-space: pre-wrap">\n ' + \
docdata + b'\n </pre></body>\n</html>\n'
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.contents) -1:
@ -60,19 +54,18 @@ class InfoExtractor:
self.em.rclog("Openfile: %s is not a file" % self.file)
return False
cmd = "info --subnodes -o - -f " + self.file
cmd = b'info --subnodes -o - -f ' + self.file
nullstream = open("/dev/null", 'w')
try:
infostream = subprocess.Popen(cmd, shell=True, bufsize=1,
stderr=nullstream,
stdout=subprocess.PIPE).stdout
except Exception, e:
except Exception as e:
# Consider this as permanently fatal.
self.em.rclog("Openfile: exec info: %s" % str(e))
print "RECFILTERROR HELPERNOTFOUND info"
print("RECFILTERROR HELPERNOTFOUND info")
sys.exit(1);
self.currentindex = -1
self.contents = InfoSimpleSplitter().splitinfo(self.file, infostream)
@ -117,9 +110,9 @@ class InfoSimpleSplitter:
index = 0
listout = []
node_dict = {}
node = ""
node = b''
infofile = os.path.basename(filename)
nodename = "Unknown"
nodename = b'Unknown'
for line in fin:
@ -128,41 +121,41 @@ class InfoSimpleSplitter:
# beginning with spaces (it's a bug probably, only seen it once)
# Maybe we'd actually be better off directly interpreting the
# info files
if gotblankline and line.lstrip(" ").startswith("File: "):
if gotblankline and line.lstrip(b' ').startswith(b'File: '):
prevnodename = nodename
line = line.rstrip("\n\r")
pairs = line.split(",")
up = "Top"
line = line.rstrip(b'\n\r')
pairs = line.split(b',')
up = b'Top'
nodename = str(index)
try:
for pair in pairs:
name, value = pair.split(':')
name = name.strip(" ")
value = value.strip(" ")
if name == "Node":
name, value = pair.split(b':')
name = name.strip(b' ')
value = value.strip(b' ')
if name == b'Node':
nodename = value
if name == "Up":
if name == b'Up':
up = value
if name == "File":
if name == b'File':
infofile = value
except:
print >> sys.stderr, "rclinfo: bad line in %s: [%s]\n" % \
(infofile, line)
except Exception as err:
print("rclinfo: bad line in %s: [%s] %s\n" % \
(infofile, line, err), file = sys.stderr)
nodename = prevnodename
node += line
continue
if node_dict.has_key(nodename):
print >> sys.stderr, "Info file", filename, \
"Dup node: ", nodename
if nodename in node_dict:
print("Info file %s Dup node: %s" % (filename, nodename), \
file=sys.stderr)
node_dict[nodename] = up
if index != 0:
listout.append((prevnodename, node))
node = ""
node = b''
index += 1
if line.rstrip("\n\r") == '':
if line.rstrip(b'\n\r') == b'':
gotblankline = 1
else:
gotblankline = 0
@ -170,7 +163,7 @@ class InfoSimpleSplitter:
node += line
# File done, add last dangling node
if node != "":
if node != b'':
listout.append((nodename, node))
# Compute node paths (concatenate "Up" values), to be used
@ -178,34 +171,34 @@ class InfoSimpleSplitter:
# the info file tree is bad
listout1 = []
for nodename, node in listout:
title = ""
title = b''
loop = 0
error = 0
while nodename != "Top":
title = nodename + " / " + title
if node_dict.has_key(nodename):
while nodename != b'Top':
title = nodename + b' / ' + title
if nodename in node_dict:
nodename = node_dict[nodename]
else:
print >> sys.stderr, \
print(
"Infofile: node's Up does not exist: file %s, path %s, up [%s]" % \
(infofile, title, nodename)
(infofile, title, nodename), sys.stderr)
error = 1
break
loop += 1
if loop > 50:
print >> sys.stderr, "Infofile: bad tree (looping)", \
infofile
print("Infofile: bad tree (looping) %s" % infofile, \
file = sys.stderr)
error = 1
break
if error:
continue
if title == "":
if title == b'':
title = infofile
else:
title = infofile + " / " + title
title = title.rstrip(" / ")
title = infofile + b' / ' + title
title = title.rstrip(b' / ')
listout1.append((title, node))
return listout1

View File

@ -1,6 +1,8 @@
#!/usr/bin/env python
# Read a .kar midi karaoke file and translate to recoll indexable format
# This does not work with Python3 yet because python:midi does not support it
from __future__ import print_function
import rclexecm
import sys
@ -15,9 +17,9 @@ except:
pass
try:
import midi
from midi import midi
except:
print "RECFILTERROR HELPERNOTFOUND python:midi"
print("RECFILTERROR HELPERNOTFOUND python:midi")
sys.exit(1);
try:
@ -106,12 +108,12 @@ class KarTextExtractor:
if data:
try:
data = data.decode(self.encoding, 'ignore')
except Exception, err:
except Exception as err:
self.em.rclog("Decode failed: " + str(err))
return ""
try:
data = data.encode('utf-8')
except Exception, err:
except Exception as err:
self.em.rclog("Encode failed: " + str(err))
return ""
@ -127,7 +129,7 @@ class KarTextExtractor:
just one our users could use if there is trouble with guessing
encodings'''
rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$'
rexp = b'''\(([^\)]+)\)\.[a-zA-Z]+$'''
m = re.search(rexp, fn)
if m:
return m.group(1)
@ -165,7 +167,7 @@ class KarTextExtractor:
if count > 0:
confidence = 1.0
encoding = code
except Exception, err:
except Exception as err:
self.em.rclog("stopwords-based classifier failed: %s" % err)
return (encoding, confidence)
@ -177,7 +179,7 @@ class KarTextExtractor:
docdata = ""
ok = False
if not params.has_key("filename:"):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
@ -191,7 +193,7 @@ class KarTextExtractor:
self.encoding = ""
# Mimetype not used for now
if not params.has_key("mimetype:"):
if "mimetype:" not in params:
mimetype = 'audio/x-midi'
else:
mimetype = params["mimetype:"]
@ -199,8 +201,8 @@ class KarTextExtractor:
# Read in and midi-decode the file
try:
stream = midi.read_midifile(filename)
except Exception, err:
self.em.rclog("extractone: midi extract failed: [%s]" % err)
except Exception as err:
self.em.rclog("extractone: read_midifile failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
title = None

View File

@ -13,6 +13,8 @@ epsilon with dasia (in unicode but not iso). Can this be replaced by either epsi
with acute accent ?
"""
from __future__ import print_function
import sys
import string
import glob
@ -117,7 +119,7 @@ if __name__ == "__main__":
lang,code,count = classifier.classify(rawtext)
if count > 0:
print "%s %s %d" % (code, lang, count)
print("%s %s %d" % (code, lang, count))
else:
print "UNKNOWN UNKNOWN 0"
print("UNKNOWN UNKNOWN 0")

View File

@ -43,7 +43,7 @@ class RarExtractor:
try:
rarinfo = self.rar.getinfo(ipath)
isdir = rarinfo.isdir()
except Exception, err:
except Exception as err:
self.em.rclog("extractone: getinfo failed: [%s]" % err)
return (True, docdata, ipath, false)
@ -56,7 +56,7 @@ class RarExtractor:
else:
docdata = self.rar.read(ipath)
ok = True
except Exception, err:
except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
else:
@ -89,7 +89,7 @@ class RarExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
except Exception, err:
except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
import rclexecm
import rclexec1
@ -10,24 +11,24 @@ import os
class RTFProcessData:
def __init__(self, em):
self.em = em
self.out = ""
self.out = b''
self.gothead = 0
self.patendhead = re.compile('''</head>''')
self.patcharset = re.compile('''^<meta http-equiv=''')
self.patendhead = re.compile(b'''</head>''')
self.patcharset = re.compile(b'''^<meta http-equiv=''')
# Some versions of unrtf put out a garbled charset line.
# Apart from this, we pass the data untouched.
def takeLine(self, line):
if not self.gothead:
if self.patendhead.search(line):
self.out += '<meta http-equiv="Content-Type" ' + \
'content="text/html;charset=UTF-8">' + "\n"
self.out += line + "\n"
self.out += b'<meta http-equiv="Content-Type" ' + \
b'content="text/html;charset=UTF-8">' + b'\n'
self.out += line + b'\n'
self.gothead = 1
elif not self.patcharset.search(line):
self.out += line + "\n"
self.out += line + b'\n'
else:
self.out += line + "\n"
self.out += line + b'\n'
def wrapData(self):
return self.out
@ -52,7 +53,7 @@ class RTFFilter:
if __name__ == '__main__':
if not rclexecm.which("unrtf"):
print("RECFILTERROR HELPERNOTFOUND antiword")
print("RECFILTERROR HELPERNOTFOUND unrtf")
sys.exit(1)
proto = rclexecm.RclExecM()
filter = RTFFilter(proto)

View File

@ -33,7 +33,7 @@ class TarExtractor:
else:
docdata = self.tar.extractfile(ipath).read()
ok = True
except Exception, err:
except Exception as err:
ok = False
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.namen) -1:
@ -59,7 +59,7 @@ class TarExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
except Exception, err:
except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):

View File

@ -15,7 +15,7 @@ class WarExtractor:
member = self.tar.extractfile(tarinfo)
docdata = member.read()
ok = True
except Exception, err:
except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof)
@ -26,7 +26,7 @@ class WarExtractor:
try:
self.tar = tarfile.open(params["filename:"])
return True
except Exception, err:
except Exception as err:
self.em.rclog(str(err))
return False
@ -34,7 +34,7 @@ class WarExtractor:
ipath = params["ipath:"]
try:
tarinfo = self.tar.getmember(ipath)
except Exception, err:
except Exception as err:
self.em.rclog(str(err))
return (False, "", ipath, rclexecm.RclExecM.noteof)
return self.extractone(tarinfo)

View File

@ -72,7 +72,7 @@ class ZipExtractor:
else:
docdata = self.zip.read(ipath)
ok = True
except Exception, err:
except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
iseof = rclexecm.RclExecM.noteof
@ -98,7 +98,7 @@ class ZipExtractor:
try:
self.zip = ZipFile(filename)
return True
except Exception, err:
except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err)
return False
@ -111,7 +111,7 @@ class ZipExtractor:
try:
ipath = ipath.decode("utf-8")
return self.extractone(ipath)
except Exception, err:
except Exception as err:
return (ok, data, ipath, eof)
def getnext(self, params):

View File

@ -75,7 +75,7 @@ class ConfSimple:
def getNames(self, sk = ''):
if not sk in self.submaps:
return None
return self.submaps[sk].keys()
return list(self.submaps[sk].keys())
class ConfTree(ConfSimple):
"""A ConfTree adds path-hierarchical interpretation of the section keys,

View File

@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst
daemloglevel = 6
daemlogfilename = /tmp/rclmontrace
systemfilecommand = xdg-mime query filetype
indexStripChars = 1
detectxattronly = 1