factorize boilerplate in simple filters

This commit is contained in:
Jean-Francois Dockes 2018-06-04 15:08:06 +02:00
parent 211ea8010c
commit 0d24cc35da
5 changed files with 123 additions and 151 deletions

View File

@ -0,0 +1,64 @@
#!/usr/bin/env python3
# Copyright (C) 2016 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Base for extractor classes. With some common generic implementations
# for the boilerplate functions, meant for single-document file handlers.
from __future__ import print_function
import os
import sys
import rclexecm
class RclBaseHandler(object):
    """Generic base for single-document file handlers.

    Provides the boilerplate entry points (openfile / getipath / getnext)
    and a common extractone() which takes care of parameter checking,
    error logging and building the result tuple. A concrete handler only
    has to implement html_text(fn), returning the HTML rendering of the
    file named fn.
    """

    def __init__(self, em):
        """Store the execm object used for logging and MIME type setting."""
        self.em = em
        # Set here as well as in openfile(), so that getnext() does not
        # fail with AttributeError on a handler which was never opened.
        self.currentindex = 0

    def html_text(self, fn):
        """Return the HTML text for file fn. Must be overridden.

        Any exception raised here (including this default one) is caught
        and logged by extractone(), which then reports a failure.
        """
        raise NotImplementedError(
            "%s does not implement html_text()" % type(self).__name__)

    def extractone(self, params):
        """Convert the file named by params["filename:"] to HTML.

        Returns a 4-tuple. On success: (True, html, "", eofnext), after
        setting the output MIME type to text/html. If the file name is
        missing or html_text() raises: (False, "", "", eofnow), with the
        problem reported through self.em.rclog().
        """
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #params["mimetype:"]))
        if "filename:" not in params:
            self.em.rclog("extractone: no file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            html = self.html_text(fn)
        except Exception as err:
            # Prefix the message with the actual handler class name, so
            # that the log tells which filter failed.
            self.em.rclog("%s: %s : %s" % (type(self).__name__, fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        self.em.setmimetype('text/html')
        return (True, html, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the document counter for a new file. Always returns True."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Same as extractone(): there is a single document per file."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the file's only document on the first call, EOF afterwards.

        The counter is incremented after the first extraction whether it
        succeeded or not, so later calls return (False, "", "", eofnow)
        until openfile() is called again.
        """
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret

View File

@ -15,51 +15,25 @@
# Free Software Foundation, Inc., # Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
###################################### ######################################
# Base class for simple (one stylesheet) xslt-based handlers
from __future__ import print_function from __future__ import print_function
import sys import sys
import rclexecm
import rclxslt import rclxslt
import gzip import gzip
from rclbasehandler import RclBaseHandler
class XSLTExtractor: class XSLTExtractor(RclBaseHandler):
def __init__(self, em, stylesheet, gzip=False): def __init__(self, em, stylesheet, gzip=False):
self.em = em super(XSLTExtractor, self).__init__(em)
self.currentindex = 0
self.stylesheet = stylesheet self.stylesheet = stylesheet
self.dogz = gzip self.dogz = gzip
def html_text(self, fn):
def extractone(self, params): if self.dogz:
if "filename:" not in params: data = gzip.open(fn, 'rb').read()
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try:
if self.dogz:
data = gzip.open(fn, 'rb').read()
else:
data = open(fn, 'rb').read()
docdata = rclxslt.apply_sheet_data(self.stylesheet, data)
except Exception as err:
self.em.rclog("%s: bad data: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else: else:
ret= self.extractone(params) data = open(fn, 'rb').read()
self.currentindex += 1 return rclxslt.apply_sheet_data(self.stylesheet, data)
return ret

View File

@ -12,6 +12,7 @@ import sys
import os import os
import rclexecm import rclexecm
import re import re
from rclbasehandler import RclBaseHandler
try: try:
import pyexiv2 import pyexiv2
@ -41,31 +42,21 @@ meta_pyexiv2_keys = {
exiv2_dates = ['Exif.Photo.DateTimeOriginal', exiv2_dates = ['Exif.Photo.DateTimeOriginal',
'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized'] 'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized']
class ImgTagExtractor: class ImgTagExtractor(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(ImgTagExtractor, self).__init__(em)
self.currentindex = 0
def extractone(self, params): def html_text(self, filename):
#self.em.rclog("extractone %s" % params["filename:"])
ok = False ok = False
if "filename:" not in params:
self.em.rclog("extractone: no file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
try: metadata = pyexiv2.ImageMetadata(filename)
metadata = pyexiv2.ImageMetadata(filename) metadata.read()
metadata.read() keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys mdic = {}
mdic = {} for k in keys:
for k in keys: # we skip numeric keys and undecoded makernote data
# we skip numeric keys and undecoded makernote data if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
if k != 'Exif.Photo.MakerNote' and not khexre.match(k): mdic[k] = str(metadata[k].raw_value)
mdic[k] = str(metadata[k].raw_value)
except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, "", "", rclexecm.RclExecM.eofnow)
docdata = b'<html><head>\n' docdata = b'<html><head>\n'
@ -101,25 +92,8 @@ class ImgTagExtractor:
self.em.htmlescape(mdic[k]) + "<br />\n") self.em.htmlescape(mdic[k]) + "<br />\n")
docdata += b'</body></html>' docdata += b'</body></html>'
self.em.setmimetype("text/html") return docdata
return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__': if __name__ == '__main__':
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()

View File

@ -22,6 +22,7 @@ import sys
import rclexecm import rclexecm
import rclxslt import rclxslt
from zipfile import ZipFile from zipfile import ZipFile
from rclbasehandler import RclBaseHandler
stylesheet_meta = '''<?xml version="1.0"?> stylesheet_meta = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0" <xsl:stylesheet version="1.0"
@ -139,24 +140,14 @@ stylesheet_content = '''<?xml version="1.0"?>
</xsl:stylesheet> </xsl:stylesheet>
''' '''
class OOExtractor: class OOExtractor(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(OOExtractor, self).__init__(em)
self.currentindex = 0
def extractone(self, params): def html_text(self, fn):
if "filename:" not in params: f = open(fn, 'rb')
self.em.rclog("extractone: no mime or file name") data = f.read()
return (False, "", "", rclexecm.RclExecM.eofnow) f.close()
fn = params["filename:"]
try:
f = open(fn, 'rb')
data = f.read()
f.close()
except Exception as err:
self.em.rclog("open failed: %s" % err)
return (False, "", "", rclexecm.RclExecM.eofnow)
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \ docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
b'content="text/html; charset=UTF-8">\n' b'content="text/html; charset=UTF-8">\n'
@ -172,31 +163,12 @@ class OOExtractor:
docdata += b'</head><body>' docdata += b'</head><body>'
try: res = rclxslt.apply_sheet_data(stylesheet_content, data)
res = rclxslt.apply_sheet_data(stylesheet_content, data) docdata += res
docdata += res docdata += b'</body></html>'
docdata += b'</body></html>'
except Exception as err:
self.em.rclog("bad data in %s: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext) return docdata
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__': if __name__ == '__main__':
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()

View File

@ -1,4 +1,19 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright (C) 2016 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Wrapping a text file. Recoll does it internally in most cases, but # Wrapping a text file. Recoll does it internally in most cases, but
# this is for use by another filter. # this is for use by another filter.
@ -7,46 +22,19 @@ from __future__ import print_function
import rclexecm import rclexecm
import sys import sys
from rclbasehandler import RclBaseHandler
class TxtDump: class TxtDump(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(TxtDump, self).__init__(em)
def extractone(self, params): def html_text(self, fn):
#self.em.rclog("extractone %s %s" % (params["filename:"], \
#params["mimetype:"]))
if not "filename:" in params:
self.em.rclog("extractone: no file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
# No charset, so recoll will have to use its config to guess it # No charset, so recoll will have to use its config to guess it
txt = b'<html><head><title></title></head><body><pre>' html = b'<html><head><title></title></head><body><pre>'
try: f = open(fn, "rb")
f = open(fn, "rb") html += self.em.htmlescape(f.read())
txt += self.em.htmlescape(f.read()) html += b'</pre></body></html>'
except Exception as err: return html
self.em.rclog("TxtDump: %s : %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
txt += b'</pre></body></html>'
return (True, txt, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__': if __name__ == '__main__':
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()