factorize boilerplate in simple filters

This commit is contained in:
Jean-Francois Dockes 2018-06-04 15:08:06 +02:00
parent 211ea8010c
commit 0d24cc35da
5 changed files with 123 additions and 151 deletions

View File

@ -0,0 +1,64 @@
#!/usr/bin/env python3
# Copyright (C) 2016 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Base for extractor classes. With some common generic implementations
# for the boilerplate functions, meant for single-document file handlers.
from __future__ import print_function
import os
import sys
import rclexecm
class RclBaseHandler(object):
    """Base class for single-document file handlers driven by rclexecm.

    Provides generic implementations of the handler entry points
    (``openfile``, ``getipath``, ``getnext``) on top of a single hook,
    ``html_text(fn)``, which subclasses must implement.

    Results are returned as 4-tuples ``(ok, data, "", eof)`` where ``eof``
    is ``rclexecm.RclExecM.eofnow`` or ``rclexecm.RclExecM.eofnext``.
    """

    def __init__(self, em):
        """Store the execm protocol object *em* and reset the index."""
        self.em = em
        # Also set here (not only in openfile()) so that getnext() cannot
        # fail with AttributeError if it is called before openfile().
        self.currentindex = 0

    def html_text(self, fn):
        """Return the HTML rendering of the file named *fn*.

        Subclasses must override this. Any exception raised is caught and
        logged by extractone(), which then reports a failure.
        """
        raise NotImplementedError(
            "%s must implement html_text()" % type(self).__name__)

    def extractone(self, params):
        """Convert the file named by ``params["filename:"]`` to HTML.

        Returns ``(True, html, "", eofnext)`` on success, after setting the
        output MIME type to text/html on the protocol object. Returns
        ``(False, "", "", eofnow)`` if the file name is missing or if
        html_text() raises.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
        try:
            html = self.html_text(fn)
        except Exception as err:
            # Log under the concrete handler's name so that the failing
            # filter can be identified from the log.
            self.em.rclog("%s: %s : %s" % (type(self).__name__, fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
        self.em.setmimetype('text/html')
        return (True, html, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the document index for a new file. Always returns True."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the result of extractone() for *params*."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the single document on the first call, then end of data.

        The first call after openfile() returns the extractone() result;
        any later call returns ``(False, "", "", eofnow)``.
        """
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret

View File

@ -15,51 +15,25 @@
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
######################################
# Base class for simple (one stylesheet) xslt-based handlers
from __future__ import print_function
import sys
import rclexecm
import rclxslt
import gzip
from rclbasehandler import RclBaseHandler
class XSLTExtractor:
class XSLTExtractor(RclBaseHandler):
def __init__(self, em, stylesheet, gzip=False):
self.em = em
self.currentindex = 0
super(XSLTExtractor, self).__init__(em)
self.stylesheet = stylesheet
self.dogz = gzip
def extractone(self, params):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try:
if self.dogz:
data = gzip.open(fn, 'rb').read()
else:
data = open(fn, 'rb').read()
docdata = rclxslt.apply_sheet_data(self.stylesheet, data)
except Exception as err:
self.em.rclog("%s: bad data: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
def html_text(self, fn):
if self.dogz:
data = gzip.open(fn, 'rb').read()
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
data = open(fn, 'rb').read()
return rclxslt.apply_sheet_data(self.stylesheet, data)

View File

@ -12,6 +12,7 @@ import sys
import os
import rclexecm
import re
from rclbasehandler import RclBaseHandler
try:
import pyexiv2
@ -41,31 +42,21 @@ meta_pyexiv2_keys = {
exiv2_dates = ['Exif.Photo.DateTimeOriginal',
'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized']
class ImgTagExtractor:
class ImgTagExtractor(RclBaseHandler):
def __init__(self, em):
self.em = em
self.currentindex = 0
super(ImgTagExtractor, self).__init__(em)
def extractone(self, params):
#self.em.rclog("extractone %s" % params["filename:"])
def html_text(self, filename):
ok = False
if "filename:" not in params:
self.em.rclog("extractone: no file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
try:
metadata = pyexiv2.ImageMetadata(filename)
metadata.read()
keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
mdic = {}
for k in keys:
# we skip numeric keys and undecoded makernote data
if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
mdic[k] = str(metadata[k].raw_value)
except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, "", "", rclexecm.RclExecM.eofnow)
metadata = pyexiv2.ImageMetadata(filename)
metadata.read()
keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
mdic = {}
for k in keys:
# we skip numeric keys and undecoded makernote data
if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
mdic[k] = str(metadata[k].raw_value)
docdata = b'<html><head>\n'
@ -101,25 +92,8 @@ class ImgTagExtractor:
self.em.htmlescape(mdic[k]) + "<br />\n")
docdata += b'</body></html>'
self.em.setmimetype("text/html")
return docdata
return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__':
proto = rclexecm.RclExecM()

View File

@ -22,6 +22,7 @@ import sys
import rclexecm
import rclxslt
from zipfile import ZipFile
from rclbasehandler import RclBaseHandler
stylesheet_meta = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
@ -139,24 +140,14 @@ stylesheet_content = '''<?xml version="1.0"?>
</xsl:stylesheet>
'''
class OOExtractor:
class OOExtractor(RclBaseHandler):
def __init__(self, em):
self.em = em
self.currentindex = 0
super(OOExtractor, self).__init__(em)
def extractone(self, params):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try:
f = open(fn, 'rb')
data = f.read()
f.close()
except Exception as err:
self.em.rclog("open failed: %s" % err)
return (False, "", "", rclexecm.RclExecM.eofnow)
def html_text(self, fn):
f = open(fn, 'rb')
data = f.read()
f.close()
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
b'content="text/html; charset=UTF-8">\n'
@ -172,31 +163,12 @@ class OOExtractor:
docdata += b'</head><body>'
try:
res = rclxslt.apply_sheet_data(stylesheet_content, data)
docdata += res
docdata += b'</body></html>'
except Exception as err:
self.em.rclog("bad data in %s: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
res = rclxslt.apply_sheet_data(stylesheet_content, data)
docdata += res
docdata += b'</body></html>'
return (True, docdata, "", rclexecm.RclExecM.eofnext)
return docdata
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__':
proto = rclexecm.RclExecM()

View File

@ -1,4 +1,19 @@
#!/usr/bin/env python3
# Copyright (C) 2016 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Wrapping a text file. Recoll does it internally in most cases, but
# this is for use by another filter.
@ -7,46 +22,19 @@ from __future__ import print_function
import rclexecm
import sys
from rclbasehandler import RclBaseHandler
class TxtDump:
class TxtDump(RclBaseHandler):
def __init__(self, em):
self.em = em
super(TxtDump, self).__init__(em)
def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \
#params["mimetype:"]))
if not "filename:" in params:
self.em.rclog("extractone: no file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
def html_text(self, fn):
# No charset, so recoll will have to use its config to guess it
txt = b'<html><head><title></title></head><body><pre>'
try:
f = open(fn, "rb")
txt += self.em.htmlescape(f.read())
except Exception as err:
self.em.rclog("TxtDump: %s : %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
txt += b'</pre></body></html>'
return (True, txt, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
html = b'<html><head><title></title></head><body><pre>'
f = open(fn, "rb")
html += self.em.htmlescape(f.read())
html += b'</pre></body></html>'
return html
if __name__ == '__main__':
proto = rclexecm.RclExecM()