diff --git a/src/filters/rclbasehandler.py b/src/filters/rclbasehandler.py new file mode 100644 index 00000000..fa504eed --- /dev/null +++ b/src/filters/rclbasehandler.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# Copyright (C) 2016 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Base for extractor classes. With some common generic implementations +# for the boilerplate functions, meant for single-document file handlers. + +from __future__ import print_function + +import os +import sys +import rclexecm + +class RclBaseHandler(object): + def __init__(self, em): + self.em = em + + + def extractone(self, params): + #self.em.rclog("extractone %s %s" % (params["filename:"], \ + #params["mimetype:"])) + if not "filename:" in params: + self.em.rclog("extractone: no file name") + return (False, "", "", rclexecm.RclExecM.eofnow) + fn = params["filename:"] + + try: + html = self.html_text(fn) + except Exception as err: + self.em.rclog("RclBaseDumper: %s : %s" % (fn, err)) + return (False, "", "", rclexecm.RclExecM.eofnow) + + self.em.setmimetype('text/html') + return (True, html, "", rclexecm.RclExecM.eofnext) + + + ###### File type handler api, used by rclexecm ----------> + def openfile(self, params): + self.currentindex = 0 + return True + + def getipath(self, params): + return self.extractone(params) + + def getnext(self, params): + if self.currentindex >= 1: + return (False, "", "", rclexecm.RclExecM.eofnow) + else: + ret= self.extractone(params) + self.currentindex += 1 + return ret diff --git a/src/filters/rclgenxslt.py b/src/filters/rclgenxslt.py index 2135a443..5ba1f338 100755 --- a/src/filters/rclgenxslt.py +++ b/src/filters/rclgenxslt.py @@ -15,51 +15,25 @@ # Free Software Foundation, Inc., # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ###################################### + +# Base class for simple (one stylesheet) xslt-based handlers + from __future__ import print_function import sys -import rclexecm import rclxslt import gzip +from rclbasehandler import RclBaseHandler -class XSLTExtractor: +class XSLTExtractor(RclBaseHandler): def __init__(self, em, stylesheet, gzip=False): - self.em = em - self.currentindex = 0 + super(XSLTExtractor, self).__init__(em) self.stylesheet = stylesheet self.dogz = gzip - - def extractone(self, params): - if "filename:" not in params: - self.em.rclog("extractone: no mime or file name") - return (False, "", "", rclexecm.RclExecM.eofnow) - fn = params["filename:"] - try: - if self.dogz: - data = gzip.open(fn, 'rb').read() - else: - data = open(fn, 'rb').read() - docdata = rclxslt.apply_sheet_data(self.stylesheet, data) - except Exception as err: - self.em.rclog("%s: bad data: %s" % (fn, err)) - return (False, "", "", rclexecm.RclExecM.eofnow) - - return (True, docdata, "", rclexecm.RclExecM.eofnext) - - - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) + def html_text(self, fn): + if self.dogz: + data = gzip.open(fn, 'rb').read() else: - ret= self.extractone(params) - self.currentindex += 1 - return ret + data = open(fn, 'rb').read() + return rclxslt.apply_sheet_data(self.stylesheet, data) diff --git a/src/filters/rclimg.py b/src/filters/rclimg.py index 6c7eb8fb..ef633b22 100755 --- a/src/filters/rclimg.py +++ b/src/filters/rclimg.py @@ -12,6 +12,7 @@ import sys import os import rclexecm import re +from rclbasehandler import RclBaseHandler try: import pyexiv2 @@ -41,31 +42,21 @@ meta_pyexiv2_keys = { exiv2_dates = ['Exif.Photo.DateTimeOriginal', 'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized'] -class ImgTagExtractor: +class ImgTagExtractor(RclBaseHandler): def __init__(self, em): - self.em = em - self.currentindex = 0 + super(ImgTagExtractor, self).__init__(em) - def extractone(self, params): - #self.em.rclog("extractone %s" % params["filename:"]) + def html_text(self, filename): ok = False - if "filename:" not in params: - self.em.rclog("extractone: no file name") - return (ok, docdata, "", rclexecm.RclExecM.eofnow) - filename = params["filename:"] - try: - metadata = pyexiv2.ImageMetadata(filename) - metadata.read() - keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys - mdic = {} - for k in keys: - # we skip numeric keys and undecoded makernote data - if k != 'Exif.Photo.MakerNote' and not khexre.match(k): - mdic[k] = str(metadata[k].raw_value) - except Exception as err: - self.em.rclog("extractone: extract failed: [%s]" % err) - return (ok, "", "", rclexecm.RclExecM.eofnow) + metadata = pyexiv2.ImageMetadata(filename) + metadata.read() + keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys + mdic = {} + for k in keys: + # we skip numeric keys and undecoded makernote data + if k != 'Exif.Photo.MakerNote' and not khexre.match(k): + mdic[k] = str(metadata[k].raw_value) docdata = b'
\n' @@ -101,25 +92,8 @@ class ImgTagExtractor: self.em.htmlescape(mdic[k]) + "