factorize boilerplate in simple filters

2018-06-04 15:08:06 +02:00 · 2018-06-04 15:08:06 +02:00 · 0d24cc35da
commit 0d24cc35da
parent 211ea8010c
5 changed files with 123 additions and 151 deletions
--- a/src/filters/rclbasehandler.py
+++ b/src/filters/rclbasehandler.py
@ -0,0 +1,64 @@
 #!/usr/bin/env python3
 # Copyright (C) 2016 J.F.Dockes
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the
 # Free Software Foundation, Inc.,
 # 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 # Base class for extractor classes. It provides common generic implementations
 # of the boilerplate functions, and is meant for single-document file handlers.
 from __future__ import print_function
 import os
 import sys
 import rclexecm
 class RclBaseHandler(object):
    """Base class for handlers which produce one HTML document per file.

    The class implements the handler methods called by rclexecm
    (openfile(), getipath(), getnext()) on top of a single hook,
    html_text(fn), which subclasses must provide. The hook receives the
    file name and returns the HTML data for the document; it reports
    failure by raising an exception.
    """

    def __init__(self, em):
        """Store the rclexecm execution manager object.

        em is used by this class for logging (em.rclog()) and for setting
        the output MIME type (em.setmimetype()).
        """
        self.em = em
        # Initialized here as well as in openfile(), so that getnext() does
        # not fail with AttributeError if it is called before openfile().
        self.currentindex = 0

    def html_text(self, fn):
        """Return the HTML data for file fn. To be overridden by subclasses.

        Any exception raised here is caught and logged by extractone(),
        which then reports the extraction as failed.
        """
        raise NotImplementedError(
            "%s does not implement html_text()" % self.__class__.__name__)

    def extractone(self, params):
        """Extract the document for the file named by params["filename:"].

        Returns a 4-tuple (ok, data, "", eof):
        - on success: (True, html, "", rclexecm.RclExecM.eofnext), after
          setting the output MIME type to text/html;
        - if the file name is missing or html_text() raised:
          (False, "", "", rclexecm.RclExecM.eofnow), after logging the error.
        """
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #params["mimetype:"]))
        if "filename:" not in params:
            self.em.rclog("extractone: no file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
        try:
            html = self.html_text(fn)
        except Exception as err:
            # Deliberately broad: whatever goes wrong in a subclass
            # converter must become a failed extraction, not a crash of the
            # handler process.
            self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
        self.em.setmimetype('text/html')
        return (True, html, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the document counter for a new file. Always returns True."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the document for params; same result as extractone()."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the file's single document, then end-of-file.

        The first call after openfile() returns the result of extractone();
        any further call returns (False, "", "", rclexecm.RclExecM.eofnow).
        """
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
--- a/src/filters/rclgenxslt.py
+++ b/src/filters/rclgenxslt.py
@ -15,51 +15,25 @@
 #   Free Software Foundation, Inc.,
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ######################################
 # Base class for simple (one stylesheet) xslt-based handlers
 from __future__ import print_function
 import sys
 import rclexecm
 import rclxslt
 import gzip
 from rclbasehandler import RclBaseHandler
-class XSLTExtractor:
+class XSLTExtractor(RclBaseHandler):
    def __init__(self, em, stylesheet, gzip=False):
-        self.em = em
+        super(XSLTExtractor, self).__init__(em)
        self.currentindex = 0
        self.stylesheet = stylesheet
        self.dogz = gzip
-
+    def html_text(self, fn):
-    def extractone(self, params):
+        if self.dogz:
-        if "filename:" not in params:
+            data = gzip.open(fn, 'rb').read()
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
        try:
            if self.dogz:
                data = gzip.open(fn, 'rb').read()
            else:
                data = open(fn, 'rb').read()
            docdata = rclxslt.apply_sheet_data(self.stylesheet, data)
        except Exception as err:
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
        return (True, docdata, "", rclexecm.RclExecM.eofnext)
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.currentindex = 0
        return True
    def getipath(self, params):
        return self.extractone(params)
    def getnext(self, params):
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
-            ret= self.extractone(params)
+            data = open(fn, 'rb').read()
-            self.currentindex += 1
+        return rclxslt.apply_sheet_data(self.stylesheet, data)
            return ret
--- a/src/filters/rclimg.py
+++ b/src/filters/rclimg.py
@ -12,6 +12,7 @@ import sys
 import os
 import rclexecm
 import re
 from rclbasehandler import RclBaseHandler
 try:
    import pyexiv2
@ -41,31 +42,21 @@ meta_pyexiv2_keys = {
 exiv2_dates = ['Exif.Photo.DateTimeOriginal',
               'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized']
-class ImgTagExtractor:
+class ImgTagExtractor(RclBaseHandler):
    def __init__(self, em):
-        self.em = em
+        super(ImgTagExtractor, self).__init__(em)
        self.currentindex = 0
-    def extractone(self, params):
+    def html_text(self, filename):
        #self.em.rclog("extractone %s" % params["filename:"])
        ok = False
        if "filename:" not in params:
            self.em.rclog("extractone: no file name")
            return (ok, docdata, "", rclexecm.RclExecM.eofnow)
        filename = params["filename:"]
-        try:
+        metadata = pyexiv2.ImageMetadata(filename)
-            metadata = pyexiv2.ImageMetadata(filename)
+        metadata.read()
-            metadata.read()
+        keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
-            keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
+        mdic = {}
-            mdic = {}
+        for k in keys:
-            for k in keys:
+            # we skip numeric keys and undecoded makernote data
-                # we skip numeric keys and undecoded makernote data
+            if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
-                if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
+                mdic[k] = str(metadata[k].raw_value)
                    mdic[k] = str(metadata[k].raw_value)
        except Exception as err:
            self.em.rclog("extractone: extract failed: [%s]" % err)
            return (ok, "", "", rclexecm.RclExecM.eofnow)
        docdata = b'<html><head>\n'
@ -101,25 +92,8 @@ class ImgTagExtractor:
                                     self.em.htmlescape(mdic[k]) + "<br />\n")
        docdata += b'</body></html>'
-        self.em.setmimetype("text/html")
+        return docdata
        return (True, docdata, "", rclexecm.RclExecM.eofnext)
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.currentindex = 0
        return True
    def getipath(self, params):
        return self.extractone(params)
    def getnext(self, params):
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret= self.extractone(params)
            self.currentindex += 1
            return ret
 if __name__ == '__main__':
    proto = rclexecm.RclExecM()
--- a/src/filters/rclsoff-flat.py
+++ b/src/filters/rclsoff-flat.py
@ -22,6 +22,7 @@ import sys
 import rclexecm
 import rclxslt
 from zipfile import ZipFile
 from rclbasehandler import RclBaseHandler
 stylesheet_meta = '''<?xml version="1.0"?>
 <xsl:stylesheet version="1.0"
@ -139,24 +140,14 @@ stylesheet_content  = '''<?xml version="1.0"?>
 </xsl:stylesheet>
 '''
-class OOExtractor:
+class OOExtractor(RclBaseHandler):
    def __init__(self, em):
-        self.em = em
+        super(OOExtractor, self).__init__(em)
        self.currentindex = 0
-    def extractone(self, params):
+    def html_text(self, fn):
-        if "filename:" not in params:
+        f = open(fn, 'rb')
-            self.em.rclog("extractone: no mime or file name")
+        data = f.read()
-            return (False, "", "", rclexecm.RclExecM.eofnow)
+        f.close()
        fn = params["filename:"]
        try:
            f = open(fn, 'rb')
            data = f.read()
            f.close()
        except Exception as err:
            self.em.rclog("open failed: %s" % err)
            return (False, "", "", rclexecm.RclExecM.eofnow)
        docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
                  b'content="text/html; charset=UTF-8">\n'
@ -172,31 +163,12 @@ class OOExtractor:
        docdata += b'</head><body>'
-        try:
+        res = rclxslt.apply_sheet_data(stylesheet_content, data)
-            res = rclxslt.apply_sheet_data(stylesheet_content, data)
+        docdata += res
-            docdata += res
+        docdata += b'</body></html>'
            docdata += b'</body></html>'
        except Exception as err:
            self.em.rclog("bad data in %s: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
-        return (True, docdata, "", rclexecm.RclExecM.eofnext)
+        return docdata
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.currentindex = 0
        return True
    def getipath(self, params):
        return self.extractone(params)
    def getnext(self, params):
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret= self.extractone(params)
            self.currentindex += 1
            return ret
 if __name__ == '__main__':
    proto = rclexecm.RclExecM()
--- a/src/filters/rcltext.py
+++ b/src/filters/rcltext.py
@ -1,4 +1,19 @@
 #!/usr/bin/env python3
 # Copyright (C) 2016 J.F.Dockes
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the
 # Free Software Foundation, Inc.,
 # 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 # Wrapping a text file. Recoll does it internally in most cases, but
 # this is for use by another filter.
@ -7,46 +22,19 @@ from __future__ import print_function
 import rclexecm
 import sys
 from rclbasehandler import RclBaseHandler
-class TxtDump:
+class TxtDump(RclBaseHandler):
    def __init__(self, em):
-        self.em = em
+        super(TxtDump, self).__init__(em)
-    def extractone(self, params):
+    def html_text(self, fn):
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #params["mimetype:"]))
        if not "filename:" in params:
            self.em.rclog("extractone: no file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
        # No charset, so recoll will have to use its config to guess it
-        txt = b'<html><head><title></title></head><body><pre>'
+        html = b'<html><head><title></title></head><body><pre>'
-        try:
+        f = open(fn, "rb")
-            f = open(fn, "rb")
+        html += self.em.htmlescape(f.read())
-            txt += self.em.htmlescape(f.read())
+        html += b'</pre></body></html>'
-        except Exception as err:
+        return html
            self.em.rclog("TxtDump: %s : %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
        txt += b'</pre></body></html>'
        return (True, txt, "", rclexecm.RclExecM.eofnext)
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.currentindex = 0
        return True
    def getipath(self, params):
        return self.extractone(params)
    def getnext(self, params):
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret= self.extractone(params)
            self.currentindex += 1
            return ret
 if __name__ == '__main__':
    proto = rclexecm.RclExecM()