factorize boilerplate in simple filters

2018-06-04 15:08:06 +02:00 · 2018-06-04 15:08:06 +02:00 · 0d24cc35da
commit 0d24cc35da
parent 211ea8010c
5 changed files with 123 additions and 151 deletions
--- a/src/filters/rclbasehandler.py
+++ b/src/filters/rclbasehandler.py
@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# Copyright (C) 2016 J.F.Dockes
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+# Base class for extractors, with generic implementations of the
+# boilerplate functions. Meant for single-document file handlers.
+
+from __future__ import print_function
+
+import os
+import sys
+import rclexecm
+
+class RclBaseHandler(object):
+    def __init__(self, em):
+        self.em = em
+
+
+    def extractone(self, params):
+        #self.em.rclog("extractone %s %s" % (params["filename:"], \
+        #params["mimetype:"]))
+        if not "filename:" in params:
+            self.em.rclog("extractone: no file name")
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        fn = params["filename:"]
+
+        try:
+            html = self.html_text(fn)
+        except Exception as err:
+            self.em.rclog("RclBaseDumper: %s : %s" % (fn, err))
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+
+        self.em.setmimetype('text/html')
+        return (True, html, "", rclexecm.RclExecM.eofnext)
+        
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        self.currentindex = 0
+        return True
+
+    def getipath(self, params):
+        return self.extractone(params)
+
+    def getnext(self, params):
+        if self.currentindex >= 1:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        else:
+            ret= self.extractone(params)
+            self.currentindex += 1
+            return ret
--- a/src/filters/rclgenxslt.py
+++ b/src/filters/rclgenxslt.py
@ -15,51 +15,25 @@
 #   Free Software Foundation, Inc.,
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ######################################
+
+# Base class for simple (one stylesheet) xslt-based handlers
+
 from __future__ import print_function

 import sys
-import rclexecm
 import rclxslt
 import gzip
+from rclbasehandler import RclBaseHandler

-class XSLTExtractor:
+class XSLTExtractor(RclBaseHandler):
    def __init__(self, em, stylesheet, gzip=False):
-        self.em = em
-        self.currentindex = 0
+        super(XSLTExtractor, self).__init__(em)
        self.stylesheet = stylesheet
        self.dogz = gzip

-
-    def extractone(self, params):
-        if "filename:" not in params:
-            self.em.rclog("extractone: no mime or file name")
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        fn = params["filename:"]
-        try:
-            if self.dogz:
-                data = gzip.open(fn, 'rb').read()
-            else:
-                data = open(fn, 'rb').read()
-            docdata = rclxslt.apply_sheet_data(self.stylesheet, data)
-        except Exception as err:
-            self.em.rclog("%s: bad data: %s" % (fn, err))
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-
-        return (True, docdata, "", rclexecm.RclExecM.eofnext)
-    
-
-    ###### File type handler api, used by rclexecm ---------->
-    def openfile(self, params):
-        self.currentindex = 0
-        return True
-
-    def getipath(self, params):
-        return self.extractone(params)
-        
-    def getnext(self, params):
-        if self.currentindex >= 1:
-            return (False, "", "", rclexecm.RclExecM.eofnow)
+    def html_text(self, fn):
+        if self.dogz:
+            data = gzip.open(fn, 'rb').read()
        else:
-            ret= self.extractone(params)
-            self.currentindex += 1
-            return ret
+            data = open(fn, 'rb').read()
+        return rclxslt.apply_sheet_data(self.stylesheet, data)
--- a/src/filters/rclimg.py
+++ b/src/filters/rclimg.py
@ -12,6 +12,7 @@ import sys
 import os
 import rclexecm
 import re
+from rclbasehandler import RclBaseHandler

 try:
    import pyexiv2
@ -41,31 +42,21 @@ meta_pyexiv2_keys = {
 exiv2_dates = ['Exif.Photo.DateTimeOriginal',
               'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized']

-class ImgTagExtractor:
+class ImgTagExtractor(RclBaseHandler):
    def __init__(self, em):
-        self.em = em
-        self.currentindex = 0
+        super(ImgTagExtractor, self).__init__(em)

-    def extractone(self, params):
-        #self.em.rclog("extractone %s" % params["filename:"])
+    def html_text(self, filename):
        ok = False
-        if "filename:" not in params:
-            self.em.rclog("extractone: no file name")
-            return (ok, docdata, "", rclexecm.RclExecM.eofnow)
-        filename = params["filename:"]

-        try:
-            metadata = pyexiv2.ImageMetadata(filename)
-            metadata.read()
-            keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
-            mdic = {}
-            for k in keys:
-                # we skip numeric keys and undecoded makernote data
-                if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
-                    mdic[k] = str(metadata[k].raw_value)
-        except Exception as err:
-            self.em.rclog("extractone: extract failed: [%s]" % err)
-            return (ok, "", "", rclexecm.RclExecM.eofnow)
+        metadata = pyexiv2.ImageMetadata(filename)
+        metadata.read()
+        keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
+        mdic = {}
+        for k in keys:
+            # we skip numeric keys and undecoded makernote data
+            if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
+                mdic[k] = str(metadata[k].raw_value)

        docdata = b'<html><head>\n'

@ -101,25 +92,8 @@ class ImgTagExtractor:
                                     self.em.htmlescape(mdic[k]) + "<br />\n")
        docdata += b'</body></html>'

-        self.em.setmimetype("text/html")
+        return docdata

-        return (True, docdata, "", rclexecm.RclExecM.eofnext)
-
-    ###### File type handler api, used by rclexecm ---------->
-    def openfile(self, params):
-        self.currentindex = 0
-        return True
-
-    def getipath(self, params):
-        return self.extractone(params)
-        
-    def getnext(self, params):
-        if self.currentindex >= 1:
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        else:
-            ret= self.extractone(params)
-            self.currentindex += 1
-            return ret

 if __name__ == '__main__':
    proto = rclexecm.RclExecM()
--- a/src/filters/rclsoff-flat.py
+++ b/src/filters/rclsoff-flat.py
@ -22,6 +22,7 @@ import sys
 import rclexecm
 import rclxslt
 from zipfile import ZipFile
+from rclbasehandler import RclBaseHandler

 stylesheet_meta = '''<?xml version="1.0"?>
 <xsl:stylesheet version="1.0"
@ -139,24 +140,14 @@ stylesheet_content  = '''<?xml version="1.0"?>
 </xsl:stylesheet>
 '''

-class OOExtractor:
+class OOExtractor(RclBaseHandler):
    def __init__(self, em):
-        self.em = em
-        self.currentindex = 0
+        super(OOExtractor, self).__init__(em)

-    def extractone(self, params):
-        if "filename:" not in params:
-            self.em.rclog("extractone: no mime or file name")
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        fn = params["filename:"]
-
-        try:
-            f = open(fn, 'rb')
-            data = f.read()
-            f.close()
-        except Exception as err:
-            self.em.rclog("open failed: %s" % err)
-            return (False, "", "", rclexecm.RclExecM.eofnow)
+    def html_text(self, fn):
+        f = open(fn, 'rb')
+        data = f.read()
+        f.close()

        docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
                  b'content="text/html; charset=UTF-8">\n'
@ -172,31 +163,12 @@ class OOExtractor:

        docdata += b'</head><body>'

-        try:
-            res = rclxslt.apply_sheet_data(stylesheet_content, data)
-            docdata += res
-            docdata += b'</body></html>'
-        except Exception as err:
-            self.em.rclog("bad data in %s: %s" % (fn, err))
-            return (False, "", "", rclexecm.RclExecM.eofnow)
+        res = rclxslt.apply_sheet_data(stylesheet_content, data)
+        docdata += res
+        docdata += b'</body></html>'

-        return (True, docdata, "", rclexecm.RclExecM.eofnext)
+        return docdata
    
-    ###### File type handler api, used by rclexecm ---------->
-    def openfile(self, params):
-        self.currentindex = 0
-        return True
-
-    def getipath(self, params):
-        return self.extractone(params)
-        
-    def getnext(self, params):
-        if self.currentindex >= 1:
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        else:
-            ret= self.extractone(params)
-            self.currentindex += 1
-            return ret

 if __name__ == '__main__':
    proto = rclexecm.RclExecM()
--- a/src/filters/rcltext.py
+++ b/src/filters/rcltext.py
@ -1,4 +1,19 @@
 #!/usr/bin/env python3
+# Copyright (C) 2016 J.F.Dockes
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

 # Wrapping a text file. Recoll does it internally in most cases, but
 # this is for use by another filter.
@ -7,46 +22,19 @@ from __future__ import print_function

 import rclexecm
 import sys
+from rclbasehandler import RclBaseHandler

-class TxtDump:
+class TxtDump(RclBaseHandler):
    def __init__(self, em):
-        self.em = em
+        super(TxtDump, self).__init__(em)

-    def extractone(self, params):
-        #self.em.rclog("extractone %s %s" % (params["filename:"], \
-        #params["mimetype:"]))
-        if not "filename:" in params:
-            self.em.rclog("extractone: no file name")
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-
-        fn = params["filename:"]
+    def html_text(self, fn):
        # No charset, so recoll will have to use its config to guess it
-        txt = b'<html><head><title></title></head><body><pre>'
-        try:
-            f = open(fn, "rb")
-            txt += self.em.htmlescape(f.read())
-        except Exception as err:
-            self.em.rclog("TxtDump: %s : %s" % (fn, err))
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-            
-        txt += b'</pre></body></html>'
-        return (True, txt, "", rclexecm.RclExecM.eofnext)
-        
-    ###### File type handler api, used by rclexecm ---------->
-    def openfile(self, params):
-        self.currentindex = 0
-        return True
-
-    def getipath(self, params):
-        return self.extractone(params)
-        
-    def getnext(self, params):
-        if self.currentindex >= 1:
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        else:
-            ret= self.extractone(params)
-            self.currentindex += 1
-            return ret
+        html = b'<html><head><title></title></head><body><pre>'
+        f = open(fn, "rb")
+        html += self.em.htmlescape(f.read())
+        html += b'</pre></body></html>'
+        return html

 if __name__ == '__main__':
    proto = rclexecm.RclExecM()