diff --git a/src/filters/rclbasehandler.py b/src/filters/rclbasehandler.py new file mode 100644 index 00000000..fa504eed --- /dev/null +++ b/src/filters/rclbasehandler.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# Copyright (C) 2016 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Base for extractor classes. With some common generic implementations +# for the boilerplate functions, meant for single-document file handlers. 
+
+from __future__ import print_function
+
+import os
+import sys
+import rclexecm
+
+class RclBaseHandler(object):
+    """Base for single-document handlers: subclasses implement html_text(fn)."""
+    def __init__(self, em):
+        self.em = em
+        # Also reset by openfile(); set here so getnext() never finds it missing.
+        self.currentindex = 0
+
+    def extractone(self, params):
+        """Run html_text() on params["filename:"]; return the rclexecm result tuple."""
+        if "filename:" not in params:
+            self.em.rclog("extractone: no file name")
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        fn = params["filename:"]
+
+        try:
+            html = self.html_text(fn)
+        except Exception as err:
+            # Subclass failures are logged and reported as "no document".
+            self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+
+        self.em.setmimetype('text/html')
+        return (True, html, "", rclexecm.RclExecM.eofnext)
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        self.currentindex = 0
+        return True
+
+    def getipath(self, params):
+        return self.extractone(params)
+
+    def getnext(self, params):
+        if self.currentindex >= 1:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        ret = self.extractone(params)
+        self.currentindex += 1
+        return ret
diff --git a/src/filters/rclgenxslt.py b/src/filters/rclgenxslt.py
index 2135a443..5ba1f338 100755
--- a/src/filters/rclgenxslt.py
+++ b/src/filters/rclgenxslt.py
@@ -15,51 +15,25 @@
 # Free Software Foundation, Inc.,
 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
######################################
+
+# Base class for simple (one stylesheet) xslt-based handlers
+
+
 from __future__ import print_function
 
 import sys
-import rclexecm
 import rclxslt
 import gzip
+from rclbasehandler import RclBaseHandler
 
-class XSLTExtractor:
+class XSLTExtractor(RclBaseHandler):
     def __init__(self, em, stylesheet, gzip=False):
-        self.em = em
-        self.currentindex = 0
+        super(XSLTExtractor, self).__init__(em)
         self.stylesheet = stylesheet
         self.dogz = gzip
-
-    def extractone(self, params):
-        if "filename:" not in params:
-            self.em.rclog("extractone: no mime or file name")
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        fn = params["filename:"]
-        try:
-            if self.dogz:
-                data = gzip.open(fn, 'rb').read()
-            else:
-                data = open(fn, 'rb').read()
-            docdata = rclxslt.apply_sheet_data(self.stylesheet, data)
-        except Exception as err:
-            self.em.rclog("%s: bad data: %s" % (fn, err))
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-
-        return (True, docdata, "", rclexecm.RclExecM.eofnext)
-
-
-    ###### File type handler api, used by rclexecm ---------->
-    def openfile(self, params):
-        self.currentindex = 0
-        return True
-
-    def getipath(self, params):
-        return self.extractone(params)
-
-    def getnext(self, params):
-        if self.currentindex >= 1:
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-        else:
-            ret= self.extractone(params)
-            self.currentindex += 1
-            return ret
+    def html_text(self, fn):
+        # Pick the opener first; "with" closes the handle even if read() fails.
+        opener = gzip.open if self.dogz else open
+        with opener(fn, 'rb') as f:
+            data = f.read()
+        return rclxslt.apply_sheet_data(self.stylesheet, data)
diff --git a/src/filters/rclimg.py b/src/filters/rclimg.py
index 6c7eb8fb..ef633b22 100755
--- a/src/filters/rclimg.py
+++ b/src/filters/rclimg.py
@@ -12,6 +12,7 @@ import sys
 import os
 import rclexecm
 import re
+from rclbasehandler import RclBaseHandler
 
 try:
     import pyexiv2
@@ -41,31 +42,21 @@ meta_pyexiv2_keys = { exiv2_dates = ['Exif.Photo.DateTimeOriginal', 'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized'] -class 
ImgTagExtractor: +class ImgTagExtractor(RclBaseHandler): def __init__(self, em): - self.em = em - self.currentindex = 0 + super(ImgTagExtractor, self).__init__(em) - def extractone(self, params): - #self.em.rclog("extractone %s" % params["filename:"]) + def html_text(self, filename): ok = False - if "filename:" not in params: - self.em.rclog("extractone: no file name") - return (ok, docdata, "", rclexecm.RclExecM.eofnow) - filename = params["filename:"] - try: - metadata = pyexiv2.ImageMetadata(filename) - metadata.read() - keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys - mdic = {} - for k in keys: - # we skip numeric keys and undecoded makernote data - if k != 'Exif.Photo.MakerNote' and not khexre.match(k): - mdic[k] = str(metadata[k].raw_value) - except Exception as err: - self.em.rclog("extractone: extract failed: [%s]" % err) - return (ok, "", "", rclexecm.RclExecM.eofnow) + metadata = pyexiv2.ImageMetadata(filename) + metadata.read() + keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys + mdic = {} + for k in keys: + # we skip numeric keys and undecoded makernote data + if k != 'Exif.Photo.MakerNote' and not khexre.match(k): + mdic[k] = str(metadata[k].raw_value) docdata = b'\n' @@ -101,25 +92,8 @@ class ImgTagExtractor: self.em.htmlescape(mdic[k]) + "
\n") docdata += b'' - self.em.setmimetype("text/html") + return docdata - return (True, docdata, "", rclexecm.RclExecM.eofnext) - - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret if __name__ == '__main__': proto = rclexecm.RclExecM() diff --git a/src/filters/rclsoff-flat.py b/src/filters/rclsoff-flat.py index 9101fb33..4971e524 100755 --- a/src/filters/rclsoff-flat.py +++ b/src/filters/rclsoff-flat.py @@ -22,6 +22,7 @@ import sys import rclexecm import rclxslt from zipfile import ZipFile +from rclbasehandler import RclBaseHandler stylesheet_meta = ''' ''' -class OOExtractor: +class OOExtractor(RclBaseHandler): def __init__(self, em): - self.em = em - self.currentindex = 0 + super(OOExtractor, self).__init__(em) - def extractone(self, params): - if "filename:" not in params: - self.em.rclog("extractone: no mime or file name") - return (False, "", "", rclexecm.RclExecM.eofnow) - fn = params["filename:"] - - try: - f = open(fn, 'rb') - data = f.read() - f.close() - except Exception as err: - self.em.rclog("open failed: %s" % err) - return (False, "", "", rclexecm.RclExecM.eofnow) + def html_text(self, fn): + f = open(fn, 'rb') + data = f.read() + f.close() docdata = b'\n\n\n' @@ -172,31 +163,12 @@ class OOExtractor: docdata += b'' - try: - res = rclxslt.apply_sheet_data(stylesheet_content, data) - docdata += res - docdata += b'' - except Exception as err: - self.em.rclog("bad data in %s: %s" % (fn, err)) - return (False, "", "", rclexecm.RclExecM.eofnow) + res = rclxslt.apply_sheet_data(stylesheet_content, data) + docdata += res + docdata += b'' - return (True, docdata, "", rclexecm.RclExecM.eofnext) + return docdata - ###### File type 
handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret if __name__ == '__main__': proto = rclexecm.RclExecM() diff --git a/src/filters/rcltext.py b/src/filters/rcltext.py index 4ad7d9d2..ec8a0da3 100755 --- a/src/filters/rcltext.py +++ b/src/filters/rcltext.py @@ -1,4 +1,19 @@ #!/usr/bin/env python3 +# Copyright (C) 2016 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Wrapping a text file. Recoll does it internally in most cases, but # this is for use by another filter. 
@@ -7,46 +22,19 @@ from __future__ import print_function import rclexecm import sys +from rclbasehandler import RclBaseHandler -class TxtDump: +class TxtDump(RclBaseHandler): def __init__(self, em): - self.em = em + super(TxtDump, self).__init__(em) - def extractone(self, params): - #self.em.rclog("extractone %s %s" % (params["filename:"], \ - #params["mimetype:"])) - if not "filename:" in params: - self.em.rclog("extractone: no file name") - return (False, "", "", rclexecm.RclExecM.eofnow) - - fn = params["filename:"] + def html_text(self, fn): # No charset, so recoll will have to use its config to guess it - txt = b'
'
-        try:
-            f = open(fn, "rb")
-            txt += self.em.htmlescape(f.read())
-        except Exception as err:
-            self.em.rclog("TxtDump: %s : %s" % (fn, err))
-            return (False, "", "", rclexecm.RclExecM.eofnow)
-            
-        txt += b'
' - return (True, txt, "", rclexecm.RclExecM.eofnext) - - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret + html = b'
'
+        with open(fn, "rb") as f:  # closed on exit, even if read() raises
+            html += self.em.htmlescape(f.read())
+        html += b'
' + return html if __name__ == '__main__': proto = rclexecm.RclExecM()