diff --git a/src/filters/rclaudio b/src/filters/rclaudio index fcca997b..39bb55d4 100755 --- a/src/filters/rclaudio +++ b/src/filters/rclaudio @@ -1,10 +1,11 @@ #!/usr/bin/env python3 -# Audio tag filter for Recoll, using mutagen +# Audio tag extractor for Recoll, using mutagen import sys import os import rclexecm +from rclbasehandler import RclBaseHandler import time import datetime import re @@ -174,10 +175,11 @@ def tobytes(s): # mp3: album, title, artist, genre, date, tracknumber # flac: album, title, artist, genre, xxx, tracknumber # oggvorbis:album, title, artist, genre, date, tracknumber -class AudioTagExtractor: +class AudioTagExtractor(RclBaseHandler): + def __init__(self, em): - self.em = em - self.currentindex = 0 + super(AudioTagExtractor, self).__init__(em) + def _showMutaInfo(self, mutf): self.em.rclog("%s" % mutf.info.pprint()) @@ -186,10 +188,6 @@ class AudioTagExtractor: (prop, getattr( mutf.info, prop))) - def _printableFilename(self): - return self.filename.decode('utf-8', errors='replace') - - def _embeddedImageFormat(self, mutf): #self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime) if 'audio/mp3' in mutf.mime: @@ -232,24 +230,16 @@ class AudioTagExtractor: val = time.mktime(pdt.timetuple()) return "%d" % val - def extractone(self, params): - #self.em.rclog("extractone %s %s" % (params["filename:"], - # params["mimetype:"])) - docdata = "" - ok = False - if not "mimetype:" in params or not "filename:" in params: - self.em.rclog("extractone: no mime or file name") - return (ok, docdata, "", rclexecm.RclExecM.eofnow) - filename = params["filename:"] - mimetype = params["mimetype:"] - self.filename = filename - #self.em.rclog("%s" % filename) - try: - mutf = File(filename) - except Exception as err: - self.em.rclog("extractone: extract failed: [%s]" % err) - return (ok, docdata, "", rclexecm.RclExecM.eofnow) + def html_text(self, filename): + if not self.inputmimetype: + raise Exception("html_text: input MIME type not set") + mimetype = self.inputmimetype + + mutf = File(filename) + if not mutf: + raise Exception("mutagen failed opening %s" % filename) + #self._showMutaInfo(mutf) ################### @@ -361,29 +351,10 @@ class AudioTagExtractor: try: docdata = tobytes(mutf.pprint()) except Exception as err: + docdata = "" self.em.rclog("Doc pprint error: %s" % err) - ok = True - return (ok, docdata, "", rclexecm.RclExecM.eofnext) - - - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - - def getipath(self, params): - return self.extractone(params) - - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret + return docdata def makeObject(): diff --git a/src/filters/rclbasehandler.py b/src/filters/rclbasehandler.py index 859dc45d..2f1a66f6 100644 --- a/src/filters/rclbasehandler.py +++ b/src/filters/rclbasehandler.py @@ -25,22 +25,36 @@ import sys import rclexecm class RclBaseHandler(object): + '''Base Object for simple extractors. + + This implements the boilerplate code for simple extractors for + file types with a single document. The derived class would + typically need only to implement the html_text method to return + the document text in HTML format''' + def __init__(self, em): self.em = em def extractone(self, params): - #self.em.rclog("extractone %s %s" % (params["filename:"], \ - #params["mimetype:"])) + #self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \ + # params["mimetype:"])) if not "filename:" in params: self.em.rclog("extractone: no file name") return (False, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] + if "mimetype:" in params: + self.inputmimetype = params["mimetype:"] + else: + self.inputmimetype = None + try: html = self.html_text(fn) except Exception as err: - self.em.rclog("RclBaseDumper: %s : %s" % (fn, err)) + import traceback + traceback.print_exc() + self.em.rclog("RclBaseHandler: %s : %s" % (fn, err)) return (False, "", "", rclexecm.RclExecM.eofnow) self.em.setmimetype('text/html') @@ -52,9 +66,11 @@ class RclBaseHandler(object): self.currentindex = 0 return True + def getipath(self, params): return self.extractone(params) + def getnext(self, params): if self.currentindex >= 1: return (False, "", "", rclexecm.RclExecM.eofnow) diff --git a/src/filters/rcldia b/src/filters/rcldia index 363c9cc2..64209507 100755 --- a/src/filters/rcldia +++ b/src/filters/rcldia @@ -18,6 +18,7 @@ from __future__ import print_function # Small fixes from jfd: dia files are sometimes not compressed. import rclexecm +from rclbasehandler import RclBaseHandler import re from gzip import GzipFile import xml.parsers.expat @@ -58,54 +59,32 @@ class Parser: self._parser.ParseFile(fh) del self._parser -class DiaExtractor: +class DiaExtractor(RclBaseHandler): + def __init__(self, em): - self.em = em + super(DiaExtractor, self).__init__(em) - def extractdia(self): - docdata = "" - ipath = "" - try: - docdata = self.ExtractDiaText() - ok = True - except Exception as err: - self.em.rclog("Dia parse failed: %s"%err) - ok = False - iseof = rclexecm.RclExecM.eofnext - self.em.setmimetype("text/plain") - return (ok, docdata, ipath, iseof) - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): + def html_text(self, fn): try: - self.dia = GzipFile(params["filename:"], 'rb') + dia = GzipFile(fn, 'rb') # Dia files are sometimes not compressed. Quite weirdly, - # GzipFile does not complain until we try to read. Have to do it - # here to be able to retry an uncompressed open. - data = self.dia.readline() - self.dia.seek(0) - return True + # GzipFile does not complain until we try to read. + data = dia.readline() + dia.seek(0) except: # File not compressed ? - try: - self.dia = open(params["filename:"], 'rb') - except: - return False - return True + dia = open(fn, 'rb') - def getipath(self, params): - ok, data, ipath, eof = self.extractdia() - return (ok, data, ipath, eof) - - def getnext(self, params): - ok, data, ipath, eof = self.extractdia() - return (ok, data, ipath, eof) - - ###### read data - def ExtractDiaText(self): diap = Parser(self.em) - diap.feed(self.dia) - return '\n'.join(diap.string) + diap.feed(dia) + + html = '
'
+        html += self.em.htmlescape('\n'.join(diap.string))
+        html += '
' + + return html + # Main program: create protocol handler and extractor and run them proto = rclexecm.RclExecM() diff --git a/src/filters/rcldjvu.py b/src/filters/rcldjvu.py index b8015d2f..f02ecbe0 100755 --- a/src/filters/rcldjvu.py +++ b/src/filters/rcldjvu.py @@ -24,22 +24,28 @@ import sys import re import rclexecm import subprocess +from rclbasehandler import RclBaseHandler + +class DJVUExtractor(RclBaseHandler): -class DJVUExtractor: def __init__(self, em): - self.currentindex = 0 - self.djvused = None - self.djvutxt = None - self.em = em + super(DJVUExtractor, self).__init__(em) + self.djvutxt = rclexecm.which("djvutxt") + if not self.djvutxt: + print("RECFILTERROR HELPERNOTFOUND djvutxt") + sys.exit(1); + self.djvused = rclexecm.which("djvused") - def extractone(self, params): + + def html_text(self, fn): self.em.setmimetype('text/html') # Extract metadata + metadata = b"" if self.djvused: try: - metadata = subprocess.check_output([self.djvused, self.filename, - "-e", "select 1;print-meta"]) + metadata = subprocess.check_output( + [self.djvused, fn, "-e", "select 1;print-meta"]) except Exception as e: self.em.rclog("djvused failed: %s" % e) author = "" @@ -55,14 +61,12 @@ class DJVUExtractor: title = ' '.join(line[1:]) # Main text - try: - txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename]) - except Exception as e: - self.em.rclog("djvused failed: %s" % e) - return (False, "", "", rclexecm.RclExecM.eofnow) + txtdata = subprocess.check_output([self.djvutxt, "-escape", fn]) + txtdata = txtdata.decode('UTF-8', 'replace') - data = '''''' + self.em.htmlescape(title) + '''''' + data = '''''' + data += '''''' + self.em.htmlescape(title) + '''''' data += '''''' if author: @@ -72,34 +76,8 @@ class DJVUExtractor: data += self.em.htmlescape(txtdata) data += '''''' - return (True, data, "", rclexecm.RclExecM.eofnext) + return data - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.filename = params["filename:"] - self.currentindex = 0 - #self.em.rclog("openfile: [%s]" % self.filename) - - if not self.djvutxt: - self.djvutxt = rclexecm.which("djvutxt") - if not self.djvutxt: - print("RECFILTERROR HELPERNOTFOUND djvutxt") - sys.exit(1); - self.djvused = rclexecm.which("djvused") - - return True - - def getipath(self, params): - return self.extractone(params) - return (ok, data, ipath, eof) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret # Main program: create protocol handler and extractor and run them proto = rclexecm.RclExecM() diff --git a/src/filters/rclepub1 b/src/filters/rclepub1 index 125da61b..6ecb9c82 100755 --- a/src/filters/rclepub1 +++ b/src/filters/rclepub1 @@ -7,6 +7,7 @@ import os import re import rclexecm +from rclbasehandler import RclBaseHandler sys.path.append(sys.path[0]+"/recollepub.zip") try: @@ -15,14 +16,12 @@ except: print("RECFILTERROR HELPERNOTFOUND python:epub") sys.exit(1); -class rclEPUB: +class EPUBConcatExtractor(RclBaseHandler): """RclExecM slave worker for extracting all text from an EPUB file. This version concatenates all nodes.""" def __init__(self, em): - self.em = em - self.em.setmimetype("text/html") - self.currentindex = 0 + super(EPUBConcatExtractor, self).__init__(em) def _header(self): meta = self.book.opf.metadata @@ -46,10 +45,12 @@ class rclEPUB: return data - def extractone(self, params): + def html_text(self, fn): """Extract EPUB data as concatenated HTML""" - ok = True + f = open(fn, 'rb') + self.book = epub.open_epub(f) + data = self._header() ids = [] if self.book.opf.spine: @@ -72,36 +73,8 @@ class rclEPUB: data += doc data += b'' - if ok: - return (ok, data, "", rclexecm.RclExecM.eofnext) - else: - return (ok, "", "", rclexecm.RclExecM.eofnow) - - def openfile(self, params): - """Open the EPUB file""" - self.currentindex = 0 - if not "filename:" in params: - self.em.rclog("openfile: no file name") - return (ok, "", "", rclexecm.RclExecM.eofnow) - - try: - self.book = epub.open_epub(params["filename:"].decode('UTF-8')) - except Exception as err: - self.em.rclog("openfile: epub.open failed: [%s]" % err) - return False - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret + return data proto = rclexecm.RclExecM() -extract = rclEPUB(proto) +extract = EPUBConcatExtractor(proto) rclexecm.main(proto, extract) diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index 21f54d5d..8b867bbe 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -388,7 +388,7 @@ def main(proto, extract): params = {'filename:': makebytes(args[0])} # Some filters (e.g. rclaudio) need/get a MIME type from the indexer - mimetype = mimetype_with_xdg(args[0]) + mimetype = mimetype_with_file(args[0]) params['mimetype:'] = mimetype if not extract.openfile(params): diff --git a/src/filters/rclopxml.py b/src/filters/rclopxml.py index 277d0a03..7c939d77 100755 --- a/src/filters/rclopxml.py +++ b/src/filters/rclopxml.py @@ -18,10 +18,11 @@ from __future__ import print_function import sys -import rclexecm -import rclxslt -import fnmatch from zipfile import ZipFile +import fnmatch +import rclexecm +from rclbasehandler import RclBaseHandler +import rclxslt meta_stylesheet = ''' ''' -class OXExtractor: +class OXExtractor(RclBaseHandler): def __init__(self, em): - self.em = em - self.currentindex = 0 + super(OXExtractor, self).__init__(em) + # Replace values inside data style sheet, depending on type of doc def computestylesheet(self, nm): @@ -145,18 +146,11 @@ class OXExtractor: return stylesheet - def extractone(self, params): - if "filename:" not in params: - self.em.rclog("extractone: no mime or file name") - return (False, "", "", rclexecm.RclExecM.eofnow) - fn = params["filename:"] - try: - f = open(fn, 'rb') - zip = ZipFile(f) - except Exception as err: - self.em.rclog("unzip failed: " + str(err)) - return (False, "", "", rclexecm.RclExecM.eofnow) + def html_text(self, fn): + + f = open(fn, 'rb') + zip = ZipFile(f) docdata = b'' @@ -166,9 +160,6 @@ class OXExtractor: res = rclxslt.apply_sheet_data(meta_stylesheet, metadata) docdata += res except Exception as err: - # To be checked. I'm under the impression that I get this when - # nothing matches? - self.em.rclog("no/bad metadata in %s: %s" % (fn, err)) pass docdata += b'' @@ -200,25 +191,9 @@ class OXExtractor: docdata += b'' - return (True, docdata, "", rclexecm.RclExecM.eofnext) + return docdata - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret - if __name__ == '__main__': proto = rclexecm.RclExecM() extract = OXExtractor(proto) diff --git a/src/filters/rclsoff.py b/src/filters/rclsoff.py index f3b3cdd5..b2339ba5 100755 --- a/src/filters/rclsoff.py +++ b/src/filters/rclsoff.py @@ -21,6 +21,7 @@ from __future__ import print_function import sys import rclexecm import rclxslt +from rclbasehandler import RclBaseHandler from zipfile import ZipFile stylesheet_meta = ''' @@ -126,26 +127,21 @@ stylesheet_content = ''' ''' -class OOExtractor: +class OOExtractor(RclBaseHandler): def __init__(self, em): - self.em = em - self.currentindex = 0 + super(OOExtractor, self).__init__(em) - def extractone(self, params): - if "filename:" not in params: - self.em.rclog("extractone: no mime or file name") - return (False, "", "", rclexecm.RclExecM.eofnow) - fn = params["filename:"] - try: - zip = ZipFile(fn.decode('UTF-8')) - except Exception as err: - self.em.rclog("unzip failed: %s" % err) - return (False, "", "", rclexecm.RclExecM.eofnow) + def html_text(self, fn): + + f = open(fn, 'rb') + zip = ZipFile(f) docdata = b'\n\n' + # Wrap metadata extraction because it can sometimes throw + # while the main text will be valid try: metadata = zip.read("meta.xml") if metadata: @@ -159,33 +155,14 @@ class OOExtractor: docdata += b'\n\n' - try: - content = zip.read("content.xml") - if content: - res = rclxslt.apply_sheet_data(stylesheet_content, content) - docdata += res - docdata += b'' - except Exception as err: - self.em.rclog("bad data in %s: %s" % (fn, err)) - return (False, "", "", rclexecm.RclExecM.eofnow) + content = zip.read("content.xml") + if content: + res = rclxslt.apply_sheet_data(stylesheet_content, content) + docdata += res + docdata += b'' - return (True, docdata, "", rclexecm.RclExecM.eofnext) + return docdata - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - def getipath(self, params): - return self.extractone(params) - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret if __name__ == '__main__': proto = rclexecm.RclExecM()