diff --git a/src/filters/rclaudio b/src/filters/rclaudio index fcca997b..39bb55d4 100755 --- a/src/filters/rclaudio +++ b/src/filters/rclaudio @@ -1,10 +1,11 @@ #!/usr/bin/env python3 -# Audio tag filter for Recoll, using mutagen +# Audio tag extractor for Recoll, using mutagen import sys import os import rclexecm +from rclbasehandler import RclBaseHandler import time import datetime import re @@ -174,10 +175,11 @@ def tobytes(s): # mp3: album, title, artist, genre, date, tracknumber # flac: album, title, artist, genre, xxx, tracknumber # oggvorbis:album, title, artist, genre, date, tracknumber -class AudioTagExtractor: +class AudioTagExtractor(RclBaseHandler): + def __init__(self, em): - self.em = em - self.currentindex = 0 + super(AudioTagExtractor, self).__init__(em) + def _showMutaInfo(self, mutf): self.em.rclog("%s" % mutf.info.pprint()) @@ -186,10 +188,6 @@ class AudioTagExtractor: (prop, getattr( mutf.info, prop))) - def _printableFilename(self): - return self.filename.decode('utf-8', errors='replace') - - def _embeddedImageFormat(self, mutf): #self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime) if 'audio/mp3' in mutf.mime: @@ -232,24 +230,16 @@ class AudioTagExtractor: val = time.mktime(pdt.timetuple()) return "%d" % val - def extractone(self, params): - #self.em.rclog("extractone %s %s" % (params["filename:"], - # params["mimetype:"])) - docdata = "" - ok = False - if not "mimetype:" in params or not "filename:" in params: - self.em.rclog("extractone: no mime or file name") - return (ok, docdata, "", rclexecm.RclExecM.eofnow) - filename = params["filename:"] - mimetype = params["mimetype:"] - self.filename = filename - #self.em.rclog("%s" % filename) - try: - mutf = File(filename) - except Exception as err: - self.em.rclog("extractone: extract failed: [%s]" % err) - return (ok, docdata, "", rclexecm.RclExecM.eofnow) + def html_text(self, filename): + if not self.inputmimetype: + raise Exception("html_text: input MIME type not set") + mimetype = 
self.inputmimetype + + mutf = File(filename) + if not mutf: + raise Exception("mutagen failed opening %s" % filename) + #self._showMutaInfo(mutf) ################### @@ -361,29 +351,10 @@ class AudioTagExtractor: try: docdata = tobytes(mutf.pprint()) except Exception as err: + docdata = "" self.em.rclog("Doc pprint error: %s" % err) - ok = True - return (ok, docdata, "", rclexecm.RclExecM.eofnext) - - - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): - self.currentindex = 0 - return True - - - def getipath(self, params): - return self.extractone(params) - - - def getnext(self, params): - if self.currentindex >= 1: - return (False, "", "", rclexecm.RclExecM.eofnow) - else: - ret= self.extractone(params) - self.currentindex += 1 - return ret + return docdata def makeObject(): diff --git a/src/filters/rclbasehandler.py b/src/filters/rclbasehandler.py index 859dc45d..2f1a66f6 100644 --- a/src/filters/rclbasehandler.py +++ b/src/filters/rclbasehandler.py @@ -25,22 +25,36 @@ import sys import rclexecm class RclBaseHandler(object): + '''Base Object for simple extractors. + + This implements the boilerplate code for simple extractors for + file types with a single document. 
The derived class would + typically need only to implement the html_text method to return + the document text in HTML format''' + def __init__(self, em): self.em = em def extractone(self, params): - #self.em.rclog("extractone %s %s" % (params["filename:"], \ - #params["mimetype:"])) + #self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \ + # params["mimetype:"])) if not "filename:" in params: self.em.rclog("extractone: no file name") return (False, "", "", rclexecm.RclExecM.eofnow) fn = params["filename:"] + if "mimetype:" in params: + self.inputmimetype = params["mimetype:"] + else: + self.inputmimetype = None + try: html = self.html_text(fn) except Exception as err: - self.em.rclog("RclBaseDumper: %s : %s" % (fn, err)) + import traceback + traceback.print_exc() + self.em.rclog("RclBaseHandler: %s : %s" % (fn, err)) return (False, "", "", rclexecm.RclExecM.eofnow) self.em.setmimetype('text/html') @@ -52,9 +66,11 @@ class RclBaseHandler(object): self.currentindex = 0 return True + def getipath(self, params): return self.extractone(params) + def getnext(self, params): if self.currentindex >= 1: return (False, "", "", rclexecm.RclExecM.eofnow) diff --git a/src/filters/rcldia b/src/filters/rcldia index 363c9cc2..64209507 100755 --- a/src/filters/rcldia +++ b/src/filters/rcldia @@ -18,6 +18,7 @@ from __future__ import print_function # Small fixes from jfd: dia files are sometimes not compressed. 
import rclexecm +from rclbasehandler import RclBaseHandler import re from gzip import GzipFile import xml.parsers.expat @@ -58,54 +59,32 @@ class Parser: self._parser.ParseFile(fh) del self._parser -class DiaExtractor: +class DiaExtractor(RclBaseHandler): + def __init__(self, em): - self.em = em + super(DiaExtractor, self).__init__(em) - def extractdia(self): - docdata = "" - ipath = "" - try: - docdata = self.ExtractDiaText() - ok = True - except Exception as err: - self.em.rclog("Dia parse failed: %s"%err) - ok = False - iseof = rclexecm.RclExecM.eofnext - self.em.setmimetype("text/plain") - return (ok, docdata, ipath, iseof) - ###### File type handler api, used by rclexecm ----------> - def openfile(self, params): + def html_text(self, fn): try: - self.dia = GzipFile(params["filename:"], 'rb') + dia = GzipFile(fn, 'rb') # Dia files are sometimes not compressed. Quite weirdly, - # GzipFile does not complain until we try to read. Have to do it - # here to be able to retry an uncompressed open. - data = self.dia.readline() - self.dia.seek(0) - return True + # GzipFile does not complain until we try to read. + data = dia.readline() + dia.seek(0) except: # File not compressed ? - try: - self.dia = open(params["filename:"], 'rb') - except: - return False - return True + dia = open(fn, 'rb') - def getipath(self, params): - ok, data, ipath, eof = self.extractdia() - return (ok, data, ipath, eof) - - def getnext(self, params): - ok, data, ipath, eof = self.extractdia() - return (ok, data, ipath, eof) - - ###### read data - def ExtractDiaText(self): diap = Parser(self.em) - diap.feed(self.dia) - return '\n'.join(diap.string) + diap.feed(dia) + + html = '
'
+ html += self.em.htmlescape('\n'.join(diap.string))
+ html += ''
+
+ return html
+
# Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM()
diff --git a/src/filters/rcldjvu.py b/src/filters/rcldjvu.py
index b8015d2f..f02ecbe0 100755
--- a/src/filters/rcldjvu.py
+++ b/src/filters/rcldjvu.py
@@ -24,22 +24,28 @@ import sys
import re
import rclexecm
import subprocess
+from rclbasehandler import RclBaseHandler
+
+class DJVUExtractor(RclBaseHandler):
-class DJVUExtractor:
def __init__(self, em):
- self.currentindex = 0
- self.djvused = None
- self.djvutxt = None
- self.em = em
+ super(DJVUExtractor, self).__init__(em)
+ self.djvutxt = rclexecm.which("djvutxt")  # mandatory: produces the main text
+ if not self.djvutxt:
+ print("RECFILTERROR HELPERNOTFOUND djvutxt")
+ sys.exit(1)
+ self.djvused = rclexecm.which("djvused")  # optional: metadata is skipped when absent
- def extractone(self, params):
+
+ def html_text(self, fn):
self.em.setmimetype('text/html')
# Extract metadata
+ metadata = b""
if self.djvused:
try:
- metadata = subprocess.check_output([self.djvused, self.filename,
- "-e", "select 1;print-meta"])
+ metadata = subprocess.check_output(
+ [self.djvused, fn, "-e", "select 1;print-meta"])
except Exception as e:
self.em.rclog("djvused failed: %s" % e)
author = ""
@@ -55,14 +61,12 @@ class DJVUExtractor:
title = ' '.join(line[1:])
# Main text
- try:
- txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
- except Exception as e:
- self.em.rclog("djvused failed: %s" % e)
- return (False, "", "", rclexecm.RclExecM.eofnow)
+ txtdata = subprocess.check_output([self.djvutxt, "-escape", fn])
+
txtdata = txtdata.decode('UTF-8', 'replace')
- data = '''