Simplified a number of handlers by deriving them from RclBaseHandler

This commit is contained in:
Jean-Francois Dockes 2018-11-09 15:41:14 +01:00
parent e1a937f608
commit 61ee8acbc2
8 changed files with 110 additions and 241 deletions

View File

@ -1,10 +1,11 @@
#!/usr/bin/env python3
# Audio tag filter for Recoll, using mutagen
# Audio tag extractor for Recoll, using mutagen
import sys
import os
import rclexecm
from rclbasehandler import RclBaseHandler
import time
import datetime
import re
@ -174,10 +175,11 @@ def tobytes(s):
# mp3: album, title, artist, genre, date, tracknumber
# flac: album, title, artist, genre, xxx, tracknumber
# oggvorbis:album, title, artist, genre, date, tracknumber
class AudioTagExtractor:
class AudioTagExtractor(RclBaseHandler):
def __init__(self, em):
self.em = em
self.currentindex = 0
super(AudioTagExtractor, self).__init__(em)
def _showMutaInfo(self, mutf):
self.em.rclog("%s" % mutf.info.pprint())
@ -186,10 +188,6 @@ class AudioTagExtractor:
(prop, getattr( mutf.info, prop)))
def _printableFilename(self):
return self.filename.decode('utf-8', errors='replace')
def _embeddedImageFormat(self, mutf):
#self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
if 'audio/mp3' in mutf.mime:
@ -232,24 +230,16 @@ class AudioTagExtractor:
val = time.mktime(pdt.timetuple())
return "%d" % val
def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"],
# params["mimetype:"]))
docdata = ""
ok = False
if not "mimetype:" in params or not "filename:" in params:
self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
mimetype = params["mimetype:"]
self.filename = filename
#self.em.rclog("%s" % filename)
try:
mutf = File(filename)
except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
def html_text(self, filename):
if not self.inputmimetype:
raise Exception("html_text: input MIME type not set")
mimetype = self.inputmimetype
mutf = File(filename)
if not mutf:
raise Exception("mutagen failed opening %s" % filename)
#self._showMutaInfo(mutf)
###################
@ -361,29 +351,10 @@ class AudioTagExtractor:
try:
docdata = tobytes(mutf.pprint())
except Exception as err:
docdata = ""
self.em.rclog("Doc pprint error: %s" % err)
ok = True
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
return docdata
def makeObject():

View File

@ -25,22 +25,36 @@ import sys
import rclexecm
class RclBaseHandler(object):
'''Base Object for simple extractors.
This implements the boilerplate code for simple extractors for
file types with a single document. The derived class would
typically need only to implement the html_text method to return
the document text in HTML format'''
def __init__(self, em):
self.em = em
def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \
#params["mimetype:"]))
#self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \
# params["mimetype:"]))
if not "filename:" in params:
self.em.rclog("extractone: no file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
if "mimetype:" in params:
self.inputmimetype = params["mimetype:"]
else:
self.inputmimetype = None
try:
html = self.html_text(fn)
except Exception as err:
self.em.rclog("RclBaseDumper: %s : %s" % (fn, err))
import traceback
traceback.print_exc()
self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
self.em.setmimetype('text/html')
@ -52,9 +66,11 @@ class RclBaseHandler(object):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)

View File

@ -18,6 +18,7 @@ from __future__ import print_function
# Small fixes from jfd: dia files are sometimes not compressed.
import rclexecm
from rclbasehandler import RclBaseHandler
import re
from gzip import GzipFile
import xml.parsers.expat
@ -58,54 +59,32 @@ class Parser:
self._parser.ParseFile(fh)
del self._parser
class DiaExtractor:
class DiaExtractor(RclBaseHandler):
def __init__(self, em):
self.em = em
super(DiaExtractor, self).__init__(em)
def extractdia(self):
docdata = ""
ipath = ""
try:
docdata = self.ExtractDiaText()
ok = True
except Exception as err:
self.em.rclog("Dia parse failed: %s"%err)
ok = False
iseof = rclexecm.RclExecM.eofnext
self.em.setmimetype("text/plain")
return (ok, docdata, ipath, iseof)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
def html_text(self, fn):
try:
self.dia = GzipFile(params["filename:"], 'rb')
dia = GzipFile(fn, 'rb')
# Dia files are sometimes not compressed. Quite weirdly,
# GzipFile does not complain until we try to read. Have to do it
# here to be able to retry an uncompressed open.
data = self.dia.readline()
self.dia.seek(0)
return True
# GzipFile does not complain until we try to read.
data = dia.readline()
dia.seek(0)
except:
# File not compressed? Retry with a plain (uncompressed) open.
try:
self.dia = open(params["filename:"], 'rb')
except:
return False
return True
dia = open(fn, 'rb')
def getipath(self, params):
ok, data, ipath, eof = self.extractdia()
return (ok, data, ipath, eof)
def getnext(self, params):
ok, data, ipath, eof = self.extractdia()
return (ok, data, ipath, eof)
###### read data
def ExtractDiaText(self):
diap = Parser(self.em)
diap.feed(self.dia)
return '\n'.join(diap.string)
diap.feed(dia)
html = '<html><head><title></title></head><body><pre>'
html += self.em.htmlescape('\n'.join(diap.string))
html += '</pre></body></html>'
return html
# Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM()

View File

@ -24,22 +24,28 @@ import sys
import re
import rclexecm
import subprocess
from rclbasehandler import RclBaseHandler
class DJVUExtractor(RclBaseHandler):
class DJVUExtractor:
def __init__(self, em):
self.currentindex = 0
self.djvused = None
self.djvutxt = None
self.em = em
super(DJVUExtractor, self).__init__(em)
self.djvutxt = rclexecm.which("djvutxt")
if not self.djvutxt:
print("RECFILTERROR HELPERNOTFOUND djvutxt")
sys.exit(1);
self.djvused = rclexecm.which("djvused")
def extractone(self, params):
def html_text(self, fn):
self.em.setmimetype('text/html')
# Extract metadata
metadata = b""
if self.djvused:
try:
metadata = subprocess.check_output([self.djvused, self.filename,
"-e", "select 1;print-meta"])
metadata = subprocess.check_output(
[self.djvused, fn, "-e", "select 1;print-meta"])
except Exception as e:
self.em.rclog("djvused failed: %s" % e)
author = ""
@ -55,14 +61,12 @@ class DJVUExtractor:
title = ' '.join(line[1:])
# Main text
try:
txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
except Exception as e:
self.em.rclog("djvused failed: %s" % e)
return (False, "", "", rclexecm.RclExecM.eofnow)
txtdata = subprocess.check_output([self.djvutxt, "-escape", fn])
txtdata = txtdata.decode('UTF-8', 'replace')
data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>'''
data = '''<html><head>'''
data += '''<title>''' + self.em.htmlescape(title) + '''</title>'''
data += '''<meta http-equiv="Content-Type" '''
data += '''content="text/html;charset=UTF-8">'''
if author:
@ -72,34 +76,8 @@ class DJVUExtractor:
data += self.em.htmlescape(txtdata)
data += '''</pre></body></html>'''
return (True, data, "", rclexecm.RclExecM.eofnext)
return data
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.filename = params["filename:"]
self.currentindex = 0
#self.em.rclog("openfile: [%s]" % self.filename)
if not self.djvutxt:
self.djvutxt = rclexecm.which("djvutxt")
if not self.djvutxt:
print("RECFILTERROR HELPERNOTFOUND djvutxt")
sys.exit(1);
self.djvused = rclexecm.which("djvused")
return True
def getipath(self, params):
return self.extractone(params)
return (ok, data, ipath, eof)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
# Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM()

View File

@ -7,6 +7,7 @@ import os
import re
import rclexecm
from rclbasehandler import RclBaseHandler
sys.path.append(sys.path[0]+"/recollepub.zip")
try:
@ -15,14 +16,12 @@ except:
print("RECFILTERROR HELPERNOTFOUND python:epub")
sys.exit(1);
class rclEPUB:
class EPUBConcatExtractor(RclBaseHandler):
"""RclExecM slave worker for extracting all text from an EPUB
file. This version concatenates all nodes."""
def __init__(self, em):
self.em = em
self.em.setmimetype("text/html")
self.currentindex = 0
super(EPUBConcatExtractor, self).__init__(em)
def _header(self):
meta = self.book.opf.metadata
@ -46,10 +45,12 @@ class rclEPUB:
return data
def extractone(self, params):
def html_text(self, fn):
"""Extract EPUB data as concatenated HTML"""
ok = True
f = open(fn, 'rb')
self.book = epub.open_epub(f)
data = self._header()
ids = []
if self.book.opf.spine:
@ -72,36 +73,8 @@ class rclEPUB:
data += doc
data += b'</body></html>'
if ok:
return (ok, data, "", rclexecm.RclExecM.eofnext)
else:
return (ok, "", "", rclexecm.RclExecM.eofnow)
def openfile(self, params):
"""Open the EPUB file"""
self.currentindex = 0
if not "filename:" in params:
self.em.rclog("openfile: no file name")
return (ok, "", "", rclexecm.RclExecM.eofnow)
try:
self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
except Exception as err:
self.em.rclog("openfile: epub.open failed: [%s]" % err)
return False
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
return data
proto = rclexecm.RclExecM()
extract = rclEPUB(proto)
extract = EPUBConcatExtractor(proto)
rclexecm.main(proto, extract)

View File

@ -388,7 +388,7 @@ def main(proto, extract):
params = {'filename:': makebytes(args[0])}
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
mimetype = mimetype_with_xdg(args[0])
mimetype = mimetype_with_file(args[0])
params['mimetype:'] = mimetype
if not extract.openfile(params):

View File

@ -18,10 +18,11 @@
from __future__ import print_function
import sys
import rclexecm
import rclxslt
import fnmatch
from zipfile import ZipFile
import fnmatch
import rclexecm
from rclbasehandler import RclBaseHandler
import rclxslt
meta_stylesheet = '''<?xml version="1.0"?>
<xsl:stylesheet
@ -129,10 +130,10 @@ content_stylesheet = '''<?xml version="1.0"?>
</xsl:stylesheet>
'''
class OXExtractor:
class OXExtractor(RclBaseHandler):
def __init__(self, em):
self.em = em
self.currentindex = 0
super(OXExtractor, self).__init__(em)
# Replace values inside data style sheet, depending on type of doc
def computestylesheet(self, nm):
@ -145,18 +146,11 @@ class OXExtractor:
return stylesheet
def extractone(self, params):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try:
f = open(fn, 'rb')
zip = ZipFile(f)
except Exception as err:
self.em.rclog("unzip failed: " + str(err))
return (False, "", "", rclexecm.RclExecM.eofnow)
def html_text(self, fn):
f = open(fn, 'rb')
zip = ZipFile(f)
docdata = b'<html><head>'
@ -166,9 +160,6 @@ class OXExtractor:
res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
docdata += res
except Exception as err:
# To be checked. I'm under the impression that I get this when
# nothing matches?
self.em.rclog("no/bad metadata in %s: %s" % (fn, err))
pass
docdata += b'</head><body>'
@ -200,25 +191,9 @@ class OXExtractor:
docdata += b'</body></html>'
return (True, docdata, "", rclexecm.RclExecM.eofnext)
return docdata
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__':
proto = rclexecm.RclExecM()
extract = OXExtractor(proto)

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import sys
import rclexecm
import rclxslt
from rclbasehandler import RclBaseHandler
from zipfile import ZipFile
stylesheet_meta = '''<?xml version="1.0"?>
@ -126,26 +127,21 @@ stylesheet_content = '''<?xml version="1.0"?>
</xsl:stylesheet>
'''
class OOExtractor:
class OOExtractor(RclBaseHandler):
def __init__(self, em):
self.em = em
self.currentindex = 0
super(OOExtractor, self).__init__(em)
def extractone(self, params):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try:
zip = ZipFile(fn.decode('UTF-8'))
except Exception as err:
self.em.rclog("unzip failed: %s" % err)
return (False, "", "", rclexecm.RclExecM.eofnow)
def html_text(self, fn):
f = open(fn, 'rb')
zip = ZipFile(f)
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
b'content="text/html; charset=UTF-8">'
# Wrap metadata extraction in its own try block: it can sometimes raise
# even though the main text is still valid
try:
metadata = zip.read("meta.xml")
if metadata:
@ -159,33 +155,14 @@ class OOExtractor:
docdata += b'</head>\n<body>\n'
try:
content = zip.read("content.xml")
if content:
res = rclxslt.apply_sheet_data(stylesheet_content, content)
docdata += res
docdata += b'</body></html>'
except Exception as err:
self.em.rclog("bad data in %s: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
content = zip.read("content.xml")
if content:
res = rclxslt.apply_sheet_data(stylesheet_content, content)
docdata += res
docdata += b'</body></html>'
return (True, docdata, "", rclexecm.RclExecM.eofnext)
return docdata
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__':
proto = rclexecm.RclExecM()