Simplified a number of handlers by deriving them from RclBaseHandler

This commit is contained in:
Jean-Francois Dockes 2018-11-09 15:41:14 +01:00
parent e1a937f608
commit 61ee8acbc2
8 changed files with 110 additions and 241 deletions

View File

@ -1,10 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Audio tag filter for Recoll, using mutagen # Audio tag extractor for Recoll, using mutagen
import sys import sys
import os import os
import rclexecm import rclexecm
from rclbasehandler import RclBaseHandler
import time import time
import datetime import datetime
import re import re
@ -174,10 +175,11 @@ def tobytes(s):
# mp3: album, title, artist, genre, date, tracknumber # mp3: album, title, artist, genre, date, tracknumber
# flac: album, title, artist, genre, xxx, tracknumber # flac: album, title, artist, genre, xxx, tracknumber
# oggvorbis:album, title, artist, genre, date, tracknumber # oggvorbis:album, title, artist, genre, date, tracknumber
class AudioTagExtractor: class AudioTagExtractor(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(AudioTagExtractor, self).__init__(em)
self.currentindex = 0
def _showMutaInfo(self, mutf): def _showMutaInfo(self, mutf):
self.em.rclog("%s" % mutf.info.pprint()) self.em.rclog("%s" % mutf.info.pprint())
@ -186,10 +188,6 @@ class AudioTagExtractor:
(prop, getattr( mutf.info, prop))) (prop, getattr( mutf.info, prop)))
def _printableFilename(self):
return self.filename.decode('utf-8', errors='replace')
def _embeddedImageFormat(self, mutf): def _embeddedImageFormat(self, mutf):
#self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime) #self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
if 'audio/mp3' in mutf.mime: if 'audio/mp3' in mutf.mime:
@ -232,23 +230,15 @@ class AudioTagExtractor:
val = time.mktime(pdt.timetuple()) val = time.mktime(pdt.timetuple())
return "%d" % val return "%d" % val
def extractone(self, params): def html_text(self, filename):
#self.em.rclog("extractone %s %s" % (params["filename:"],
# params["mimetype:"])) if not self.inputmimetype:
docdata = "" raise Exception("html_text: input MIME type not set")
ok = False mimetype = self.inputmimetype
if not "mimetype:" in params or not "filename:" in params:
self.em.rclog("extractone: no mime or file name") mutf = File(filename)
return (ok, docdata, "", rclexecm.RclExecM.eofnow) if not mutf:
filename = params["filename:"] raise Exception("mutagen failed opening %s" % filename)
mimetype = params["mimetype:"]
self.filename = filename
#self.em.rclog("%s" % filename)
try:
mutf = File(filename)
except Exception as err:
self.em.rclog("extractone: extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
#self._showMutaInfo(mutf) #self._showMutaInfo(mutf)
@ -361,29 +351,10 @@ class AudioTagExtractor:
try: try:
docdata = tobytes(mutf.pprint()) docdata = tobytes(mutf.pprint())
except Exception as err: except Exception as err:
docdata = ""
self.em.rclog("Doc pprint error: %s" % err) self.em.rclog("Doc pprint error: %s" % err)
ok = True return docdata
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
def makeObject(): def makeObject():

View File

@ -25,22 +25,36 @@ import sys
import rclexecm import rclexecm
class RclBaseHandler(object): class RclBaseHandler(object):
'''Base Object for simple extractors.
This implements the boilerplate code for simple extractors for
file types with a single document. The derived class would
typically need only to implement the html_text method to return
the document text in HTML format'''
def __init__(self, em): def __init__(self, em):
self.em = em self.em = em
def extractone(self, params): def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \ #self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \
#params["mimetype:"])) # params["mimetype:"]))
if not "filename:" in params: if not "filename:" in params:
self.em.rclog("extractone: no file name") self.em.rclog("extractone: no file name")
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"] fn = params["filename:"]
if "mimetype:" in params:
self.inputmimetype = params["mimetype:"]
else:
self.inputmimetype = None
try: try:
html = self.html_text(fn) html = self.html_text(fn)
except Exception as err: except Exception as err:
self.em.rclog("RclBaseDumper: %s : %s" % (fn, err)) import traceback
traceback.print_exc()
self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
self.em.setmimetype('text/html') self.em.setmimetype('text/html')
@ -52,9 +66,11 @@ class RclBaseHandler(object):
self.currentindex = 0 self.currentindex = 0
return True return True
def getipath(self, params): def getipath(self, params):
return self.extractone(params) return self.extractone(params)
def getnext(self, params): def getnext(self, params):
if self.currentindex >= 1: if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)

View File

@ -18,6 +18,7 @@ from __future__ import print_function
# Small fixes from jfd: dia files are sometimes not compressed. # Small fixes from jfd: dia files are sometimes not compressed.
import rclexecm import rclexecm
from rclbasehandler import RclBaseHandler
import re import re
from gzip import GzipFile from gzip import GzipFile
import xml.parsers.expat import xml.parsers.expat
@ -58,54 +59,32 @@ class Parser:
self._parser.ParseFile(fh) self._parser.ParseFile(fh)
del self._parser del self._parser
class DiaExtractor: class DiaExtractor(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(DiaExtractor, self).__init__(em)
def extractdia(self):
docdata = ""
ipath = ""
try:
docdata = self.ExtractDiaText()
ok = True
except Exception as err:
self.em.rclog("Dia parse failed: %s"%err)
ok = False
iseof = rclexecm.RclExecM.eofnext
self.em.setmimetype("text/plain")
return (ok, docdata, ipath, iseof)
###### File type handler api, used by rclexecm ----------> def html_text(self, fn):
def openfile(self, params):
try: try:
self.dia = GzipFile(params["filename:"], 'rb') dia = GzipFile(fn, 'rb')
# Dia files are sometimes not compressed. Quite weirdly, # Dia files are sometimes not compressed. Quite weirdly,
# GzipFile does not complain until we try to read. Have to do it # GzipFile does not complain until we try to read.
# here to be able to retry an uncompressed open. data = dia.readline()
data = self.dia.readline() dia.seek(0)
self.dia.seek(0)
return True
except: except:
# File not compressed ? # File not compressed ?
try: dia = open(fn, 'rb')
self.dia = open(params["filename:"], 'rb')
except:
return False
return True
def getipath(self, params):
ok, data, ipath, eof = self.extractdia()
return (ok, data, ipath, eof)
def getnext(self, params):
ok, data, ipath, eof = self.extractdia()
return (ok, data, ipath, eof)
###### read data
def ExtractDiaText(self):
diap = Parser(self.em) diap = Parser(self.em)
diap.feed(self.dia) diap.feed(dia)
return '\n'.join(diap.string)
html = '<html><head><title></title></head><body><pre>'
html += self.em.htmlescape('\n'.join(diap.string))
html += '</pre></body></html>'
return html
# Main program: create protocol handler and extractor and run them # Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()

View File

@ -24,22 +24,28 @@ import sys
import re import re
import rclexecm import rclexecm
import subprocess import subprocess
from rclbasehandler import RclBaseHandler
class DJVUExtractor(RclBaseHandler):
class DJVUExtractor:
def __init__(self, em): def __init__(self, em):
self.currentindex = 0 super(DJVUExtractor, self).__init__(em)
self.djvused = None self.djvutxt = rclexecm.which("djvutxt")
self.djvutxt = None if not self.djvutxt:
self.em = em print("RECFILTERROR HELPERNOTFOUND djvutxt")
sys.exit(1);
self.djvused = rclexecm.which("djvused")
def extractone(self, params):
def html_text(self, fn):
self.em.setmimetype('text/html') self.em.setmimetype('text/html')
# Extract metadata # Extract metadata
metadata = b""
if self.djvused: if self.djvused:
try: try:
metadata = subprocess.check_output([self.djvused, self.filename, metadata = subprocess.check_output(
"-e", "select 1;print-meta"]) [self.djvused, fn, "-e", "select 1;print-meta"])
except Exception as e: except Exception as e:
self.em.rclog("djvused failed: %s" % e) self.em.rclog("djvused failed: %s" % e)
author = "" author = ""
@ -55,14 +61,12 @@ class DJVUExtractor:
title = ' '.join(line[1:]) title = ' '.join(line[1:])
# Main text # Main text
try: txtdata = subprocess.check_output([self.djvutxt, "-escape", fn])
txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
except Exception as e:
self.em.rclog("djvused failed: %s" % e)
return (False, "", "", rclexecm.RclExecM.eofnow)
txtdata = txtdata.decode('UTF-8', 'replace') txtdata = txtdata.decode('UTF-8', 'replace')
data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>''' data = '''<html><head>'''
data += '''<title>''' + self.em.htmlescape(title) + '''</title>'''
data += '''<meta http-equiv="Content-Type" ''' data += '''<meta http-equiv="Content-Type" '''
data += '''content="text/html;charset=UTF-8">''' data += '''content="text/html;charset=UTF-8">'''
if author: if author:
@ -72,34 +76,8 @@ class DJVUExtractor:
data += self.em.htmlescape(txtdata) data += self.em.htmlescape(txtdata)
data += '''</pre></body></html>''' data += '''</pre></body></html>'''
return (True, data, "", rclexecm.RclExecM.eofnext) return data
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.filename = params["filename:"]
self.currentindex = 0
#self.em.rclog("openfile: [%s]" % self.filename)
if not self.djvutxt:
self.djvutxt = rclexecm.which("djvutxt")
if not self.djvutxt:
print("RECFILTERROR HELPERNOTFOUND djvutxt")
sys.exit(1);
self.djvused = rclexecm.which("djvused")
return True
def getipath(self, params):
return self.extractone(params)
return (ok, data, ipath, eof)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
# Main program: create protocol handler and extractor and run them # Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()

View File

@ -7,6 +7,7 @@ import os
import re import re
import rclexecm import rclexecm
from rclbasehandler import RclBaseHandler
sys.path.append(sys.path[0]+"/recollepub.zip") sys.path.append(sys.path[0]+"/recollepub.zip")
try: try:
@ -15,14 +16,12 @@ except:
print("RECFILTERROR HELPERNOTFOUND python:epub") print("RECFILTERROR HELPERNOTFOUND python:epub")
sys.exit(1); sys.exit(1);
class rclEPUB: class EPUBConcatExtractor(RclBaseHandler):
"""RclExecM slave worker for extracting all text from an EPUB """RclExecM slave worker for extracting all text from an EPUB
file. This version concatenates all nodes.""" file. This version concatenates all nodes."""
def __init__(self, em): def __init__(self, em):
self.em = em super(EPUBConcatExtractor, self).__init__(em)
self.em.setmimetype("text/html")
self.currentindex = 0
def _header(self): def _header(self):
meta = self.book.opf.metadata meta = self.book.opf.metadata
@ -46,10 +45,12 @@ class rclEPUB:
return data return data
def extractone(self, params): def html_text(self, fn):
"""Extract EPUB data as concatenated HTML""" """Extract EPUB data as concatenated HTML"""
ok = True f = open(fn, 'rb')
self.book = epub.open_epub(f)
data = self._header() data = self._header()
ids = [] ids = []
if self.book.opf.spine: if self.book.opf.spine:
@ -72,36 +73,8 @@ class rclEPUB:
data += doc data += doc
data += b'</body></html>' data += b'</body></html>'
if ok: return data
return (ok, data, "", rclexecm.RclExecM.eofnext)
else:
return (ok, "", "", rclexecm.RclExecM.eofnow)
def openfile(self, params):
"""Open the EPUB file"""
self.currentindex = 0
if not "filename:" in params:
self.em.rclog("openfile: no file name")
return (ok, "", "", rclexecm.RclExecM.eofnow)
try:
self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
except Exception as err:
self.em.rclog("openfile: epub.open failed: [%s]" % err)
return False
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()
extract = rclEPUB(proto) extract = EPUBConcatExtractor(proto)
rclexecm.main(proto, extract) rclexecm.main(proto, extract)

View File

@ -388,7 +388,7 @@ def main(proto, extract):
params = {'filename:': makebytes(args[0])} params = {'filename:': makebytes(args[0])}
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer # Some filters (e.g. rclaudio) need/get a MIME type from the indexer
mimetype = mimetype_with_xdg(args[0]) mimetype = mimetype_with_file(args[0])
params['mimetype:'] = mimetype params['mimetype:'] = mimetype
if not extract.openfile(params): if not extract.openfile(params):

View File

@ -18,10 +18,11 @@
from __future__ import print_function from __future__ import print_function
import sys import sys
import rclexecm
import rclxslt
import fnmatch
from zipfile import ZipFile from zipfile import ZipFile
import fnmatch
import rclexecm
from rclbasehandler import RclBaseHandler
import rclxslt
meta_stylesheet = '''<?xml version="1.0"?> meta_stylesheet = '''<?xml version="1.0"?>
<xsl:stylesheet <xsl:stylesheet
@ -129,10 +130,10 @@ content_stylesheet = '''<?xml version="1.0"?>
</xsl:stylesheet> </xsl:stylesheet>
''' '''
class OXExtractor: class OXExtractor(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(OXExtractor, self).__init__(em)
self.currentindex = 0
# Replace values inside data style sheet, depending on type of doc # Replace values inside data style sheet, depending on type of doc
def computestylesheet(self, nm): def computestylesheet(self, nm):
@ -145,18 +146,11 @@ class OXExtractor:
return stylesheet return stylesheet
def extractone(self, params):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try: def html_text(self, fn):
f = open(fn, 'rb')
zip = ZipFile(f) f = open(fn, 'rb')
except Exception as err: zip = ZipFile(f)
self.em.rclog("unzip failed: " + str(err))
return (False, "", "", rclexecm.RclExecM.eofnow)
docdata = b'<html><head>' docdata = b'<html><head>'
@ -166,9 +160,6 @@ class OXExtractor:
res = rclxslt.apply_sheet_data(meta_stylesheet, metadata) res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
docdata += res docdata += res
except Exception as err: except Exception as err:
# To be checked. I'm under the impression that I get this when
# nothing matches?
self.em.rclog("no/bad metadata in %s: %s" % (fn, err))
pass pass
docdata += b'</head><body>' docdata += b'</head><body>'
@ -200,25 +191,9 @@ class OXExtractor:
docdata += b'</body></html>' docdata += b'</body></html>'
return (True, docdata, "", rclexecm.RclExecM.eofnext) return docdata
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__': if __name__ == '__main__':
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()
extract = OXExtractor(proto) extract = OXExtractor(proto)

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import sys import sys
import rclexecm import rclexecm
import rclxslt import rclxslt
from rclbasehandler import RclBaseHandler
from zipfile import ZipFile from zipfile import ZipFile
stylesheet_meta = '''<?xml version="1.0"?> stylesheet_meta = '''<?xml version="1.0"?>
@ -126,26 +127,21 @@ stylesheet_content = '''<?xml version="1.0"?>
</xsl:stylesheet> </xsl:stylesheet>
''' '''
class OOExtractor: class OOExtractor(RclBaseHandler):
def __init__(self, em): def __init__(self, em):
self.em = em super(OOExtractor, self).__init__(em)
self.currentindex = 0
def extractone(self, params):
if "filename:" not in params:
self.em.rclog("extractone: no mime or file name")
return (False, "", "", rclexecm.RclExecM.eofnow)
fn = params["filename:"]
try: def html_text(self, fn):
zip = ZipFile(fn.decode('UTF-8'))
except Exception as err: f = open(fn, 'rb')
self.em.rclog("unzip failed: %s" % err) zip = ZipFile(f)
return (False, "", "", rclexecm.RclExecM.eofnow)
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \ docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
b'content="text/html; charset=UTF-8">' b'content="text/html; charset=UTF-8">'
# Wrap metadata extraction because it can sometimes throw
# while the main text will be valid
try: try:
metadata = zip.read("meta.xml") metadata = zip.read("meta.xml")
if metadata: if metadata:
@ -159,33 +155,14 @@ class OOExtractor:
docdata += b'</head>\n<body>\n' docdata += b'</head>\n<body>\n'
try: content = zip.read("content.xml")
content = zip.read("content.xml") if content:
if content: res = rclxslt.apply_sheet_data(stylesheet_content, content)
res = rclxslt.apply_sheet_data(stylesheet_content, content) docdata += res
docdata += res docdata += b'</body></html>'
docdata += b'</body></html>'
except Exception as err:
self.em.rclog("bad data in %s: %s" % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext) return docdata
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = 0
return True
def getipath(self, params):
return self.extractone(params)
def getnext(self, params):
if self.currentindex >= 1:
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret= self.extractone(params)
self.currentindex += 1
return ret
if __name__ == '__main__': if __name__ == '__main__':
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()