Simplified a number of handlers by deriving them from RclBaseHandler
This commit is contained in:
parent
e1a937f608
commit
61ee8acbc2
@ -1,10 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Audio tag filter for Recoll, using mutagen
|
||||
# Audio tag extractor for Recoll, using mutagen
|
||||
|
||||
import sys
|
||||
import os
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
import time
|
||||
import datetime
|
||||
import re
|
||||
@ -174,10 +175,11 @@ def tobytes(s):
|
||||
# mp3: album, title, artist, genre, date, tracknumber
|
||||
# flac: album, title, artist, genre, xxx, tracknumber
|
||||
# oggvorbis:album, title, artist, genre, date, tracknumber
|
||||
class AudioTagExtractor:
|
||||
class AudioTagExtractor(RclBaseHandler):
|
||||
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.currentindex = 0
|
||||
super(AudioTagExtractor, self).__init__(em)
|
||||
|
||||
|
||||
def _showMutaInfo(self, mutf):
|
||||
self.em.rclog("%s" % mutf.info.pprint())
|
||||
@ -186,10 +188,6 @@ class AudioTagExtractor:
|
||||
(prop, getattr( mutf.info, prop)))
|
||||
|
||||
|
||||
def _printableFilename(self):
|
||||
return self.filename.decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
def _embeddedImageFormat(self, mutf):
|
||||
#self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
|
||||
if 'audio/mp3' in mutf.mime:
|
||||
@ -232,24 +230,16 @@ class AudioTagExtractor:
|
||||
val = time.mktime(pdt.timetuple())
|
||||
return "%d" % val
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s %s" % (params["filename:"],
|
||||
# params["mimetype:"]))
|
||||
docdata = ""
|
||||
ok = False
|
||||
if not "mimetype:" in params or not "filename:" in params:
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
filename = params["filename:"]
|
||||
mimetype = params["mimetype:"]
|
||||
self.filename = filename
|
||||
#self.em.rclog("%s" % filename)
|
||||
try:
|
||||
mutf = File(filename)
|
||||
except Exception as err:
|
||||
self.em.rclog("extractone: extract failed: [%s]" % err)
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
def html_text(self, filename):
|
||||
|
||||
if not self.inputmimetype:
|
||||
raise Exception("html_text: input MIME type not set")
|
||||
mimetype = self.inputmimetype
|
||||
|
||||
mutf = File(filename)
|
||||
if not mutf:
|
||||
raise Exception("mutagen failed opening %s" % filename)
|
||||
|
||||
#self._showMutaInfo(mutf)
|
||||
|
||||
###################
|
||||
@ -361,29 +351,10 @@ class AudioTagExtractor:
|
||||
try:
|
||||
docdata = tobytes(mutf.pprint())
|
||||
except Exception as err:
|
||||
docdata = ""
|
||||
self.em.rclog("Doc pprint error: %s" % err)
|
||||
|
||||
ok = True
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
return docdata
|
||||
|
||||
|
||||
def makeObject():
|
||||
|
||||
@ -25,22 +25,36 @@ import sys
|
||||
import rclexecm
|
||||
|
||||
class RclBaseHandler(object):
|
||||
'''Base Object for simple extractors.
|
||||
|
||||
This implements the boilerplate code for simple extractors for
|
||||
file types with a single document. The derived class would
|
||||
typically need only to implement the html_text method to return
|
||||
the document text in HTML format'''
|
||||
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||
#params["mimetype:"]))
|
||||
#self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \
|
||||
# params["mimetype:"]))
|
||||
if not "filename:" in params:
|
||||
self.em.rclog("extractone: no file name")
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
fn = params["filename:"]
|
||||
|
||||
if "mimetype:" in params:
|
||||
self.inputmimetype = params["mimetype:"]
|
||||
else:
|
||||
self.inputmimetype = None
|
||||
|
||||
try:
|
||||
html = self.html_text(fn)
|
||||
except Exception as err:
|
||||
self.em.rclog("RclBaseDumper: %s : %s" % (fn, err))
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
self.em.setmimetype('text/html')
|
||||
@ -52,9 +66,11 @@ class RclBaseHandler(object):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
@ -18,6 +18,7 @@ from __future__ import print_function
|
||||
|
||||
# Small fixes from jfd: dia files are sometimes not compressed.
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
import re
|
||||
from gzip import GzipFile
|
||||
import xml.parsers.expat
|
||||
@ -58,54 +59,32 @@ class Parser:
|
||||
self._parser.ParseFile(fh)
|
||||
del self._parser
|
||||
|
||||
class DiaExtractor:
|
||||
class DiaExtractor(RclBaseHandler):
|
||||
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
super(DiaExtractor, self).__init__(em)
|
||||
|
||||
def extractdia(self):
|
||||
docdata = ""
|
||||
ipath = ""
|
||||
try:
|
||||
docdata = self.ExtractDiaText()
|
||||
ok = True
|
||||
except Exception as err:
|
||||
self.em.rclog("Dia parse failed: %s"%err)
|
||||
ok = False
|
||||
iseof = rclexecm.RclExecM.eofnext
|
||||
self.em.setmimetype("text/plain")
|
||||
return (ok, docdata, ipath, iseof)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
def html_text(self, fn):
|
||||
try:
|
||||
self.dia = GzipFile(params["filename:"], 'rb')
|
||||
dia = GzipFile(fn, 'rb')
|
||||
# Dia files are sometimes not compressed. Quite weirdly,
|
||||
# GzipFile does not complain until we try to read. Have to do it
|
||||
# here to be able to retry an uncompressed open.
|
||||
data = self.dia.readline()
|
||||
self.dia.seek(0)
|
||||
return True
|
||||
# GzipFile does not complain until we try to read.
|
||||
data = dia.readline()
|
||||
dia.seek(0)
|
||||
except:
|
||||
# File not compressed ?
|
||||
try:
|
||||
self.dia = open(params["filename:"], 'rb')
|
||||
except:
|
||||
return False
|
||||
return True
|
||||
dia = open(fn, 'rb')
|
||||
|
||||
def getipath(self, params):
|
||||
ok, data, ipath, eof = self.extractdia()
|
||||
return (ok, data, ipath, eof)
|
||||
|
||||
def getnext(self, params):
|
||||
ok, data, ipath, eof = self.extractdia()
|
||||
return (ok, data, ipath, eof)
|
||||
|
||||
###### read data
|
||||
def ExtractDiaText(self):
|
||||
diap = Parser(self.em)
|
||||
diap.feed(self.dia)
|
||||
return '\n'.join(diap.string)
|
||||
diap.feed(dia)
|
||||
|
||||
html = '<html><head><title></title></head><body><pre>'
|
||||
html += self.em.htmlescape('\n'.join(diap.string))
|
||||
html += '</pre></body></html>'
|
||||
|
||||
return html
|
||||
|
||||
|
||||
# Main program: create protocol handler and extractor and run them
|
||||
proto = rclexecm.RclExecM()
|
||||
|
||||
@ -24,22 +24,28 @@ import sys
|
||||
import re
|
||||
import rclexecm
|
||||
import subprocess
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
class DJVUExtractor(RclBaseHandler):
|
||||
|
||||
class DJVUExtractor:
|
||||
def __init__(self, em):
|
||||
self.currentindex = 0
|
||||
self.djvused = None
|
||||
self.djvutxt = None
|
||||
self.em = em
|
||||
super(DJVUExtractor, self).__init__(em)
|
||||
self.djvutxt = rclexecm.which("djvutxt")
|
||||
if not self.djvutxt:
|
||||
print("RECFILTERROR HELPERNOTFOUND djvutxt")
|
||||
sys.exit(1);
|
||||
self.djvused = rclexecm.which("djvused")
|
||||
|
||||
def extractone(self, params):
|
||||
|
||||
def html_text(self, fn):
|
||||
self.em.setmimetype('text/html')
|
||||
|
||||
# Extract metadata
|
||||
metadata = b""
|
||||
if self.djvused:
|
||||
try:
|
||||
metadata = subprocess.check_output([self.djvused, self.filename,
|
||||
"-e", "select 1;print-meta"])
|
||||
metadata = subprocess.check_output(
|
||||
[self.djvused, fn, "-e", "select 1;print-meta"])
|
||||
except Exception as e:
|
||||
self.em.rclog("djvused failed: %s" % e)
|
||||
author = ""
|
||||
@ -55,14 +61,12 @@ class DJVUExtractor:
|
||||
title = ' '.join(line[1:])
|
||||
|
||||
# Main text
|
||||
try:
|
||||
txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
|
||||
except Exception as e:
|
||||
self.em.rclog("djvused failed: %s" % e)
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
txtdata = subprocess.check_output([self.djvutxt, "-escape", fn])
|
||||
|
||||
txtdata = txtdata.decode('UTF-8', 'replace')
|
||||
|
||||
data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>'''
|
||||
data = '''<html><head>'''
|
||||
data += '''<title>''' + self.em.htmlescape(title) + '''</title>'''
|
||||
data += '''<meta http-equiv="Content-Type" '''
|
||||
data += '''content="text/html;charset=UTF-8">'''
|
||||
if author:
|
||||
@ -72,34 +76,8 @@ class DJVUExtractor:
|
||||
|
||||
data += self.em.htmlescape(txtdata)
|
||||
data += '''</pre></body></html>'''
|
||||
return (True, data, "", rclexecm.RclExecM.eofnext)
|
||||
return data
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.filename = params["filename:"]
|
||||
self.currentindex = 0
|
||||
#self.em.rclog("openfile: [%s]" % self.filename)
|
||||
|
||||
if not self.djvutxt:
|
||||
self.djvutxt = rclexecm.which("djvutxt")
|
||||
if not self.djvutxt:
|
||||
print("RECFILTERROR HELPERNOTFOUND djvutxt")
|
||||
sys.exit(1);
|
||||
self.djvused = rclexecm.which("djvused")
|
||||
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
return (ok, data, ipath, eof)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
# Main program: create protocol handler and extractor and run them
|
||||
proto = rclexecm.RclExecM()
|
||||
|
||||
@ -7,6 +7,7 @@ import os
|
||||
import re
|
||||
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
sys.path.append(sys.path[0]+"/recollepub.zip")
|
||||
try:
|
||||
@ -15,14 +16,12 @@ except:
|
||||
print("RECFILTERROR HELPERNOTFOUND python:epub")
|
||||
sys.exit(1);
|
||||
|
||||
class rclEPUB:
|
||||
class EPUBConcatExtractor(RclBaseHandler):
|
||||
"""RclExecM slave worker for extracting all text from an EPUB
|
||||
file. This version concatenates all nodes."""
|
||||
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.em.setmimetype("text/html")
|
||||
self.currentindex = 0
|
||||
super(EPUBConcatExtractor, self).__init__(em)
|
||||
|
||||
def _header(self):
|
||||
meta = self.book.opf.metadata
|
||||
@ -46,10 +45,12 @@ class rclEPUB:
|
||||
|
||||
return data
|
||||
|
||||
def extractone(self, params):
|
||||
def html_text(self, fn):
|
||||
"""Extract EPUB data as concatenated HTML"""
|
||||
|
||||
ok = True
|
||||
f = open(fn, 'rb')
|
||||
self.book = epub.open_epub(f)
|
||||
|
||||
data = self._header()
|
||||
ids = []
|
||||
if self.book.opf.spine:
|
||||
@ -72,36 +73,8 @@ class rclEPUB:
|
||||
data += doc
|
||||
|
||||
data += b'</body></html>'
|
||||
if ok:
|
||||
return (ok, data, "", rclexecm.RclExecM.eofnext)
|
||||
else:
|
||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
def openfile(self, params):
|
||||
"""Open the EPUB file"""
|
||||
self.currentindex = 0
|
||||
if not "filename:" in params:
|
||||
self.em.rclog("openfile: no file name")
|
||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
try:
|
||||
self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
|
||||
except Exception as err:
|
||||
self.em.rclog("openfile: epub.open failed: [%s]" % err)
|
||||
return False
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
return data
|
||||
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = rclEPUB(proto)
|
||||
extract = EPUBConcatExtractor(proto)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
@ -388,7 +388,7 @@ def main(proto, extract):
|
||||
|
||||
params = {'filename:': makebytes(args[0])}
|
||||
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
||||
mimetype = mimetype_with_xdg(args[0])
|
||||
mimetype = mimetype_with_file(args[0])
|
||||
params['mimetype:'] = mimetype
|
||||
|
||||
if not extract.openfile(params):
|
||||
|
||||
@ -18,10 +18,11 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
import fnmatch
|
||||
from zipfile import ZipFile
|
||||
import fnmatch
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
import rclxslt
|
||||
|
||||
meta_stylesheet = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet
|
||||
@ -129,10 +130,10 @@ content_stylesheet = '''<?xml version="1.0"?>
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
class OXExtractor:
|
||||
class OXExtractor(RclBaseHandler):
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.currentindex = 0
|
||||
super(OXExtractor, self).__init__(em)
|
||||
|
||||
|
||||
# Replace values inside data style sheet, depending on type of doc
|
||||
def computestylesheet(self, nm):
|
||||
@ -145,18 +146,11 @@ class OXExtractor:
|
||||
|
||||
return stylesheet
|
||||
|
||||
def extractone(self, params):
|
||||
if "filename:" not in params:
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
fn = params["filename:"]
|
||||
|
||||
try:
|
||||
f = open(fn, 'rb')
|
||||
zip = ZipFile(f)
|
||||
except Exception as err:
|
||||
self.em.rclog("unzip failed: " + str(err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
def html_text(self, fn):
|
||||
|
||||
f = open(fn, 'rb')
|
||||
zip = ZipFile(f)
|
||||
|
||||
docdata = b'<html><head>'
|
||||
|
||||
@ -166,9 +160,6 @@ class OXExtractor:
|
||||
res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
|
||||
docdata += res
|
||||
except Exception as err:
|
||||
# To be checked. I'm under the impression that I get this when
|
||||
# nothing matches?
|
||||
self.em.rclog("no/bad metadata in %s: %s" % (fn, err))
|
||||
pass
|
||||
|
||||
docdata += b'</head><body>'
|
||||
@ -200,25 +191,9 @@ class OXExtractor:
|
||||
|
||||
docdata += b'</body></html>'
|
||||
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
return docdata
|
||||
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = OXExtractor(proto)
|
||||
|
||||
@ -21,6 +21,7 @@ from __future__ import print_function
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
from rclbasehandler import RclBaseHandler
|
||||
from zipfile import ZipFile
|
||||
|
||||
stylesheet_meta = '''<?xml version="1.0"?>
|
||||
@ -126,26 +127,21 @@ stylesheet_content = '''<?xml version="1.0"?>
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
class OOExtractor:
|
||||
class OOExtractor(RclBaseHandler):
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.currentindex = 0
|
||||
super(OOExtractor, self).__init__(em)
|
||||
|
||||
def extractone(self, params):
|
||||
if "filename:" not in params:
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
fn = params["filename:"]
|
||||
|
||||
try:
|
||||
zip = ZipFile(fn.decode('UTF-8'))
|
||||
except Exception as err:
|
||||
self.em.rclog("unzip failed: %s" % err)
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
def html_text(self, fn):
|
||||
|
||||
f = open(fn, 'rb')
|
||||
zip = ZipFile(f)
|
||||
|
||||
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
|
||||
b'content="text/html; charset=UTF-8">'
|
||||
|
||||
# Wrap metadata extraction because it can sometimes throw
|
||||
# while the main text will be valid
|
||||
try:
|
||||
metadata = zip.read("meta.xml")
|
||||
if metadata:
|
||||
@ -159,33 +155,14 @@ class OOExtractor:
|
||||
|
||||
docdata += b'</head>\n<body>\n'
|
||||
|
||||
try:
|
||||
content = zip.read("content.xml")
|
||||
if content:
|
||||
res = rclxslt.apply_sheet_data(stylesheet_content, content)
|
||||
docdata += res
|
||||
docdata += b'</body></html>'
|
||||
except Exception as err:
|
||||
self.em.rclog("bad data in %s: %s" % (fn, err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
content = zip.read("content.xml")
|
||||
if content:
|
||||
res = rclxslt.apply_sheet_data(stylesheet_content, content)
|
||||
docdata += res
|
||||
docdata += b'</body></html>'
|
||||
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
return docdata
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user