Simplified a number of handlers by deriving them from RclBaseHandler
This commit is contained in:
parent
e1a937f608
commit
61ee8acbc2
@ -1,10 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
# Audio tag filter for Recoll, using mutagen
|
# Audio tag extractor for Recoll, using mutagen
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
import time
|
import time
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
@ -174,10 +175,11 @@ def tobytes(s):
|
|||||||
# mp3: album, title, artist, genre, date, tracknumber
|
# mp3: album, title, artist, genre, date, tracknumber
|
||||||
# flac: album, title, artist, genre, xxx, tracknumber
|
# flac: album, title, artist, genre, xxx, tracknumber
|
||||||
# oggvorbis:album, title, artist, genre, date, tracknumber
|
# oggvorbis:album, title, artist, genre, date, tracknumber
|
||||||
class AudioTagExtractor:
|
class AudioTagExtractor(RclBaseHandler):
|
||||||
|
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
super(AudioTagExtractor, self).__init__(em)
|
||||||
self.currentindex = 0
|
|
||||||
|
|
||||||
def _showMutaInfo(self, mutf):
|
def _showMutaInfo(self, mutf):
|
||||||
self.em.rclog("%s" % mutf.info.pprint())
|
self.em.rclog("%s" % mutf.info.pprint())
|
||||||
@ -186,10 +188,6 @@ class AudioTagExtractor:
|
|||||||
(prop, getattr( mutf.info, prop)))
|
(prop, getattr( mutf.info, prop)))
|
||||||
|
|
||||||
|
|
||||||
def _printableFilename(self):
|
|
||||||
return self.filename.decode('utf-8', errors='replace')
|
|
||||||
|
|
||||||
|
|
||||||
def _embeddedImageFormat(self, mutf):
|
def _embeddedImageFormat(self, mutf):
|
||||||
#self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
|
#self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
|
||||||
if 'audio/mp3' in mutf.mime:
|
if 'audio/mp3' in mutf.mime:
|
||||||
@ -232,23 +230,15 @@ class AudioTagExtractor:
|
|||||||
val = time.mktime(pdt.timetuple())
|
val = time.mktime(pdt.timetuple())
|
||||||
return "%d" % val
|
return "%d" % val
|
||||||
|
|
||||||
def extractone(self, params):
|
def html_text(self, filename):
|
||||||
#self.em.rclog("extractone %s %s" % (params["filename:"],
|
|
||||||
# params["mimetype:"]))
|
if not self.inputmimetype:
|
||||||
docdata = ""
|
raise Exception("html_text: input MIME type not set")
|
||||||
ok = False
|
mimetype = self.inputmimetype
|
||||||
if not "mimetype:" in params or not "filename:" in params:
|
|
||||||
self.em.rclog("extractone: no mime or file name")
|
mutf = File(filename)
|
||||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
if not mutf:
|
||||||
filename = params["filename:"]
|
raise Exception("mutagen failed opening %s" % filename)
|
||||||
mimetype = params["mimetype:"]
|
|
||||||
self.filename = filename
|
|
||||||
#self.em.rclog("%s" % filename)
|
|
||||||
try:
|
|
||||||
mutf = File(filename)
|
|
||||||
except Exception as err:
|
|
||||||
self.em.rclog("extractone: extract failed: [%s]" % err)
|
|
||||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
|
||||||
|
|
||||||
#self._showMutaInfo(mutf)
|
#self._showMutaInfo(mutf)
|
||||||
|
|
||||||
@ -361,29 +351,10 @@ class AudioTagExtractor:
|
|||||||
try:
|
try:
|
||||||
docdata = tobytes(mutf.pprint())
|
docdata = tobytes(mutf.pprint())
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
docdata = ""
|
||||||
self.em.rclog("Doc pprint error: %s" % err)
|
self.em.rclog("Doc pprint error: %s" % err)
|
||||||
|
|
||||||
ok = True
|
return docdata
|
||||||
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
|
|
||||||
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
|
||||||
def openfile(self, params):
|
|
||||||
self.currentindex = 0
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
return self.extractone(params)
|
|
||||||
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
if self.currentindex >= 1:
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
else:
|
|
||||||
ret= self.extractone(params)
|
|
||||||
self.currentindex += 1
|
|
||||||
return ret
|
|
||||||
|
|
||||||
|
|
||||||
def makeObject():
|
def makeObject():
|
||||||
|
|||||||
@ -25,22 +25,36 @@ import sys
|
|||||||
import rclexecm
|
import rclexecm
|
||||||
|
|
||||||
class RclBaseHandler(object):
|
class RclBaseHandler(object):
|
||||||
|
'''Base Object for simple extractors.
|
||||||
|
|
||||||
|
This implements the boilerplate code for simple extractors for
|
||||||
|
file types with a single document. The derived class would
|
||||||
|
typically need only to implement the html_text method to return
|
||||||
|
the document text in HTML format'''
|
||||||
|
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
|
|
||||||
|
|
||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
#self.em.rclog("extractone fn %s mt %s" % (params["filename:"], \
|
||||||
#params["mimetype:"]))
|
# params["mimetype:"]))
|
||||||
if not "filename:" in params:
|
if not "filename:" in params:
|
||||||
self.em.rclog("extractone: no file name")
|
self.em.rclog("extractone: no file name")
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
|
|
||||||
|
if "mimetype:" in params:
|
||||||
|
self.inputmimetype = params["mimetype:"]
|
||||||
|
else:
|
||||||
|
self.inputmimetype = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
html = self.html_text(fn)
|
html = self.html_text(fn)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("RclBaseDumper: %s : %s" % (fn, err))
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
self.em.setmimetype('text/html')
|
self.em.setmimetype('text/html')
|
||||||
@ -52,9 +66,11 @@ class RclBaseHandler(object):
|
|||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def getipath(self, params):
|
def getipath(self, params):
|
||||||
return self.extractone(params)
|
return self.extractone(params)
|
||||||
|
|
||||||
|
|
||||||
def getnext(self, params):
|
def getnext(self, params):
|
||||||
if self.currentindex >= 1:
|
if self.currentindex >= 1:
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|||||||
@ -18,6 +18,7 @@ from __future__ import print_function
|
|||||||
|
|
||||||
# Small fixes from jfd: dia files are sometimes not compressed.
|
# Small fixes from jfd: dia files are sometimes not compressed.
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
import re
|
import re
|
||||||
from gzip import GzipFile
|
from gzip import GzipFile
|
||||||
import xml.parsers.expat
|
import xml.parsers.expat
|
||||||
@ -58,54 +59,32 @@ class Parser:
|
|||||||
self._parser.ParseFile(fh)
|
self._parser.ParseFile(fh)
|
||||||
del self._parser
|
del self._parser
|
||||||
|
|
||||||
class DiaExtractor:
|
class DiaExtractor(RclBaseHandler):
|
||||||
|
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
super(DiaExtractor, self).__init__(em)
|
||||||
|
|
||||||
def extractdia(self):
|
|
||||||
docdata = ""
|
|
||||||
ipath = ""
|
|
||||||
try:
|
|
||||||
docdata = self.ExtractDiaText()
|
|
||||||
ok = True
|
|
||||||
except Exception as err:
|
|
||||||
self.em.rclog("Dia parse failed: %s"%err)
|
|
||||||
ok = False
|
|
||||||
iseof = rclexecm.RclExecM.eofnext
|
|
||||||
self.em.setmimetype("text/plain")
|
|
||||||
return (ok, docdata, ipath, iseof)
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
def html_text(self, fn):
|
||||||
def openfile(self, params):
|
|
||||||
try:
|
try:
|
||||||
self.dia = GzipFile(params["filename:"], 'rb')
|
dia = GzipFile(fn, 'rb')
|
||||||
# Dia files are sometimes not compressed. Quite weirdly,
|
# Dia files are sometimes not compressed. Quite weirdly,
|
||||||
# GzipFile does not complain until we try to read. Have to do it
|
# GzipFile does not complain until we try to read.
|
||||||
# here to be able to retry an uncompressed open.
|
data = dia.readline()
|
||||||
data = self.dia.readline()
|
dia.seek(0)
|
||||||
self.dia.seek(0)
|
|
||||||
return True
|
|
||||||
except:
|
except:
|
||||||
# File not compressed ?
|
# File not compressed ?
|
||||||
try:
|
dia = open(fn, 'rb')
|
||||||
self.dia = open(params["filename:"], 'rb')
|
|
||||||
except:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
ok, data, ipath, eof = self.extractdia()
|
|
||||||
return (ok, data, ipath, eof)
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
ok, data, ipath, eof = self.extractdia()
|
|
||||||
return (ok, data, ipath, eof)
|
|
||||||
|
|
||||||
###### read data
|
|
||||||
def ExtractDiaText(self):
|
|
||||||
diap = Parser(self.em)
|
diap = Parser(self.em)
|
||||||
diap.feed(self.dia)
|
diap.feed(dia)
|
||||||
return '\n'.join(diap.string)
|
|
||||||
|
html = '<html><head><title></title></head><body><pre>'
|
||||||
|
html += self.em.htmlescape('\n'.join(diap.string))
|
||||||
|
html += '</pre></body></html>'
|
||||||
|
|
||||||
|
return html
|
||||||
|
|
||||||
|
|
||||||
# Main program: create protocol handler and extractor and run them
|
# Main program: create protocol handler and extractor and run them
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
|
|||||||
@ -24,22 +24,28 @@ import sys
|
|||||||
import re
|
import re
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import subprocess
|
import subprocess
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
|
|
||||||
|
class DJVUExtractor(RclBaseHandler):
|
||||||
|
|
||||||
class DJVUExtractor:
|
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.currentindex = 0
|
super(DJVUExtractor, self).__init__(em)
|
||||||
self.djvused = None
|
self.djvutxt = rclexecm.which("djvutxt")
|
||||||
self.djvutxt = None
|
if not self.djvutxt:
|
||||||
self.em = em
|
print("RECFILTERROR HELPERNOTFOUND djvutxt")
|
||||||
|
sys.exit(1);
|
||||||
|
self.djvused = rclexecm.which("djvused")
|
||||||
|
|
||||||
def extractone(self, params):
|
|
||||||
|
def html_text(self, fn):
|
||||||
self.em.setmimetype('text/html')
|
self.em.setmimetype('text/html')
|
||||||
|
|
||||||
# Extract metadata
|
# Extract metadata
|
||||||
|
metadata = b""
|
||||||
if self.djvused:
|
if self.djvused:
|
||||||
try:
|
try:
|
||||||
metadata = subprocess.check_output([self.djvused, self.filename,
|
metadata = subprocess.check_output(
|
||||||
"-e", "select 1;print-meta"])
|
[self.djvused, fn, "-e", "select 1;print-meta"])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.em.rclog("djvused failed: %s" % e)
|
self.em.rclog("djvused failed: %s" % e)
|
||||||
author = ""
|
author = ""
|
||||||
@ -55,14 +61,12 @@ class DJVUExtractor:
|
|||||||
title = ' '.join(line[1:])
|
title = ' '.join(line[1:])
|
||||||
|
|
||||||
# Main text
|
# Main text
|
||||||
try:
|
txtdata = subprocess.check_output([self.djvutxt, "-escape", fn])
|
||||||
txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
|
|
||||||
except Exception as e:
|
|
||||||
self.em.rclog("djvused failed: %s" % e)
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
txtdata = txtdata.decode('UTF-8', 'replace')
|
txtdata = txtdata.decode('UTF-8', 'replace')
|
||||||
|
|
||||||
data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>'''
|
data = '''<html><head>'''
|
||||||
|
data += '''<title>''' + self.em.htmlescape(title) + '''</title>'''
|
||||||
data += '''<meta http-equiv="Content-Type" '''
|
data += '''<meta http-equiv="Content-Type" '''
|
||||||
data += '''content="text/html;charset=UTF-8">'''
|
data += '''content="text/html;charset=UTF-8">'''
|
||||||
if author:
|
if author:
|
||||||
@ -72,34 +76,8 @@ class DJVUExtractor:
|
|||||||
|
|
||||||
data += self.em.htmlescape(txtdata)
|
data += self.em.htmlescape(txtdata)
|
||||||
data += '''</pre></body></html>'''
|
data += '''</pre></body></html>'''
|
||||||
return (True, data, "", rclexecm.RclExecM.eofnext)
|
return data
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
|
||||||
def openfile(self, params):
|
|
||||||
self.filename = params["filename:"]
|
|
||||||
self.currentindex = 0
|
|
||||||
#self.em.rclog("openfile: [%s]" % self.filename)
|
|
||||||
|
|
||||||
if not self.djvutxt:
|
|
||||||
self.djvutxt = rclexecm.which("djvutxt")
|
|
||||||
if not self.djvutxt:
|
|
||||||
print("RECFILTERROR HELPERNOTFOUND djvutxt")
|
|
||||||
sys.exit(1);
|
|
||||||
self.djvused = rclexecm.which("djvused")
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
return self.extractone(params)
|
|
||||||
return (ok, data, ipath, eof)
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
if self.currentindex >= 1:
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
else:
|
|
||||||
ret= self.extractone(params)
|
|
||||||
self.currentindex += 1
|
|
||||||
return ret
|
|
||||||
|
|
||||||
# Main program: create protocol handler and extractor and run them
|
# Main program: create protocol handler and extractor and run them
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
|
|
||||||
sys.path.append(sys.path[0]+"/recollepub.zip")
|
sys.path.append(sys.path[0]+"/recollepub.zip")
|
||||||
try:
|
try:
|
||||||
@ -15,14 +16,12 @@ except:
|
|||||||
print("RECFILTERROR HELPERNOTFOUND python:epub")
|
print("RECFILTERROR HELPERNOTFOUND python:epub")
|
||||||
sys.exit(1);
|
sys.exit(1);
|
||||||
|
|
||||||
class rclEPUB:
|
class EPUBConcatExtractor(RclBaseHandler):
|
||||||
"""RclExecM slave worker for extracting all text from an EPUB
|
"""RclExecM slave worker for extracting all text from an EPUB
|
||||||
file. This version concatenates all nodes."""
|
file. This version concatenates all nodes."""
|
||||||
|
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
super(EPUBConcatExtractor, self).__init__(em)
|
||||||
self.em.setmimetype("text/html")
|
|
||||||
self.currentindex = 0
|
|
||||||
|
|
||||||
def _header(self):
|
def _header(self):
|
||||||
meta = self.book.opf.metadata
|
meta = self.book.opf.metadata
|
||||||
@ -46,10 +45,12 @@ class rclEPUB:
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def extractone(self, params):
|
def html_text(self, fn):
|
||||||
"""Extract EPUB data as concatenated HTML"""
|
"""Extract EPUB data as concatenated HTML"""
|
||||||
|
|
||||||
ok = True
|
f = open(fn, 'rb')
|
||||||
|
self.book = epub.open_epub(f)
|
||||||
|
|
||||||
data = self._header()
|
data = self._header()
|
||||||
ids = []
|
ids = []
|
||||||
if self.book.opf.spine:
|
if self.book.opf.spine:
|
||||||
@ -72,36 +73,8 @@ class rclEPUB:
|
|||||||
data += doc
|
data += doc
|
||||||
|
|
||||||
data += b'</body></html>'
|
data += b'</body></html>'
|
||||||
if ok:
|
return data
|
||||||
return (ok, data, "", rclexecm.RclExecM.eofnext)
|
|
||||||
else:
|
|
||||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
|
|
||||||
def openfile(self, params):
|
|
||||||
"""Open the EPUB file"""
|
|
||||||
self.currentindex = 0
|
|
||||||
if not "filename:" in params:
|
|
||||||
self.em.rclog("openfile: no file name")
|
|
||||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
|
|
||||||
except Exception as err:
|
|
||||||
self.em.rclog("openfile: epub.open failed: [%s]" % err)
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
return self.extractone(params)
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
if self.currentindex >= 1:
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
else:
|
|
||||||
ret= self.extractone(params)
|
|
||||||
self.currentindex += 1
|
|
||||||
return ret
|
|
||||||
|
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
extract = rclEPUB(proto)
|
extract = EPUBConcatExtractor(proto)
|
||||||
rclexecm.main(proto, extract)
|
rclexecm.main(proto, extract)
|
||||||
|
|||||||
@ -388,7 +388,7 @@ def main(proto, extract):
|
|||||||
|
|
||||||
params = {'filename:': makebytes(args[0])}
|
params = {'filename:': makebytes(args[0])}
|
||||||
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
# Some filters (e.g. rclaudio) need/get a MIME type from the indexer
|
||||||
mimetype = mimetype_with_xdg(args[0])
|
mimetype = mimetype_with_file(args[0])
|
||||||
params['mimetype:'] = mimetype
|
params['mimetype:'] = mimetype
|
||||||
|
|
||||||
if not extract.openfile(params):
|
if not extract.openfile(params):
|
||||||
|
|||||||
@ -18,10 +18,11 @@
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import rclexecm
|
|
||||||
import rclxslt
|
|
||||||
import fnmatch
|
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
import fnmatch
|
||||||
|
import rclexecm
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
|
import rclxslt
|
||||||
|
|
||||||
meta_stylesheet = '''<?xml version="1.0"?>
|
meta_stylesheet = '''<?xml version="1.0"?>
|
||||||
<xsl:stylesheet
|
<xsl:stylesheet
|
||||||
@ -129,10 +130,10 @@ content_stylesheet = '''<?xml version="1.0"?>
|
|||||||
</xsl:stylesheet>
|
</xsl:stylesheet>
|
||||||
'''
|
'''
|
||||||
|
|
||||||
class OXExtractor:
|
class OXExtractor(RclBaseHandler):
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
super(OXExtractor, self).__init__(em)
|
||||||
self.currentindex = 0
|
|
||||||
|
|
||||||
# Replace values inside data style sheet, depending on type of doc
|
# Replace values inside data style sheet, depending on type of doc
|
||||||
def computestylesheet(self, nm):
|
def computestylesheet(self, nm):
|
||||||
@ -145,18 +146,11 @@ class OXExtractor:
|
|||||||
|
|
||||||
return stylesheet
|
return stylesheet
|
||||||
|
|
||||||
def extractone(self, params):
|
|
||||||
if "filename:" not in params:
|
|
||||||
self.em.rclog("extractone: no mime or file name")
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
fn = params["filename:"]
|
|
||||||
|
|
||||||
try:
|
def html_text(self, fn):
|
||||||
f = open(fn, 'rb')
|
|
||||||
zip = ZipFile(f)
|
f = open(fn, 'rb')
|
||||||
except Exception as err:
|
zip = ZipFile(f)
|
||||||
self.em.rclog("unzip failed: " + str(err))
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
|
|
||||||
docdata = b'<html><head>'
|
docdata = b'<html><head>'
|
||||||
|
|
||||||
@ -166,9 +160,6 @@ class OXExtractor:
|
|||||||
res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
|
res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
|
||||||
docdata += res
|
docdata += res
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
# To be checked. I'm under the impression that I get this when
|
|
||||||
# nothing matches?
|
|
||||||
self.em.rclog("no/bad metadata in %s: %s" % (fn, err))
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
docdata += b'</head><body>'
|
docdata += b'</head><body>'
|
||||||
@ -200,25 +191,9 @@ class OXExtractor:
|
|||||||
|
|
||||||
docdata += b'</body></html>'
|
docdata += b'</body></html>'
|
||||||
|
|
||||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
return docdata
|
||||||
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
|
||||||
def openfile(self, params):
|
|
||||||
self.currentindex = 0
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
return self.extractone(params)
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
if self.currentindex >= 1:
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
else:
|
|
||||||
ret= self.extractone(params)
|
|
||||||
self.currentindex += 1
|
|
||||||
return ret
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
extract = OXExtractor(proto)
|
extract = OXExtractor(proto)
|
||||||
|
|||||||
@ -21,6 +21,7 @@ from __future__ import print_function
|
|||||||
import sys
|
import sys
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import rclxslt
|
import rclxslt
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
stylesheet_meta = '''<?xml version="1.0"?>
|
stylesheet_meta = '''<?xml version="1.0"?>
|
||||||
@ -126,26 +127,21 @@ stylesheet_content = '''<?xml version="1.0"?>
|
|||||||
</xsl:stylesheet>
|
</xsl:stylesheet>
|
||||||
'''
|
'''
|
||||||
|
|
||||||
class OOExtractor:
|
class OOExtractor(RclBaseHandler):
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
super(OOExtractor, self).__init__(em)
|
||||||
self.currentindex = 0
|
|
||||||
|
|
||||||
def extractone(self, params):
|
|
||||||
if "filename:" not in params:
|
|
||||||
self.em.rclog("extractone: no mime or file name")
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
fn = params["filename:"]
|
|
||||||
|
|
||||||
try:
|
def html_text(self, fn):
|
||||||
zip = ZipFile(fn.decode('UTF-8'))
|
|
||||||
except Exception as err:
|
f = open(fn, 'rb')
|
||||||
self.em.rclog("unzip failed: %s" % err)
|
zip = ZipFile(f)
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
|
|
||||||
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
|
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
|
||||||
b'content="text/html; charset=UTF-8">'
|
b'content="text/html; charset=UTF-8">'
|
||||||
|
|
||||||
|
# Wrap metadata extraction because it can sometimes throw
|
||||||
|
# while the main text will be valid
|
||||||
try:
|
try:
|
||||||
metadata = zip.read("meta.xml")
|
metadata = zip.read("meta.xml")
|
||||||
if metadata:
|
if metadata:
|
||||||
@ -159,33 +155,14 @@ class OOExtractor:
|
|||||||
|
|
||||||
docdata += b'</head>\n<body>\n'
|
docdata += b'</head>\n<body>\n'
|
||||||
|
|
||||||
try:
|
content = zip.read("content.xml")
|
||||||
content = zip.read("content.xml")
|
if content:
|
||||||
if content:
|
res = rclxslt.apply_sheet_data(stylesheet_content, content)
|
||||||
res = rclxslt.apply_sheet_data(stylesheet_content, content)
|
docdata += res
|
||||||
docdata += res
|
docdata += b'</body></html>'
|
||||||
docdata += b'</body></html>'
|
|
||||||
except Exception as err:
|
|
||||||
self.em.rclog("bad data in %s: %s" % (fn, err))
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
|
|
||||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
return docdata
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
|
||||||
def openfile(self, params):
|
|
||||||
self.currentindex = 0
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getipath(self, params):
|
|
||||||
return self.extractone(params)
|
|
||||||
|
|
||||||
def getnext(self, params):
|
|
||||||
if self.currentindex >= 1:
|
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
|
||||||
else:
|
|
||||||
ret= self.extractone(params)
|
|
||||||
self.currentindex += 1
|
|
||||||
return ret
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user