more filters made compatible with python3
This commit is contained in:
parent
f344e8fedd
commit
dfe00ab11f
@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
|
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
|
||||||
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
|
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
|
||||||
|
|
||||||
|
|||||||
@ -16,12 +16,15 @@ with acute accent ?
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import string
|
PY3 = sys.version > '3'
|
||||||
|
if not PY3:
|
||||||
|
import string
|
||||||
import glob
|
import glob
|
||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
|
||||||
class European8859TextClassifier:
|
class European8859TextClassifier:
|
||||||
def __init__(self, langzip=""):
|
def __init__(self, langzip=""):
|
||||||
"""langzip contains text files. Each text file is named like lang_code.txt
|
"""langzip contains text files. Each text file is named like lang_code.txt
|
||||||
@ -33,9 +36,12 @@ class European8859TextClassifier:
|
|||||||
self.readlanguages(langzip)
|
self.readlanguages(langzip)
|
||||||
|
|
||||||
# Table to translate from punctuation to spaces
|
# Table to translate from punctuation to spaces
|
||||||
self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "'\n\r"
|
self.punct = b'''0123456789<>/*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
|
||||||
spaces = len(self.punct) * " "
|
spaces = len(self.punct) * b' '
|
||||||
self.spacetable = string.maketrans(self.punct, spaces)
|
if PY3:
|
||||||
|
self.spacetable = bytes.maketrans(self.punct, spaces)
|
||||||
|
else:
|
||||||
|
self.spacetable = string.maketrans(self.punct, spaces)
|
||||||
|
|
||||||
def readlanguages(self, langzip):
|
def readlanguages(self, langzip):
|
||||||
"""Extract the stop words lists from the zip file.
|
"""Extract the stop words lists from the zip file.
|
||||||
@ -53,7 +59,7 @@ class European8859TextClassifier:
|
|||||||
text = zip.read(fn)
|
text = zip.read(fn)
|
||||||
words = text.split()
|
words = text.split()
|
||||||
for word in words:
|
for word in words:
|
||||||
if self.allwords.has_key(word):
|
if word in self.allwords:
|
||||||
self.allwords[word].append((lang, code))
|
self.allwords[word].append((lang, code))
|
||||||
else:
|
else:
|
||||||
self.allwords[word] = [(lang, code)]
|
self.allwords[word] = [(lang, code)]
|
||||||
@ -64,7 +70,7 @@ class European8859TextClassifier:
|
|||||||
|
|
||||||
# Limit to reasonable size.
|
# Limit to reasonable size.
|
||||||
if len(rawtext) > 10000:
|
if len(rawtext) > 10000:
|
||||||
i = rawtext.find(" ", 9000)
|
i = rawtext.find(b' ', 9000)
|
||||||
if i == -1:
|
if i == -1:
|
||||||
i = 9000
|
i = 9000
|
||||||
rawtext = rawtext[0:i]
|
rawtext = rawtext[0:i]
|
||||||
@ -79,9 +85,9 @@ class European8859TextClassifier:
|
|||||||
dict = {}
|
dict = {}
|
||||||
for w in words:
|
for w in words:
|
||||||
dict[w] = dict.get(w, 0) + 1
|
dict[w] = dict.get(w, 0) + 1
|
||||||
lfreq = [a[0] for a in sorted(dict.iteritems(), \
|
lfreq = [a[0] for a in sorted(dict.items(), \
|
||||||
key=lambda entry: entry[1], reverse=True)[0:ntest]]
|
key=lambda entry: entry[1], reverse=True)[0:ntest]]
|
||||||
#print lfreq
|
#print(lfreq)
|
||||||
|
|
||||||
# Build a dict (lang,code)->matchcount
|
# Build a dict (lang,code)->matchcount
|
||||||
langstats = {}
|
langstats = {}
|
||||||
@ -91,9 +97,9 @@ class European8859TextClassifier:
|
|||||||
langstats[lc] = langstats.get(lc, 0) + 1
|
langstats[lc] = langstats.get(lc, 0) + 1
|
||||||
|
|
||||||
# Get a list of (lang,code) sorted by match count
|
# Get a list of (lang,code) sorted by match count
|
||||||
lcfreq = sorted(langstats.iteritems(), \
|
lcfreq = sorted(langstats.items(), \
|
||||||
key=lambda entry: entry[1], reverse=True)
|
key=lambda entry: entry[1], reverse=True)
|
||||||
#print lcfreq[0:3]
|
#print(lcfreq[0:3])
|
||||||
if len(lcfreq) != 0:
|
if len(lcfreq) != 0:
|
||||||
lc,maxcount = lcfreq[0]
|
lc,maxcount = lcfreq[0]
|
||||||
maxlang = lc[0]
|
maxlang = lc[0]
|
||||||
|
|||||||
@ -17,6 +17,8 @@
|
|||||||
|
|
||||||
# Recoll PDF extractor, with support for attachments
|
# Recoll PDF extractor, with support for attachments
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
@ -89,7 +91,7 @@ class PDFExtractor:
|
|||||||
"output", tmpdir])
|
"output", tmpdir])
|
||||||
self.attachlist = sorted(os.listdir(tmpdir))
|
self.attachlist = sorted(os.listdir(tmpdir))
|
||||||
return True
|
return True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
self.em.rclog("extractAttach: failed: %s" % e)
|
self.em.rclog("extractAttach: failed: %s" % e)
|
||||||
# Return true anyway, pdf attachments are no big deal
|
# Return true anyway, pdf attachments are no big deal
|
||||||
return True
|
return True
|
||||||
|
|||||||
@ -146,7 +146,7 @@ class OXExtractor:
|
|||||||
return stylesheet
|
return stylesheet
|
||||||
|
|
||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
if not params.has_key("filename:"):
|
if "filename:" not in params:
|
||||||
self.em.rclog("extractone: no mime or file name")
|
self.em.rclog("extractone: no mime or file name")
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
|
|||||||
@ -1,4 +1,10 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
# Recoll PPT text extractor
|
||||||
|
# Msodump is not compatible with Python3 AFAIK, so this is stuck to
|
||||||
|
# Python2 too
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import rclexec1
|
import rclexec1
|
||||||
|
|||||||
@ -22,6 +22,8 @@
|
|||||||
# - parse script encoding and allow output in any encoding by using unicode
|
# - parse script encoding and allow output in any encoding by using unicode
|
||||||
# as intermediate
|
# as intermediate
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
__version__ = '0.3'
|
__version__ = '0.3'
|
||||||
__date__ = '2005-07-04'
|
__date__ = '2005-07-04'
|
||||||
__license__ = 'GPL'
|
__license__ = 'GPL'
|
||||||
@ -29,9 +31,26 @@ __author__ = 'J
|
|||||||
|
|
||||||
|
|
||||||
# Imports
|
# Imports
|
||||||
import cgi, string, sys, cStringIO
|
import cgi, string, sys
|
||||||
|
PY2 = sys.version < '3'
|
||||||
|
if PY2:
|
||||||
|
import cStringIO
|
||||||
|
else:
|
||||||
|
import io
|
||||||
import keyword, token, tokenize
|
import keyword, token, tokenize
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
def makebytes(data):
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
return data.encode("UTF-8")
|
||||||
|
else:
|
||||||
|
return data
|
||||||
|
else:
|
||||||
|
def makebytes(data):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
return data
|
||||||
|
else:
|
||||||
|
return data.encode("UTF-8")
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
### Python Source Parser (does Hilighting)
|
### Python Source Parser (does Hilighting)
|
||||||
@ -57,7 +76,7 @@ _HTML_HEADER = """\
|
|||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<title>%%(title)s</title>
|
<title>%%(title)s</title>
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||||
<meta name="Generator" content="colorize.py (version %s)">
|
<meta name="Generator" content="colorize.py (version %s)">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
@ -114,7 +133,7 @@ class Parser:
|
|||||||
def __init__(self, raw, out=sys.stdout):
|
def __init__(self, raw, out=sys.stdout):
|
||||||
""" Store the source text.
|
""" Store the source text.
|
||||||
"""
|
"""
|
||||||
self.raw = string.strip(string.expandtabs(raw))
|
self.raw = raw.expandtabs().strip()
|
||||||
self.out = out
|
self.out = out
|
||||||
|
|
||||||
def format(self):
|
def format(self):
|
||||||
@ -124,35 +143,44 @@ class Parser:
|
|||||||
self.lines = [0, 0]
|
self.lines = [0, 0]
|
||||||
pos = 0
|
pos = 0
|
||||||
while 1:
|
while 1:
|
||||||
pos = string.find(self.raw, '\n', pos) + 1
|
pos = self.raw.find(b'\n', pos) + 1
|
||||||
if not pos: break
|
if not pos: break
|
||||||
self.lines.append(pos)
|
self.lines.append(pos)
|
||||||
self.lines.append(len(self.raw))
|
self.lines.append(len(self.raw))
|
||||||
|
|
||||||
# parse the source and write it
|
# parse the source and write it
|
||||||
self.pos = 0
|
self.pos = 0
|
||||||
text = cStringIO.StringIO(self.raw)
|
if PY2:
|
||||||
self.out.write(self.stylesheet)
|
text = cStringIO.StringIO(self.raw)
|
||||||
self.out.write('<pre class="code">\n')
|
else:
|
||||||
|
text = io.BytesIO(self.raw)
|
||||||
|
self.out.write(makebytes(self.stylesheet))
|
||||||
|
self.out.write(b'<pre class="code">\n')
|
||||||
try:
|
try:
|
||||||
tokenize.tokenize(text.readline, self)
|
if PY2:
|
||||||
except tokenize.TokenError, ex:
|
tokenize.tokenize(text.readline, self)
|
||||||
|
else:
|
||||||
|
for a,b,c,d,e in tokenize.tokenize(text.readline):
|
||||||
|
self(a,b,c,d,e)
|
||||||
|
except tokenize.TokenError as ex:
|
||||||
msg = ex[0]
|
msg = ex[0]
|
||||||
line = ex[1][0]
|
line = ex[1][0]
|
||||||
self.out.write("<h3>ERROR: %s</h3>%s\n" % (
|
self.out.write("<h3>ERROR: %s</h3>%s\n" % (
|
||||||
msg, self.raw[self.lines[line]:]))
|
msg, self.raw[self.lines[line]:]))
|
||||||
except IndentationError, ex:
|
except IndentationError as ex:
|
||||||
msg = ex[0]
|
msg = ex[0]
|
||||||
self.out.write("<h3>ERROR: %s</h3>\n" % (msg))
|
self.out.write("<h3>ERROR: %s</h3>\n" % (msg))
|
||||||
self.out.write('\n</pre>')
|
self.out.write(b'\n</pre>')
|
||||||
|
|
||||||
def __call__(self, toktype, toktext, (srow,scol), (erow,ecol), line):
|
def __call__(self, toktype, toktext, startpos, endpos, line):
|
||||||
""" Token handler.
|
""" Token handler.
|
||||||
"""
|
"""
|
||||||
if 0:
|
if 0:
|
||||||
print "type", toktype, token.tok_name[toktype], "text", toktext,
|
print("type %s %s text %s start %s %s end %s %s<br>\n" % \
|
||||||
print "start", srow,scol, "end", erow,ecol, "<br>"
|
(toktype, token.tok_name[toktype], toktext, \
|
||||||
|
srow, scol,erow,ecol))
|
||||||
|
srow, scol = startpos
|
||||||
|
erow, ecol = endpos
|
||||||
# calculate new positions
|
# calculate new positions
|
||||||
oldpos = self.pos
|
oldpos = self.pos
|
||||||
newpos = self.lines[srow] + scol
|
newpos = self.lines[srow] + scol
|
||||||
@ -160,7 +188,7 @@ class Parser:
|
|||||||
|
|
||||||
# handle newlines
|
# handle newlines
|
||||||
if toktype in [token.NEWLINE, tokenize.NL]:
|
if toktype in [token.NEWLINE, tokenize.NL]:
|
||||||
self.out.write('\n')
|
self.out.write(b'\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
# send the original whitespace, if needed
|
# send the original whitespace, if needed
|
||||||
@ -180,9 +208,9 @@ class Parser:
|
|||||||
css_class = _css_classes.get(toktype, 'text')
|
css_class = _css_classes.get(toktype, 'text')
|
||||||
|
|
||||||
# send text
|
# send text
|
||||||
self.out.write('<span class="%s">' % (css_class,))
|
self.out.write(makebytes('<span class="%s">' % (css_class,)))
|
||||||
self.out.write(cgi.escape(toktext))
|
self.out.write(makebytes(cgi.escape(toktext)))
|
||||||
self.out.write('</span>')
|
self.out.write(b'</span>')
|
||||||
|
|
||||||
|
|
||||||
def colorize_file(file=None, outstream=sys.stdout, standalone=True):
|
def colorize_file(file=None, outstream=sys.stdout, standalone=True):
|
||||||
@ -205,7 +233,7 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True):
|
|||||||
filename = 'STREAM'
|
filename = 'STREAM'
|
||||||
elif file is not None:
|
elif file is not None:
|
||||||
try:
|
try:
|
||||||
sourcefile = open(file)
|
sourcefile = open(file, 'rb')
|
||||||
filename = basename(file)
|
filename = basename(file)
|
||||||
except IOError:
|
except IOError:
|
||||||
raise SystemExit("File %s unknown." % file)
|
raise SystemExit("File %s unknown." % file)
|
||||||
@ -215,22 +243,26 @@ def colorize_file(file=None, outstream=sys.stdout, standalone=True):
|
|||||||
source = sourcefile.read()
|
source = sourcefile.read()
|
||||||
|
|
||||||
if standalone:
|
if standalone:
|
||||||
outstream.write(_HTML_HEADER % {'title': filename})
|
outstream.write(makebytes(_HTML_HEADER % {'title': filename}))
|
||||||
Parser(source, out=outstream).format()
|
Parser(source, out=outstream).format()
|
||||||
if standalone:
|
if standalone:
|
||||||
outstream.write(_HTML_FOOTER)
|
outstream.write(makebytes(_HTML_FOOTER))
|
||||||
|
|
||||||
if file:
|
if file:
|
||||||
sourcefile.close()
|
sourcefile.close()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import os
|
import os
|
||||||
|
if PY2:
|
||||||
|
out = sys.stdout
|
||||||
|
else:
|
||||||
|
out = sys.stdout.buffer
|
||||||
if os.environ.get('PATH_TRANSLATED'):
|
if os.environ.get('PATH_TRANSLATED'):
|
||||||
filepath = os.environ.get('PATH_TRANSLATED')
|
filepath = os.environ.get('PATH_TRANSLATED')
|
||||||
print 'Content-Type: text/html; charset="iso-8859-1"\n'
|
print('Content-Type: text/html; charset="iso-8859-1"\n')
|
||||||
colorize_file(filepath)
|
colorize_file(filepath, out)
|
||||||
elif len(sys.argv) > 1:
|
elif len(sys.argv) > 1:
|
||||||
filepath = sys.argv[1]
|
filepath = sys.argv[1]
|
||||||
colorize_file(filepath)
|
colorize_file(filepath, out)
|
||||||
else:
|
else:
|
||||||
colorize_file()
|
colorize_file()
|
||||||
|
|||||||
@ -18,12 +18,14 @@
|
|||||||
# Free Software Foundation, Inc.,
|
# Free Software Foundation, Inc.,
|
||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import rclexecm
|
import rclexecm
|
||||||
try:
|
try:
|
||||||
from rarfile import RarFile
|
from rarfile import RarFile
|
||||||
except:
|
except:
|
||||||
print "RECFILTERROR HELPERNOTFOUND python:rarfile"
|
print("RECFILTERROR HELPERNOTFOUND python:rarfile")
|
||||||
sys.exit(1);
|
sys.exit(1);
|
||||||
|
|
||||||
# Requires RarFile python module. Try "sudo pip install rarfile"
|
# Requires RarFile python module. Try "sudo pip install rarfile"
|
||||||
@ -67,9 +69,7 @@ class RarExtractor:
|
|||||||
iseof = rclexecm.RclExecM.noteof
|
iseof = rclexecm.RclExecM.noteof
|
||||||
if self.currentindex >= len(self.rar.namelist()) -1:
|
if self.currentindex >= len(self.rar.namelist()) -1:
|
||||||
iseof = rclexecm.RclExecM.eofnext
|
iseof = rclexecm.RclExecM.eofnext
|
||||||
if isinstance(ipath, unicode):
|
return (ok, docdata, rclexecm.makebytes(ipath), iseof)
|
||||||
ipath = ipath.encode("utf-8")
|
|
||||||
return (ok, docdata, ipath, iseof)
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
###### File type handler api, used by rclexecm ---------->
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
|
|||||||
@ -16,6 +16,8 @@
|
|||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
######################################
|
######################################
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import rclxslt
|
import rclxslt
|
||||||
@ -130,19 +132,19 @@ class OOExtractor:
|
|||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
|
|
||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
if not params.has_key("filename:"):
|
if "filename:" not in params:
|
||||||
self.em.rclog("extractone: no mime or file name")
|
self.em.rclog("extractone: no mime or file name")
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
zip = ZipFile(fn)
|
zip = ZipFile(fn.decode('UTF-8'))
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("unzip failed: " + str(err))
|
self.em.rclog("unzip failed: %s" % err)
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
docdata = '<html><head><meta http-equiv="Content-Type"' \
|
docdata = b'<html><head><meta http-equiv="Content-Type"' \
|
||||||
'content="text/html; charset=UTF-8"></head><body>'
|
b'content="text/html; charset=UTF-8"></head><body>'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
metadata = zip.read("meta.xml")
|
metadata = zip.read("meta.xml")
|
||||||
@ -160,9 +162,9 @@ class OOExtractor:
|
|||||||
if content:
|
if content:
|
||||||
res = rclxslt.apply_sheet_data(stylesheet_content, content)
|
res = rclxslt.apply_sheet_data(stylesheet_content, content)
|
||||||
docdata += res
|
docdata += res
|
||||||
docdata += '</body></html>'
|
docdata += b'</body></html>'
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("bad data in %s" % fn)
|
self.em.rclog("bad data in %s: %s" % (fn, err))
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||||
|
|||||||
@ -16,6 +16,8 @@
|
|||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
######################################
|
######################################
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import rclxslt
|
import rclxslt
|
||||||
@ -104,7 +106,7 @@ class SVGExtractor:
|
|||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
|
|
||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
if not params.has_key("filename:"):
|
if "filename:" not in params:
|
||||||
self.em.rclog("extractone: no mime or file name")
|
self.em.rclog("extractone: no mime or file name")
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
|
|||||||
@ -6,12 +6,14 @@
|
|||||||
# It works not only for tar-files, but automatically for gzipped and
|
# It works not only for tar-files, but automatically for gzipped and
|
||||||
# bzipped tar-files at well.
|
# bzipped tar-files at well.
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import tarfile
|
import tarfile
|
||||||
except:
|
except:
|
||||||
print "RECFILTERROR HELPERNOTFOUND python:tarfile"
|
print("RECFILTERROR HELPERNOTFOUND python:tarfile")
|
||||||
sys.exit(1);
|
sys.exit(1);
|
||||||
|
|
||||||
class TarExtractor:
|
class TarExtractor:
|
||||||
@ -38,15 +40,15 @@ class TarExtractor:
|
|||||||
iseof = rclexecm.RclExecM.noteof
|
iseof = rclexecm.RclExecM.noteof
|
||||||
if self.currentindex >= len(self.namen) -1:
|
if self.currentindex >= len(self.namen) -1:
|
||||||
iseof = rclexecm.RclExecM.eofnext
|
iseof = rclexecm.RclExecM.eofnext
|
||||||
if isinstance(ipath, unicode):
|
return (ok, docdata, rclexecm.makebytes(ipath), iseof)
|
||||||
ipath = ipath.encode("utf-8")
|
|
||||||
return (ok, docdata, ipath, iseof)
|
|
||||||
|
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
self.currentindex = -1
|
self.currentindex = -1
|
||||||
try:
|
try:
|
||||||
self.tar = tarfile.open(name=params["filename:"],mode='r')
|
self.tar = tarfile.open(name=params["filename:"],mode='r')
|
||||||
self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
|
#self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
|
||||||
|
self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]]
|
||||||
|
|
||||||
return True
|
return True
|
||||||
except:
|
except:
|
||||||
return False
|
return False
|
||||||
|
|||||||
@ -1,10 +1,13 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Wrapping a text file. Recoll does it internally in most cases, but
|
||||||
|
# this is for use by another filter.
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
# Wrapping a text file. Recoll does it internally in most cases, but
|
|
||||||
# there is a reason this exists, just can't remember it ...
|
|
||||||
class TxtDump:
|
class TxtDump:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
@ -12,7 +15,7 @@ class TxtDump:
|
|||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||||
#params["mimetype:"]))
|
#params["mimetype:"]))
|
||||||
if not params.has_key("filename:"):
|
if not "filename:" in params:
|
||||||
self.em.rclog("extractone: no file name")
|
self.em.rclog("extractone: no file name")
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
# WAR web archive filter for recoll. War file are gzipped tar files
|
# WAR web archive filter for recoll. War file are gzipped tar files
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import tarfile
|
import tarfile
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
# Extractor for Excel files.
|
||||||
|
# Mso-dumper is not compatible with Python3
|
||||||
import rclexecm
|
import rclexecm
|
||||||
import rclexec1
|
import rclexec1
|
||||||
import xlsxmltocsv
|
import xlsxmltocsv
|
||||||
|
|||||||
@ -62,7 +62,7 @@ class XMLExtractor:
|
|||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
|
|
||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
if not params.has_key("filename:"):
|
if "filename:" not in params:
|
||||||
self.em.rclog("extractone: no mime or file name")
|
self.em.rclog("extractone: no mime or file name")
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
|
|||||||
@ -17,36 +17,54 @@
|
|||||||
|
|
||||||
# Helper module for xslt-based filters
|
# Helper module for xslt-based filters
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
try:
|
PY2 = sys.version < '3'
|
||||||
import libxml2
|
|
||||||
import libxslt
|
|
||||||
except:
|
|
||||||
print "RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1"
|
|
||||||
sys.exit(1);
|
|
||||||
|
|
||||||
libxml2.substituteEntitiesDefault(1)
|
if PY2:
|
||||||
|
try:
|
||||||
def apply_sheet_data(sheet, data):
|
import libxml2
|
||||||
styledoc = libxml2.parseMemory(sheet, len(sheet))
|
import libxslt
|
||||||
style = libxslt.parseStylesheetDoc(styledoc)
|
libxml2.substituteEntitiesDefault(1)
|
||||||
doc = libxml2.parseMemory(data, len(data))
|
except:
|
||||||
result = style.applyStylesheet(doc, None)
|
print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
|
||||||
res = style.saveResultToString(result)
|
sys.exit(1);
|
||||||
style.freeStylesheet()
|
def apply_sheet_data(sheet, data):
|
||||||
doc.freeDoc()
|
styledoc = libxml2.parseMemory(sheet, len(sheet))
|
||||||
result.freeDoc()
|
style = libxslt.parseStylesheetDoc(styledoc)
|
||||||
return res
|
doc = libxml2.parseMemory(data, len(data))
|
||||||
|
result = style.applyStylesheet(doc, None)
|
||||||
def apply_sheet_file(sheet, fn):
|
res = style.saveResultToString(result)
|
||||||
styledoc = libxml2.parseMemory(sheet, len(sheet))
|
style.freeStylesheet()
|
||||||
style = libxslt.parseStylesheetDoc(styledoc)
|
doc.freeDoc()
|
||||||
doc = libxml2.parseFile(fn)
|
result.freeDoc()
|
||||||
result = style.applyStylesheet(doc, None)
|
return res
|
||||||
res = style.saveResultToString(result)
|
def apply_sheet_file(sheet, fn):
|
||||||
style.freeStylesheet()
|
styledoc = libxml2.parseMemory(sheet, len(sheet))
|
||||||
doc.freeDoc()
|
style = libxslt.parseStylesheetDoc(styledoc)
|
||||||
result.freeDoc()
|
doc = libxml2.parseFile(fn)
|
||||||
return res
|
result = style.applyStylesheet(doc, None)
|
||||||
|
res = style.saveResultToString(result)
|
||||||
|
style.freeStylesheet()
|
||||||
|
doc.freeDoc()
|
||||||
|
result.freeDoc()
|
||||||
|
return res
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
from lxml import etree
|
||||||
|
except:
|
||||||
|
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
|
||||||
|
sys.exit(1);
|
||||||
|
def apply_sheet_data(sheet, data):
|
||||||
|
styledoc = etree.fromstring(sheet)
|
||||||
|
transform = etree.XSLT(styledoc)
|
||||||
|
doc = etree.fromstring(data)
|
||||||
|
return etree.tostring(transform(doc))
|
||||||
|
def apply_sheet_file(sheet, fn):
|
||||||
|
styledoc = etree.fromstring(sheet)
|
||||||
|
transform = etree.XSLT(styledoc)
|
||||||
|
doc = etree.parse(fn)
|
||||||
|
return etree.tostring(transform(doc))
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,24 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# Copyright (C) 2014 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
#
|
||||||
|
|
||||||
# Zip file filter for Recoll
|
# Zip file extractor for Recoll
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import fnmatch
|
import fnmatch
|
||||||
@ -78,9 +96,7 @@ class ZipExtractor:
|
|||||||
iseof = rclexecm.RclExecM.noteof
|
iseof = rclexecm.RclExecM.noteof
|
||||||
if self.currentindex >= len(self.zip.namelist()) -1:
|
if self.currentindex >= len(self.zip.namelist()) -1:
|
||||||
iseof = rclexecm.RclExecM.eofnext
|
iseof = rclexecm.RclExecM.eofnext
|
||||||
if isinstance(ipath, unicode):
|
return (ok, docdata, rclexecm.makebytes(ipath), iseof)
|
||||||
ipath = ipath.encode("utf-8")
|
|
||||||
return (ok, docdata, ipath, iseof)
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
###### File type handler api, used by rclexecm ---------->
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
@ -96,7 +112,14 @@ class ZipExtractor:
|
|||||||
self.skiplist = skipped.split(" ")
|
self.skiplist = skipped.split(" ")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.zip = ZipFile(filename)
|
if rclexecm.PY3:
|
||||||
|
# Note: python3 ZipFile wants an str file name, which
|
||||||
|
# is wrong: file names are binary. But it accepts an
|
||||||
|
# open file, and open() has no such restriction
|
||||||
|
f = open(filename, 'rb')
|
||||||
|
self.zip = ZipFile(f)
|
||||||
|
else:
|
||||||
|
self.zip = ZipFile(filename)
|
||||||
return True
|
return True
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("openfile: failed: [%s]" % err)
|
self.em.rclog("openfile: failed: [%s]" % err)
|
||||||
|
|||||||
@ -1,10 +1,14 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
#
|
#
|
||||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
# mso-dumper is not compatible with python3
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys, os.path, optparse
|
import sys, os.path, optparse
|
||||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||||
|
|
||||||
@ -97,7 +101,7 @@ class XLDumper(object):
|
|||||||
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
|
node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print >> sys.stderr, "xls-dump.py: error: %s" % err
|
print("xls-dump.py: error: %s" % err, file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
def dump (self):
|
def dump (self):
|
||||||
|
|||||||
@ -1,4 +1,9 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
# Transform XML output from xls-dump.py into csv format.
|
||||||
|
# Note: msodumper is not compatible with python3.
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import xml.sax
|
import xml.sax
|
||||||
@ -28,7 +33,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
if "value" in attrs:
|
if "value" in attrs:
|
||||||
value = attrs["value"].encode("UTF-8")
|
value = attrs["value"].encode("UTF-8")
|
||||||
else:
|
else:
|
||||||
value = unicode()
|
value = b''
|
||||||
if "col" in attrs:
|
if "col" in attrs:
|
||||||
self.cells[int(attrs["col"])] = value
|
self.cells[int(attrs["col"])] = value
|
||||||
else:
|
else:
|
||||||
@ -42,7 +47,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
def endElement(self, name, ):
|
def endElement(self, name, ):
|
||||||
if name == "row":
|
if name == "row":
|
||||||
curidx = 0
|
curidx = 0
|
||||||
for idx, value in self.cells.iteritems():
|
for idx, value in self.cells.items():
|
||||||
self.output += sepstring * (idx - curidx)
|
self.output += sepstring * (idx - curidx)
|
||||||
self.output += "%s%s%s" % (dquote, value, dquote)
|
self.output += "%s%s%s" % (dquote, value, dquote)
|
||||||
curidx = idx
|
curidx = idx
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user