python filters: htmlescape needs not be an RclExecM member
This commit is contained in:
parent
d1c99e83d7
commit
a88c0114b1
@ -80,7 +80,7 @@ class DiaExtractor(RclBaseHandler):
|
|||||||
diap.feed(dia)
|
diap.feed(dia)
|
||||||
|
|
||||||
html = '<html><head><title></title></head><body><pre>'
|
html = '<html><head><title></title></head><body><pre>'
|
||||||
html += self.em.htmlescape('\n'.join(diap.string))
|
html += rclexecm.htmlescape('\n'.join(diap.string))
|
||||||
html += '</pre></body></html>'
|
html += '</pre></body></html>'
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
|||||||
@ -66,15 +66,15 @@ class DJVUExtractor(RclBaseHandler):
|
|||||||
txtdata = txtdata.decode('UTF-8', 'replace')
|
txtdata = txtdata.decode('UTF-8', 'replace')
|
||||||
|
|
||||||
data = '''<html><head>'''
|
data = '''<html><head>'''
|
||||||
data += '''<title>''' + self.em.htmlescape(title) + '''</title>'''
|
data += '''<title>''' + rclexecm.htmlescape(title) + '''</title>'''
|
||||||
data += '''<meta http-equiv="Content-Type" '''
|
data += '''<meta http-equiv="Content-Type" '''
|
||||||
data += '''content="text/html;charset=UTF-8">'''
|
data += '''content="text/html;charset=UTF-8">'''
|
||||||
if author:
|
if author:
|
||||||
data += '''<meta name="author" content="''' + \
|
data += '''<meta name="author" content="''' + \
|
||||||
self.em.htmlescape(author) + '''">'''
|
rclexecm.htmlescape(author) + '''">'''
|
||||||
data += '''</head><body><pre>'''
|
data += '''</head><body><pre>'''
|
||||||
|
|
||||||
data += self.em.htmlescape(txtdata)
|
data += rclexecm.htmlescape(txtdata)
|
||||||
data += '''</pre></body></html>'''
|
data += '''</pre></body></html>'''
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|||||||
@ -51,7 +51,7 @@ class WordProcessData:
|
|||||||
line = b''
|
line = b''
|
||||||
|
|
||||||
if line:
|
if line:
|
||||||
self.out.append(self.em.htmlescape(line) + b'<br>')
|
self.out.append(rclexecm.htmlescape(line) + b'<br>')
|
||||||
else:
|
else:
|
||||||
self.out.append(b'<br>')
|
self.out.append(b'<br>')
|
||||||
|
|
||||||
|
|||||||
@ -42,13 +42,13 @@ class rclEPUB:
|
|||||||
author += name + " "
|
author += name + " "
|
||||||
data = "<html>\n<head>\n"
|
data = "<html>\n<head>\n"
|
||||||
if title:
|
if title:
|
||||||
data += "<title>" + self.em.htmlescape(title) + "</title>\n"
|
data += "<title>" + rclexecm.htmlescape(title) + "</title>\n"
|
||||||
if author:
|
if author:
|
||||||
data += '<meta name="author" content="' + \
|
data += '<meta name="author" content="' + \
|
||||||
self.em.htmlescape(author).strip() + '">\n'
|
rclexecm.htmlescape(author).strip() + '">\n'
|
||||||
if meta.description:
|
if meta.description:
|
||||||
data += '<meta name="description" content="' + \
|
data += '<meta name="description" content="' + \
|
||||||
self.em.htmlescape(meta.description) + '">\n'
|
rclexecm.htmlescape(meta.description) + '">\n'
|
||||||
data = data.encode('UTF-8')
|
data = data.encode('UTF-8')
|
||||||
self.em.setmimetype('text/html')
|
self.em.setmimetype('text/html')
|
||||||
if len(self.contents) == 0:
|
if len(self.contents) == 0:
|
||||||
|
|||||||
@ -33,13 +33,13 @@ class EPUBConcatExtractor(RclBaseHandler):
|
|||||||
author += name + " "
|
author += name + " "
|
||||||
data = "<html>\n<head>\n"
|
data = "<html>\n<head>\n"
|
||||||
if title:
|
if title:
|
||||||
data += "<title>" + self.em.htmlescape(title) + "</title>\n"
|
data += "<title>" + rclexecm.htmlescape(title) + "</title>\n"
|
||||||
if author:
|
if author:
|
||||||
data += '<meta name="author" content="' + \
|
data += '<meta name="author" content="' + \
|
||||||
self.em.htmlescape(author).strip() + '">\n'
|
rclexecm.htmlescape(author).strip() + '">\n'
|
||||||
if meta.description:
|
if meta.description:
|
||||||
data += '<meta name="description" content="' + \
|
data += '<meta name="description" content="' + \
|
||||||
self.em.htmlescape(meta.description) + '">\n'
|
rclexecm.htmlescape(meta.description) + '">\n'
|
||||||
data += "</head><body>"
|
data += "</head><body>"
|
||||||
data = data.encode('UTF-8')
|
data = data.encode('UTF-8')
|
||||||
|
|
||||||
|
|||||||
@ -33,22 +33,26 @@ import cmdtalk
|
|||||||
PY3 = (sys.version > '3')
|
PY3 = (sys.version > '3')
|
||||||
_mswindows = (sys.platform == "win32")
|
_mswindows = (sys.platform == "win32")
|
||||||
|
|
||||||
|
# Convert to bytes if not already such.
|
||||||
def makebytes(data):
|
def makebytes(data):
|
||||||
if type(data) == type(u''):
|
if type(data) == type(u''):
|
||||||
return data.encode("UTF-8")
|
return data.encode("UTF-8")
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
# Possibly decode binary file name for use as subprocess argument,
|
||||||
|
# depending on platform.
|
||||||
def subprocfile(fn):
|
def subprocfile(fn):
|
||||||
# On Windows PY3 the list2cmdline() method in subprocess assumes that
|
# On Windows PY3 the list2cmdline() method in subprocess assumes that
|
||||||
# all args are str, and we receive file names as UTF-8. So we need
|
# all args are str, and we receive file names as UTF-8. So we need
|
||||||
# to convert.
|
# to convert.
|
||||||
# On Unix all list elements get converted to bytes in the C
|
# On Unix all list elements get converted to bytes in the C
|
||||||
# _posixsubprocess module, nothing to do
|
# _posixsubprocess module, nothing to do.
|
||||||
if PY3 and _mswindows:
|
if PY3 and _mswindows:
|
||||||
return fn.decode('UTF-8')
|
return fn.decode('UTF-8')
|
||||||
else:
|
else:
|
||||||
return fn
|
return fn
|
||||||
|
|
||||||
|
# Check for truthiness of rclconfig value.
|
||||||
def configparamtrue(value):
|
def configparamtrue(value):
|
||||||
if not value:
|
if not value:
|
||||||
return False
|
return False
|
||||||
@ -64,13 +68,27 @@ def configparamtrue(value):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Escape special characters in plain text for inclusion in HTML doc.
|
||||||
|
# Note: tried replacing this with a multiple replacer according to
|
||||||
|
# http://stackoverflow.com/a/15221068, which was **10 times** slower
|
||||||
|
def htmlescape(txt):
|
||||||
|
# & must stay first (it somehow had managed to skip
|
||||||
|
# after the next replace, with rather interesting results)
|
||||||
|
try:
|
||||||
|
txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
|
||||||
|
replace(b'>', b'&gt;').replace(b'"', b'&quot;')
|
||||||
|
except:
|
||||||
|
txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
|
||||||
|
replace(">", "&gt;").replace("\"", "&quot;")
|
||||||
|
return txt
|
||||||
|
|
||||||
|
|
||||||
my_config = rclconfig.RclConfig()
|
my_config = rclconfig.RclConfig()
|
||||||
|
|
||||||
############################################
|
############################################
|
||||||
# RclExecM implements the communication protocol with the recollindex
|
# RclExecM implements the communication protocol with the recollindex
|
||||||
# process. It calls the object specific of the document type to
|
# process. It calls the object specific of the document type to
|
||||||
# actually get the data.
|
# actually get the data.
|
||||||
|
|
||||||
class RclExecM(cmdtalk.CmdTalk):
|
class RclExecM(cmdtalk.CmdTalk):
|
||||||
noteof = 0
|
noteof = 0
|
||||||
eofnext = 1
|
eofnext = 1
|
||||||
@ -103,19 +121,6 @@ class RclExecM(cmdtalk.CmdTalk):
|
|||||||
if self.debugfile or sys.platform != "win32":
|
if self.debugfile or sys.platform != "win32":
|
||||||
super().log(s, doexit, exitvalue)
|
super().log(s, doexit, exitvalue)
|
||||||
|
|
||||||
# Note: tried replacing this with a multiple replacer according to
|
|
||||||
# http://stackoverflow.com/a/15221068, which was **10 times** slower
|
|
||||||
def htmlescape(self, txt):
|
|
||||||
# & must stay first (it somehow had managed to skip
|
|
||||||
# after the next replace, with rather interesting results)
|
|
||||||
try:
|
|
||||||
txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
|
|
||||||
replace(b'>', b'&gt;').replace(b'"', b'&quot;')
|
|
||||||
except:
|
|
||||||
txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
|
|
||||||
replace(">", "&gt;").replace("\"", "&quot;")
|
|
||||||
return txt
|
|
||||||
|
|
||||||
# Our worker sometimes knows the mime types of the data it sends
|
# Our worker sometimes knows the mime types of the data it sends
|
||||||
def setmimetype(self, mt):
|
def setmimetype(self, mt):
|
||||||
self.mimetype = makebytes(mt)
|
self.mimetype = makebytes(mt)
|
||||||
|
|||||||
@ -59,14 +59,14 @@ class HWP5Dump(RclBaseHandler):
|
|||||||
try:
|
try:
|
||||||
tt = hwpfile.summaryinfo.title.strip()
|
tt = hwpfile.summaryinfo.title.strip()
|
||||||
if tt:
|
if tt:
|
||||||
tt = self.em.htmlescape(tt.encode('utf-8'))
|
tt = rclexecm.htmlescape(tt.encode('utf-8'))
|
||||||
self.em.setfield('caption', tt)
|
self.em.setfield('caption', tt)
|
||||||
|
|
||||||
for k,v in metafields(hwpfile.summaryinfo):
|
for k,v in metafields(hwpfile.summaryinfo):
|
||||||
v = "{0}".format(v)
|
v = "{0}".format(v)
|
||||||
v = v.strip()
|
v = v.strip()
|
||||||
if v:
|
if v:
|
||||||
v = self.em.htmlescape(v.encode('utf-8'))
|
v = rclexecm.htmlescape(v.encode('utf-8'))
|
||||||
k = k.encode('utf-8')
|
k = k.encode('utf-8')
|
||||||
self.em.setfield(k, v)
|
self.em.setfield(k, v)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -63,7 +63,7 @@ class ImgTagExtractor(RclBaseHandler):
|
|||||||
ttdata = set()
|
ttdata = set()
|
||||||
for k in pyexiv2_titles:
|
for k in pyexiv2_titles:
|
||||||
if k in mdic:
|
if k in mdic:
|
||||||
ttdata.add(self.em.htmlescape(mdic[k]))
|
ttdata.add(rclexecm.htmlescape(mdic[k]))
|
||||||
if ttdata:
|
if ttdata:
|
||||||
title = ""
|
title = ""
|
||||||
for v in ttdata:
|
for v in ttdata:
|
||||||
@ -83,13 +83,13 @@ class ImgTagExtractor(RclBaseHandler):
|
|||||||
for k,v in mdic.items():
|
for k,v in mdic.items():
|
||||||
if k == 'Xmp.digiKam.TagsList':
|
if k == 'Xmp.digiKam.TagsList':
|
||||||
docdata += b'<meta name="keywords" content="' + \
|
docdata += b'<meta name="keywords" content="' + \
|
||||||
rclexecm.makebytes(self.em.htmlescape(mdic[k])) + \
|
rclexecm.makebytes(rclexecm.htmlescape(mdic[k])) + \
|
||||||
b'">\n'
|
b'">\n'
|
||||||
|
|
||||||
docdata += b'</head><body>\n'
|
docdata += b'</head><body>\n'
|
||||||
for k,v in mdic.items():
|
for k,v in mdic.items():
|
||||||
docdata += rclexecm.makebytes(k + " : " + \
|
docdata += rclexecm.makebytes(k + " : " + \
|
||||||
self.em.htmlescape(mdic[k]) + "<br />\n")
|
rclexecm.htmlescape(mdic[k]) + "<br />\n")
|
||||||
docdata += b'</body></html>'
|
docdata += b'</body></html>'
|
||||||
|
|
||||||
return docdata
|
return docdata
|
||||||
|
|||||||
@ -30,8 +30,8 @@ class InfoExtractor:
|
|||||||
return(False, "", "", True)
|
return(False, "", "", True)
|
||||||
|
|
||||||
nodename, docdata = self.contents[index]
|
nodename, docdata = self.contents[index]
|
||||||
nodename = self.em.htmlescape(nodename)
|
nodename = rclexecm.htmlescape(nodename)
|
||||||
docdata = self.em.htmlescape(docdata)
|
docdata = rclexecm.htmlescape(docdata)
|
||||||
# strange whitespace to avoid changing the module tests (same as old)
|
# strange whitespace to avoid changing the module tests (same as old)
|
||||||
docdata = b'\n<html>\n <head>\n <title>' + \
|
docdata = b'\n<html>\n <head>\n <title>' + \
|
||||||
nodename + \
|
nodename + \
|
||||||
|
|||||||
@ -126,7 +126,8 @@ class KarTextExtractor(RclBaseHandler):
|
|||||||
self.em.rclog("Encode failed: " + str(err))
|
self.em.rclog("Encode failed: " + str(err))
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
data = self.em.htmlescape(data).decode('utf-8').replace('\n', '<br>\n')
|
data = rclexecm.htmlescape(data).decode('utf-8').replace('\n',
|
||||||
|
'<br>\n')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -247,7 +247,7 @@ class PDFExtractor:
|
|||||||
if not m:
|
if not m:
|
||||||
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
|
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
|
||||||
if m:
|
if m:
|
||||||
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
line = m.group(1) + rclexecm.htmlescape(m.group(2)) + \
|
||||||
m.group(3)
|
m.group(3)
|
||||||
|
|
||||||
# Recoll treats "Subject" as a "title" element
|
# Recoll treats "Subject" as a "title" element
|
||||||
@ -262,7 +262,7 @@ class PDFExtractor:
|
|||||||
# We used to remove end-of-line hyphenation (and join
|
# We used to remove end-of-line hyphenation (and join
|
||||||
# lines), but it's not clear that we should do
|
# lines), but it's not clear that we should do
|
||||||
# this as pdftotext without the -layout option does it ?
|
# this as pdftotext without the -layout option does it ?
|
||||||
line = self.em.htmlescape(line)
|
line = rclexecm.htmlescape(line)
|
||||||
|
|
||||||
if re.search(b'<head>', line):
|
if re.search(b'<head>', line):
|
||||||
inheader = True
|
inheader = True
|
||||||
@ -275,7 +275,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
def _metatag(self, nm, val):
|
def _metatag(self, nm, val):
|
||||||
return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \
|
return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \
|
||||||
self.em.htmlescape(rclexecm.makebytes(val)) + b"\">"
|
rclexecm.htmlescape(rclexecm.makebytes(val)) + b"\">"
|
||||||
|
|
||||||
# metaheaders is a list of (nm, value) pairs
|
# metaheaders is a list of (nm, value) pairs
|
||||||
def _injectmeta(self, html, metaheaders):
|
def _injectmeta(self, html, metaheaders):
|
||||||
@ -409,7 +409,7 @@ class PDFExtractor:
|
|||||||
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||||
self.filename]
|
self.filename]
|
||||||
data = subprocess.check_output(cmd)
|
data = subprocess.check_output(cmd)
|
||||||
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.em.rclog("%s failed: %s" % (cmd, e))
|
self.em.rclog("%s failed: %s" % (cmd, e))
|
||||||
pass
|
pass
|
||||||
|
|||||||
@ -23,7 +23,7 @@ class PPTProcessData:
|
|||||||
b'content="text/html;charset=UTF-8">' + \
|
b'content="text/html;charset=UTF-8">' + \
|
||||||
b'</head><body><pre>')
|
b'</head><body><pre>')
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
self.out.append(self.em.htmlescape(line))
|
self.out.append(rclexecm.htmlescape(line))
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
return b'\n'.join(self.out) + b'''</pre></body></html>'''
|
return b'\n'.join(self.out) + b'''</pre></body></html>'''
|
||||||
|
|||||||
@ -32,7 +32,7 @@ class TxtDump(RclBaseHandler):
|
|||||||
# No charset, so recoll will have to use its config to guess it
|
# No charset, so recoll will have to use its config to guess it
|
||||||
html = b'<html><head><title></title></head><body><pre>'
|
html = b'<html><head><title></title></head><body><pre>'
|
||||||
with open(fn, "rb") as f:
|
with open(fn, "rb") as f:
|
||||||
html += self.em.htmlescape(f.read())
|
html += rclexecm.htmlescape(f.read())
|
||||||
html += b'</pre></body></html>'
|
html += b'</pre></body></html>'
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|||||||
@ -40,7 +40,7 @@ class XLSProcessData:
|
|||||||
return b'\n'.join(self.out)
|
return b'\n'.join(self.out)
|
||||||
handler = xlsxmltocsv.XlsXmlHandler()
|
handler = xlsxmltocsv.XlsXmlHandler()
|
||||||
xml.sax.parseString(b'\n'.join(self.xmldata), handler)
|
xml.sax.parseString(b'\n'.join(self.xmldata), handler)
|
||||||
self.out.append(self.em.htmlescape(b'\n'.join(handler.output)))
|
self.out.append(rclexecm.htmlescape(b'\n'.join(handler.output)))
|
||||||
return b'\n'.join(self.out) + b'</pre></body></html>'
|
return b'\n'.join(self.out) + b'</pre></body></html>'
|
||||||
|
|
||||||
class XLSFilter:
|
class XLSFilter:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user