python filters: htmlescape need not be an RclExecM member

This commit is contained in:
Jean-Francois Dockes 2020-03-27 17:19:40 +01:00
parent d1c99e83d7
commit a88c0114b1
14 changed files with 47 additions and 41 deletions

View File

@ -80,7 +80,7 @@ class DiaExtractor(RclBaseHandler):
diap.feed(dia) diap.feed(dia)
html = '<html><head><title></title></head><body><pre>' html = '<html><head><title></title></head><body><pre>'
html += self.em.htmlescape('\n'.join(diap.string)) html += rclexecm.htmlescape('\n'.join(diap.string))
html += '</pre></body></html>' html += '</pre></body></html>'
return html return html

View File

@ -66,15 +66,15 @@ class DJVUExtractor(RclBaseHandler):
txtdata = txtdata.decode('UTF-8', 'replace') txtdata = txtdata.decode('UTF-8', 'replace')
data = '''<html><head>''' data = '''<html><head>'''
data += '''<title>''' + self.em.htmlescape(title) + '''</title>''' data += '''<title>''' + rclexecm.htmlescape(title) + '''</title>'''
data += '''<meta http-equiv="Content-Type" ''' data += '''<meta http-equiv="Content-Type" '''
data += '''content="text/html;charset=UTF-8">''' data += '''content="text/html;charset=UTF-8">'''
if author: if author:
data += '''<meta name="author" content="''' + \ data += '''<meta name="author" content="''' + \
self.em.htmlescape(author) + '''">''' rclexecm.htmlescape(author) + '''">'''
data += '''</head><body><pre>''' data += '''</head><body><pre>'''
data += self.em.htmlescape(txtdata) data += rclexecm.htmlescape(txtdata)
data += '''</pre></body></html>''' data += '''</pre></body></html>'''
return data return data

View File

@ -51,7 +51,7 @@ class WordProcessData:
line = b'' line = b''
if line: if line:
self.out.append(self.em.htmlescape(line) + b'<br>') self.out.append(rclexecm.htmlescape(line) + b'<br>')
else: else:
self.out.append(b'<br>') self.out.append(b'<br>')

View File

@ -42,13 +42,13 @@ class rclEPUB:
author += name + " " author += name + " "
data = "<html>\n<head>\n" data = "<html>\n<head>\n"
if title: if title:
data += "<title>" + self.em.htmlescape(title) + "</title>\n" data += "<title>" + rclexecm.htmlescape(title) + "</title>\n"
if author: if author:
data += '<meta name="author" content="' + \ data += '<meta name="author" content="' + \
self.em.htmlescape(author).strip() + '">\n' rclexecm.htmlescape(author).strip() + '">\n'
if meta.description: if meta.description:
data += '<meta name="description" content="' + \ data += '<meta name="description" content="' + \
self.em.htmlescape(meta.description) + '">\n' rclexecm.htmlescape(meta.description) + '">\n'
data = data.encode('UTF-8') data = data.encode('UTF-8')
self.em.setmimetype('text/html') self.em.setmimetype('text/html')
if len(self.contents) == 0: if len(self.contents) == 0:

View File

@ -33,13 +33,13 @@ class EPUBConcatExtractor(RclBaseHandler):
author += name + " " author += name + " "
data = "<html>\n<head>\n" data = "<html>\n<head>\n"
if title: if title:
data += "<title>" + self.em.htmlescape(title) + "</title>\n" data += "<title>" + rclexecm.htmlescape(title) + "</title>\n"
if author: if author:
data += '<meta name="author" content="' + \ data += '<meta name="author" content="' + \
self.em.htmlescape(author).strip() + '">\n' rclexecm.htmlescape(author).strip() + '">\n'
if meta.description: if meta.description:
data += '<meta name="description" content="' + \ data += '<meta name="description" content="' + \
self.em.htmlescape(meta.description) + '">\n' rclexecm.htmlescape(meta.description) + '">\n'
data += "</head><body>" data += "</head><body>"
data = data.encode('UTF-8') data = data.encode('UTF-8')

View File

@ -33,22 +33,26 @@ import cmdtalk
PY3 = (sys.version > '3') PY3 = (sys.version > '3')
_mswindows = (sys.platform == "win32") _mswindows = (sys.platform == "win32")
# Convert to bytes if not already such.
def makebytes(data): def makebytes(data):
if type(data) == type(u''): if type(data) == type(u''):
return data.encode("UTF-8") return data.encode("UTF-8")
return data return data
# Possibly decode binary file name for use as subprocess argument,
# depending on platform.
def subprocfile(fn): def subprocfile(fn):
# On Windows PY3 the list2cmdline() method in subprocess assumes that # On Windows PY3 the list2cmdline() method in subprocess assumes that
# all args are str, and we receive file names as UTF-8. So we need # all args are str, and we receive file names as UTF-8. So we need
# to convert. # to convert.
# On Unix all list elements get converted to bytes in the C # On Unix all list elements get converted to bytes in the C
# _posixsubprocess module, nothing to do # _posixsubprocess module, nothing to do.
if PY3 and _mswindows: if PY3 and _mswindows:
return fn.decode('UTF-8') return fn.decode('UTF-8')
else: else:
return fn return fn
# Check for truthiness of rclconfig value.
def configparamtrue(value): def configparamtrue(value):
if not value: if not value:
return False return False
@ -64,13 +68,27 @@ def configparamtrue(value):
return True return True
return False return False
# Escape special characters in plain text for inclusion in HTML doc.
# Note: tried replacing this with a multiple replacer according to
# http://stackoverflow.com/a/15221068, which was **10 times** slower
def htmlescape(txt):
    """Escape HTML special characters in plain text.

    Accepts either a bytes or a text string and returns a value of the
    same type, with '&', '<', '>' and '"' replaced by the matching
    HTML entities.
    """
    # '&' must be replaced first: the other replacements insert
    # ampersands of their own, which must not be escaped again.
    try:
        return txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
            replace(b'>', b'&gt;').replace(b'"', b'&quot;')
    except TypeError:
        # Text string: str.replace() rejects bytes patterns, so redo the
        # substitutions with text patterns. Only TypeError is caught so
        # that unrelated errors (and KeyboardInterrupt) are not masked.
        return txt.replace("&", "&amp;").replace("<", "&lt;").\
            replace(">", "&gt;").replace("\"", "&quot;")
my_config = rclconfig.RclConfig() my_config = rclconfig.RclConfig()
############################################ ############################################
# RclExecM implements the communication protocol with the recollindex # RclExecM implements the communication protocol with the recollindex
# process. It calls the object specific of the document type to # process. It calls the object specific of the document type to
# actually get the data. # actually get the data.
class RclExecM(cmdtalk.CmdTalk): class RclExecM(cmdtalk.CmdTalk):
noteof = 0 noteof = 0
eofnext = 1 eofnext = 1
@ -103,19 +121,6 @@ class RclExecM(cmdtalk.CmdTalk):
if self.debugfile or sys.platform != "win32": if self.debugfile or sys.platform != "win32":
super().log(s, doexit, exitvalue) super().log(s, doexit, exitvalue)
# Note: tried replacing this with a multiple replacer according to
# http://stackoverflow.com/a/15221068, which was **10 times** slower
def htmlescape(self, txt):
# &amp must stay first (it somehow had managed to skip
# after the next replace, with rather interesting results)
try:
txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
replace(b'>', b'&gt;').replace(b'"', b'&quot;')
except:
txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
replace(">", "&gt;").replace("\"", "&quot;")
return txt
# Our worker sometimes knows the mime types of the data it sends # Our worker sometimes knows the mime types of the data it sends
def setmimetype(self, mt): def setmimetype(self, mt):
self.mimetype = makebytes(mt) self.mimetype = makebytes(mt)

View File

@ -59,14 +59,14 @@ class HWP5Dump(RclBaseHandler):
try: try:
tt = hwpfile.summaryinfo.title.strip() tt = hwpfile.summaryinfo.title.strip()
if tt: if tt:
tt = self.em.htmlescape(tt.encode('utf-8')) tt = rclexecm.htmlescape(tt.encode('utf-8'))
self.em.setfield('caption', tt) self.em.setfield('caption', tt)
for k,v in metafields(hwpfile.summaryinfo): for k,v in metafields(hwpfile.summaryinfo):
v = "{0}".format(v) v = "{0}".format(v)
v = v.strip() v = v.strip()
if v: if v:
v = self.em.htmlescape(v.encode('utf-8')) v = rclexecm.htmlescape(v.encode('utf-8'))
k = k.encode('utf-8') k = k.encode('utf-8')
self.em.setfield(k, v) self.em.setfield(k, v)
except Exception as e: except Exception as e:

View File

@ -63,7 +63,7 @@ class ImgTagExtractor(RclBaseHandler):
ttdata = set() ttdata = set()
for k in pyexiv2_titles: for k in pyexiv2_titles:
if k in mdic: if k in mdic:
ttdata.add(self.em.htmlescape(mdic[k])) ttdata.add(rclexecm.htmlescape(mdic[k]))
if ttdata: if ttdata:
title = "" title = ""
for v in ttdata: for v in ttdata:
@ -83,13 +83,13 @@ class ImgTagExtractor(RclBaseHandler):
for k,v in mdic.items(): for k,v in mdic.items():
if k == 'Xmp.digiKam.TagsList': if k == 'Xmp.digiKam.TagsList':
docdata += b'<meta name="keywords" content="' + \ docdata += b'<meta name="keywords" content="' + \
rclexecm.makebytes(self.em.htmlescape(mdic[k])) + \ rclexecm.makebytes(rclexecm.htmlescape(mdic[k])) + \
b'">\n' b'">\n'
docdata += b'</head><body>\n' docdata += b'</head><body>\n'
for k,v in mdic.items(): for k,v in mdic.items():
docdata += rclexecm.makebytes(k + " : " + \ docdata += rclexecm.makebytes(k + " : " + \
self.em.htmlescape(mdic[k]) + "<br />\n") rclexecm.htmlescape(mdic[k]) + "<br />\n")
docdata += b'</body></html>' docdata += b'</body></html>'
return docdata return docdata

View File

@ -30,8 +30,8 @@ class InfoExtractor:
return(False, "", "", True) return(False, "", "", True)
nodename, docdata = self.contents[index] nodename, docdata = self.contents[index]
nodename = self.em.htmlescape(nodename) nodename = rclexecm.htmlescape(nodename)
docdata = self.em.htmlescape(docdata) docdata = rclexecm.htmlescape(docdata)
# strange whitespace to avoid changing the module tests (same as old) # strange whitespace to avoid changing the module tests (same as old)
docdata = b'\n<html>\n <head>\n <title>' + \ docdata = b'\n<html>\n <head>\n <title>' + \
nodename + \ nodename + \

View File

@ -126,7 +126,8 @@ class KarTextExtractor(RclBaseHandler):
self.em.rclog("Encode failed: " + str(err)) self.em.rclog("Encode failed: " + str(err))
return "" return ""
data = self.em.htmlescape(data).decode('utf-8').replace('\n', '<br>\n') data = rclexecm.htmlescape(data).decode('utf-8').replace('\n',
'<br>\n')
return data return data

View File

@ -247,7 +247,7 @@ class PDFExtractor:
if not m: if not m:
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line) m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
if m: if m:
line = m.group(1) + self.em.htmlescape(m.group(2)) + \ line = m.group(1) + rclexecm.htmlescape(m.group(2)) + \
m.group(3) m.group(3)
# Recoll treats "Subject" as a "title" element # Recoll treats "Subject" as a "title" element
@ -262,7 +262,7 @@ class PDFExtractor:
# We used to remove end-of-line hyphenation (and join # We used to remove end-of-line hyphenation (and join
# lines), but it's not clear that we should do # lines), but it's not clear that we should do
# this as pdftotext without the -layout option does it ? # this as pdftotext without the -layout option does it ?
line = self.em.htmlescape(line) line = rclexecm.htmlescape(line)
if re.search(b'<head>', line): if re.search(b'<head>', line):
inheader = True inheader = True
@ -275,7 +275,7 @@ class PDFExtractor:
def _metatag(self, nm, val): def _metatag(self, nm, val):
return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \ return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \
self.em.htmlescape(rclexecm.makebytes(val)) + b"\">" rclexecm.htmlescape(rclexecm.makebytes(val)) + b"\">"
# metaheaders is a list of (nm, value) pairs # metaheaders is a list of (nm, value) pairs
def _injectmeta(self, html, metaheaders): def _injectmeta(self, html, metaheaders):
@ -409,7 +409,7 @@ class PDFExtractor:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename] self.filename]
data = subprocess.check_output(cmd) data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix
except Exception as e: except Exception as e:
self.em.rclog("%s failed: %s" % (cmd, e)) self.em.rclog("%s failed: %s" % (cmd, e))
pass pass

View File

@ -23,7 +23,7 @@ class PPTProcessData:
b'content="text/html;charset=UTF-8">' + \ b'content="text/html;charset=UTF-8">' + \
b'</head><body><pre>') b'</head><body><pre>')
self.gotdata = True self.gotdata = True
self.out.append(self.em.htmlescape(line)) self.out.append(rclexecm.htmlescape(line))
def wrapData(self): def wrapData(self):
return b'\n'.join(self.out) + b'''</pre></body></html>''' return b'\n'.join(self.out) + b'''</pre></body></html>'''

View File

@ -32,7 +32,7 @@ class TxtDump(RclBaseHandler):
# No charset, so recoll will have to use its config to guess it # No charset, so recoll will have to use its config to guess it
html = b'<html><head><title></title></head><body><pre>' html = b'<html><head><title></title></head><body><pre>'
with open(fn, "rb") as f: with open(fn, "rb") as f:
html += self.em.htmlescape(f.read()) html += rclexecm.htmlescape(f.read())
html += b'</pre></body></html>' html += b'</pre></body></html>'
return html return html

View File

@ -40,7 +40,7 @@ class XLSProcessData:
return b'\n'.join(self.out) return b'\n'.join(self.out)
handler = xlsxmltocsv.XlsXmlHandler() handler = xlsxmltocsv.XlsXmlHandler()
xml.sax.parseString(b'\n'.join(self.xmldata), handler) xml.sax.parseString(b'\n'.join(self.xmldata), handler)
self.out.append(self.em.htmlescape(b'\n'.join(handler.output))) self.out.append(rclexecm.htmlescape(b'\n'.join(handler.output)))
return b'\n'.join(self.out) + b'</pre></body></html>' return b'\n'.join(self.out) + b'</pre></body></html>'
class XLSFilter: class XLSFilter: