From a88c0114b1dcafa08b559896a50863b4bdb246c3 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 27 Mar 2020 17:19:40 +0100 Subject: [PATCH] python filters: htmlescape needs not be an RclExecM member --- src/filters/rcldia | 2 +- src/filters/rcldjvu.py | 6 +++--- src/filters/rcldoc.py | 2 +- src/filters/rclepub | 6 +++--- src/filters/rclepub1 | 6 +++--- src/filters/rclexecm.py | 35 ++++++++++++++++++++--------------- src/filters/rclhwp.py | 4 ++-- src/filters/rclimg.py | 6 +++--- src/filters/rclinfo | 4 ++-- src/filters/rclkar | 3 ++- src/filters/rclpdf.py | 8 ++++---- src/filters/rclppt.py | 2 +- src/filters/rcltext.py | 2 +- src/filters/rclxls.py | 2 +- 14 files changed, 47 insertions(+), 41 deletions(-) diff --git a/src/filters/rcldia b/src/filters/rcldia index 64209507..3869bced 100755 --- a/src/filters/rcldia +++ b/src/filters/rcldia @@ -80,7 +80,7 @@ class DiaExtractor(RclBaseHandler): diap.feed(dia) html = '
'
-        html += self.em.htmlescape('\n'.join(diap.string))
+        html += rclexecm.htmlescape('\n'.join(diap.string))
         html += '
' return html diff --git a/src/filters/rcldjvu.py b/src/filters/rcldjvu.py index 5368ae32..98e12f66 100755 --- a/src/filters/rcldjvu.py +++ b/src/filters/rcldjvu.py @@ -66,15 +66,15 @@ class DJVUExtractor(RclBaseHandler): txtdata = txtdata.decode('UTF-8', 'replace') data = '''''' - data += '''''' + self.em.htmlescape(title) + '''''' + data += '''''' + rclexecm.htmlescape(title) + '''''' data += '''''' if author: data += '''''' + rclexecm.htmlescape(author) + '''">''' data += '''
'''
 
-        data += self.em.htmlescape(txtdata)
+        data += rclexecm.htmlescape(txtdata)
         data += '''
''' return data diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py index 104c420d..dc303b40 100755 --- a/src/filters/rcldoc.py +++ b/src/filters/rcldoc.py @@ -51,7 +51,7 @@ class WordProcessData: line = b'' if line: - self.out.append(self.em.htmlescape(line) + b'
') + self.out.append(rclexecm.htmlescape(line) + b'
') else: self.out.append(b'
') diff --git a/src/filters/rclepub b/src/filters/rclepub index c98d369f..32c55315 100755 --- a/src/filters/rclepub +++ b/src/filters/rclepub @@ -42,13 +42,13 @@ class rclEPUB: author += name + " " data = "\n\n" if title: - data += "" + self.em.htmlescape(title) + "\n" + data += "" + rclexecm.htmlescape(title) + "\n" if author: data += '\n' + rclexecm.htmlescape(author).strip() + '">\n' if meta.description: data += '\n' + rclexecm.htmlescape(meta.description) + '">\n' data = data.encode('UTF-8') self.em.setmimetype('text/html') if len(self.contents) == 0: diff --git a/src/filters/rclepub1 b/src/filters/rclepub1 index 86b6058f..e9574727 100755 --- a/src/filters/rclepub1 +++ b/src/filters/rclepub1 @@ -33,13 +33,13 @@ class EPUBConcatExtractor(RclBaseHandler): author += name + " " data = "\n\n" if title: - data += "" + self.em.htmlescape(title) + "\n" + data += "" + rclexecm.htmlescape(title) + "\n" if author: data += '\n' + rclexecm.htmlescape(author).strip() + '">\n' if meta.description: data += '\n' + rclexecm.htmlescape(meta.description) + '">\n' data += "" data = data.encode('UTF-8') diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index b3142acc..95eea9ca 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -33,22 +33,26 @@ import cmdtalk PY3 = (sys.version > '3') _mswindows = (sys.platform == "win32") +# Convert to bytes if not already such. def makebytes(data): if type(data) == type(u''): return data.encode("UTF-8") return data +# Possibly decode binary file name for use as subprocess argument, +# depending on platform. def subprocfile(fn): # On Windows PY3 the list2cmdline() method in subprocess assumes that # all args are str, and we receive file names as UTF-8. So we need # to convert. # On Unix all list elements get converted to bytes in the C - # _posixsubprocess module, nothing to do + # _posixsubprocess module, nothing to do. if PY3 and _mswindows: return fn.decode('UTF-8') else: return fn +# Check for truthness of rclconfig value. def configparamtrue(value): if not value: return False @@ -64,13 +68,27 @@ def configparamtrue(value): return True return False +# Escape special characters in plain text for inclusion in HTML doc. +# Note: tried replacing this with a multiple replacer according to +# http://stackoverflow.com/a/15221068, which was **10 times** slower +def htmlescape(txt): + # & must stay first (it somehow had managed to skip + # after the next replace, with rather interesting results) + try: + txt = txt.replace(b'&', b'&').replace(b'<', b'<').\ + replace(b'>', b'>').replace(b'"', b'"') + except: + txt = txt.replace("&", "&").replace("<", "<").\ + replace(">", ">").replace("\"", """) + return txt + + my_config = rclconfig.RclConfig() ############################################ # RclExecM implements the communication protocol with the recollindex # process. It calls the object specific of the document type to # actually get the data. - class RclExecM(cmdtalk.CmdTalk): noteof = 0 eofnext = 1 @@ -103,19 +121,6 @@ class RclExecM(cmdtalk.CmdTalk): if self.debugfile or sys.platform != "win32": super().log(s, doexit, exitvalue) - # Note: tried replacing this with a multiple replacer according to - # http://stackoverflow.com/a/15221068, which was **10 times** slower - def htmlescape(self, txt): - # & must stay first (it somehow had managed to skip - # after the next replace, with rather interesting results) - try: - txt = txt.replace(b'&', b'&').replace(b'<', b'<').\ - replace(b'>', b'>').replace(b'"', b'"') - except: - txt = txt.replace("&", "&").replace("<", "<").\ - replace(">", ">").replace("\"", """) - return txt - # Our worker sometimes knows the mime types of the data it sends def setmimetype(self, mt): self.mimetype = makebytes(mt) diff --git a/src/filters/rclhwp.py b/src/filters/rclhwp.py index 40f6ae13..0bb63b3d 100755 --- a/src/filters/rclhwp.py +++ b/src/filters/rclhwp.py @@ -59,14 +59,14 @@ class HWP5Dump(RclBaseHandler): try: tt = hwpfile.summaryinfo.title.strip() if tt: - tt = self.em.htmlescape(tt.encode('utf-8')) + tt = rclexecm.htmlescape(tt.encode('utf-8')) self.em.setfield('caption', tt) for k,v in metafields(hwpfile.summaryinfo): v = "{0}".format(v) v = v.strip() if v: - v = self.em.htmlescape(v.encode('utf-8')) + v = rclexecm.htmlescape(v.encode('utf-8')) k = k.encode('utf-8') self.em.setfield(k, v) except Exception as e: diff --git a/src/filters/rclimg.py b/src/filters/rclimg.py index 99447e63..783aa1d4 100755 --- a/src/filters/rclimg.py +++ b/src/filters/rclimg.py @@ -63,7 +63,7 @@ class ImgTagExtractor(RclBaseHandler): ttdata = set() for k in pyexiv2_titles: if k in mdic: - ttdata.add(self.em.htmlescape(mdic[k])) + ttdata.add(rclexecm.htmlescape(mdic[k])) if ttdata: title = "" for v in ttdata: @@ -83,13 +83,13 @@ class ImgTagExtractor(RclBaseHandler): for k,v in mdic.items(): if k == 'Xmp.digiKam.TagsList': docdata += b'\n' docdata += b'\n' for k,v in mdic.items(): docdata += rclexecm.makebytes(k + " : " + \ - self.em.htmlescape(mdic[k]) + "
\n") + rclexecm.htmlescape(mdic[k]) + "
\n") docdata += b'' return docdata diff --git a/src/filters/rclinfo b/src/filters/rclinfo index 05590113..cdc1d4da 100755 --- a/src/filters/rclinfo +++ b/src/filters/rclinfo @@ -30,8 +30,8 @@ class InfoExtractor: return(False, "", "", True) nodename, docdata = self.contents[index] - nodename = self.em.htmlescape(nodename) - docdata = self.em.htmlescape(docdata) + nodename = rclexecm.htmlescape(nodename) + docdata = rclexecm.htmlescape(docdata) # strange whitespace to avoid changing the module tests (same as old) docdata = b'\n\n \n ' + \ nodename + \ diff --git a/src/filters/rclkar b/src/filters/rclkar index d7221de3..b30865a9 100755 --- a/src/filters/rclkar +++ b/src/filters/rclkar @@ -126,7 +126,8 @@ class KarTextExtractor(RclBaseHandler): self.em.rclog("Encode failed: " + str(err)) return "" - data = self.em.htmlescape(data).decode('utf-8').replace('\n', '<br>\n') + data = rclexecm.htmlescape(data).decode('utf-8').replace('\n', + '<br>\n') return data diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 95483b79..43c4fc15 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -247,7 +247,7 @@ class PDFExtractor: if not m: m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line) if m: - line = m.group(1) + self.em.htmlescape(m.group(2)) + \ + line = m.group(1) + rclexecm.htmlescape(m.group(2)) + \ m.group(3) # Recoll treats "Subject" as a "title" element @@ -262,7 +262,7 @@ class PDFExtractor: # We used to remove end-of-line hyphenation (and join # lines), but but it's not clear that we should do # this as pdftotext without the -layout option does it ? - line = self.em.htmlescape(line) + line = rclexecm.htmlescape(line) if re.search(b'<head>', line): inheader = True @@ -275,7 +275,7 @@ class PDFExtractor: def _metatag(self, nm, val): return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \ - self.em.htmlescape(rclexecm.makebytes(val)) + b"\">" + rclexecm.htmlescape(rclexecm.makebytes(val)) + b"\">" # metaheaders is a list of (nm, value) pairs def _injectmeta(self, html, metaheaders): @@ -409,7 +409,7 @@ class PDFExtractor: cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), self.filename] data = subprocess.check_output(cmd) - html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix + html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix except Exception as e: self.em.rclog("%s failed: %s" % (cmd, e)) pass diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py index ea282990..d7b0b0a6 100755 --- a/src/filters/rclppt.py +++ b/src/filters/rclppt.py @@ -23,7 +23,7 @@ class PPTProcessData: b'content="text/html;charset=UTF-8">' + \ b'</head><body><pre>') self.gotdata = True - self.out.append(self.em.htmlescape(line)) + self.out.append(rclexecm.htmlescape(line)) def wrapData(self): return b'\n'.join(self.out) + b'''</pre></body></html>''' diff --git a/src/filters/rcltext.py b/src/filters/rcltext.py index 9cd24c65..91d8be13 100755 --- a/src/filters/rcltext.py +++ b/src/filters/rcltext.py @@ -32,7 +32,7 @@ class TxtDump(RclBaseHandler): # No charset, so recoll will have to use its config to guess it html = b'<html><head><title>
'
         with open(fn, "rb") as f:
-            html += self.em.htmlescape(f.read())
+            html += rclexecm.htmlescape(f.read())
         html += b'
' return html diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py index 29c217cb..b38e3e60 100755 --- a/src/filters/rclxls.py +++ b/src/filters/rclxls.py @@ -40,7 +40,7 @@ class XLSProcessData: return b'\n'.join(self.out) handler = xlsxmltocsv.XlsXmlHandler() xml.sax.parseString(b'\n'.join(self.xmldata), handler) - self.out.append(self.em.htmlescape(b'\n'.join(handler.output))) + self.out.append(rclexecm.htmlescape(b'\n'.join(handler.output))) return b'\n'.join(self.out) + b'' class XLSFilter: