diff --git a/src/filters/rcldia b/src/filters/rcldia
index 64209507..3869bced 100755
--- a/src/filters/rcldia
+++ b/src/filters/rcldia
@@ -80,7 +80,7 @@ class DiaExtractor(RclBaseHandler):
diap.feed(dia)
html = '
'
- html += self.em.htmlescape('\n'.join(diap.string))
+ html += rclexecm.htmlescape('\n'.join(diap.string))
html += ''
return html
diff --git a/src/filters/rcldjvu.py b/src/filters/rcldjvu.py
index 5368ae32..98e12f66 100755
--- a/src/filters/rcldjvu.py
+++ b/src/filters/rcldjvu.py
@@ -66,15 +66,15 @@ class DJVUExtractor(RclBaseHandler):
txtdata = txtdata.decode('UTF-8', 'replace')
data = ''''''
- data += '''''' + self.em.htmlescape(title) + ''''''
+ data += '''''' + rclexecm.htmlescape(title) + ''''''
data += ''''''
if author:
data += ''''''
+ rclexecm.htmlescape(author) + '''">'''
data += ''''''
- data += self.em.htmlescape(txtdata)
+ data += rclexecm.htmlescape(txtdata)
data += ''''''
return data
diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
index 104c420d..dc303b40 100755
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@@ -51,7 +51,7 @@ class WordProcessData:
line = b''
if line:
- self.out.append(self.em.htmlescape(line) + b'
')
+ self.out.append(rclexecm.htmlescape(line) + b'
')
else:
self.out.append(b'
')
diff --git a/src/filters/rclepub b/src/filters/rclepub
index c98d369f..32c55315 100755
--- a/src/filters/rclepub
+++ b/src/filters/rclepub
@@ -42,13 +42,13 @@ class rclEPUB:
author += name + " "
data = "\n\n"
if title:
- data += "" + self.em.htmlescape(title) + "\n"
+ data += "" + rclexecm.htmlescape(title) + "\n"
if author:
data += '\n'
+ rclexecm.htmlescape(author).strip() + '">\n'
if meta.description:
data += '\n'
+ rclexecm.htmlescape(meta.description) + '">\n'
data = data.encode('UTF-8')
self.em.setmimetype('text/html')
if len(self.contents) == 0:
diff --git a/src/filters/rclepub1 b/src/filters/rclepub1
index 86b6058f..e9574727 100755
--- a/src/filters/rclepub1
+++ b/src/filters/rclepub1
@@ -33,13 +33,13 @@ class EPUBConcatExtractor(RclBaseHandler):
author += name + " "
data = "\n\n"
if title:
- data += "" + self.em.htmlescape(title) + "\n"
+ data += "" + rclexecm.htmlescape(title) + "\n"
if author:
data += '\n'
+ rclexecm.htmlescape(author).strip() + '">\n'
if meta.description:
data += '\n'
+ rclexecm.htmlescape(meta.description) + '">\n'
data += ""
data = data.encode('UTF-8')
diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py
index b3142acc..95eea9ca 100644
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@@ -33,22 +33,26 @@ import cmdtalk
PY3 = (sys.version > '3')
_mswindows = (sys.platform == "win32")
+# Convert to bytes if not already such.
def makebytes(data):
if type(data) == type(u''):
return data.encode("UTF-8")
return data
+# Possibly decode binary file name for use as subprocess argument,
+# depending on platform.
def subprocfile(fn):
# On Windows PY3 the list2cmdline() method in subprocess assumes that
# all args are str, and we receive file names as UTF-8. So we need
# to convert.
# On Unix all list elements get converted to bytes in the C
- # _posixsubprocess module, nothing to do
+ # _posixsubprocess module, nothing to do.
if PY3 and _mswindows:
return fn.decode('UTF-8')
else:
return fn
+# Check for truthiness of rclconfig value.
def configparamtrue(value):
if not value:
return False
@@ -64,13 +68,27 @@ def configparamtrue(value):
return True
return False
+# Escape special characters in plain text for inclusion in HTML doc.
+# Note: tried replacing this with a multiple replacer according to
+# http://stackoverflow.com/a/15221068, which was **10 times** slower
+def htmlescape(txt):
+ # & must be replaced first (it once ended up after the next
+ # replace, which re-escaped the entities just produced)
+ try:
+ txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
+ replace(b'>', b'&gt;').replace(b'"', b'&quot;')
+ except:
+ txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
+ replace(">", "&gt;").replace("\"", "&quot;")
+ return txt
+
+
my_config = rclconfig.RclConfig()
############################################
# RclExecM implements the communication protocol with the recollindex
# process. It calls the object specific of the document type to
# actually get the data.
-
class RclExecM(cmdtalk.CmdTalk):
noteof = 0
eofnext = 1
@@ -103,19 +121,6 @@ class RclExecM(cmdtalk.CmdTalk):
if self.debugfile or sys.platform != "win32":
super().log(s, doexit, exitvalue)
- # Note: tried replacing this with a multiple replacer according to
- # http://stackoverflow.com/a/15221068, which was **10 times** slower
- def htmlescape(self, txt):
- # & must stay first (it somehow had managed to skip
- # after the next replace, with rather interesting results)
- try:
- txt = txt.replace(b'&', b'&amp;').replace(b'<', b'&lt;').\
- replace(b'>', b'&gt;').replace(b'"', b'&quot;')
- except:
- txt = txt.replace("&", "&amp;").replace("<", "&lt;").\
- replace(">", "&gt;").replace("\"", "&quot;")
- return txt
-
# Our worker sometimes knows the mime types of the data it sends
def setmimetype(self, mt):
self.mimetype = makebytes(mt)
diff --git a/src/filters/rclhwp.py b/src/filters/rclhwp.py
index 40f6ae13..0bb63b3d 100755
--- a/src/filters/rclhwp.py
+++ b/src/filters/rclhwp.py
@@ -59,14 +59,14 @@ class HWP5Dump(RclBaseHandler):
try:
tt = hwpfile.summaryinfo.title.strip()
if tt:
- tt = self.em.htmlescape(tt.encode('utf-8'))
+ tt = rclexecm.htmlescape(tt.encode('utf-8'))
self.em.setfield('caption', tt)
for k,v in metafields(hwpfile.summaryinfo):
v = "{0}".format(v)
v = v.strip()
if v:
- v = self.em.htmlescape(v.encode('utf-8'))
+ v = rclexecm.htmlescape(v.encode('utf-8'))
k = k.encode('utf-8')
self.em.setfield(k, v)
except Exception as e:
diff --git a/src/filters/rclimg.py b/src/filters/rclimg.py
index 99447e63..783aa1d4 100755
--- a/src/filters/rclimg.py
+++ b/src/filters/rclimg.py
@@ -63,7 +63,7 @@ class ImgTagExtractor(RclBaseHandler):
ttdata = set()
for k in pyexiv2_titles:
if k in mdic:
- ttdata.add(self.em.htmlescape(mdic[k]))
+ ttdata.add(rclexecm.htmlescape(mdic[k]))
if ttdata:
title = ""
for v in ttdata:
@@ -83,13 +83,13 @@ class ImgTagExtractor(RclBaseHandler):
for k,v in mdic.items():
if k == 'Xmp.digiKam.TagsList':
docdata += b'\n'
docdata += b'\n'
for k,v in mdic.items():
docdata += rclexecm.makebytes(k + " : " + \
- self.em.htmlescape(mdic[k]) + "
\n")
+ rclexecm.htmlescape(mdic[k]) + "
\n")
docdata += b''
return docdata
diff --git a/src/filters/rclinfo b/src/filters/rclinfo
index 05590113..cdc1d4da 100755
--- a/src/filters/rclinfo
+++ b/src/filters/rclinfo
@@ -30,8 +30,8 @@ class InfoExtractor:
return(False, "", "", True)
nodename, docdata = self.contents[index]
- nodename = self.em.htmlescape(nodename)
- docdata = self.em.htmlescape(docdata)
+ nodename = rclexecm.htmlescape(nodename)
+ docdata = rclexecm.htmlescape(docdata)
# strange whitespace to avoid changing the module tests (same as old)
docdata = b'\n\n \n ' + \
nodename + \
diff --git a/src/filters/rclkar b/src/filters/rclkar
index d7221de3..b30865a9 100755
--- a/src/filters/rclkar
+++ b/src/filters/rclkar
@@ -126,7 +126,8 @@ class KarTextExtractor(RclBaseHandler):
self.em.rclog("Encode failed: " + str(err))
return ""
- data = self.em.htmlescape(data).decode('utf-8').replace('\n', '
\n')
+ data = rclexecm.htmlescape(data).decode('utf-8').replace('\n',
+ '
\n')
return data
diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py
index 95483b79..43c4fc15 100755
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -247,7 +247,7 @@ class PDFExtractor:
if not m:
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
if m:
- line = m.group(1) + self.em.htmlescape(m.group(2)) + \
+ line = m.group(1) + rclexecm.htmlescape(m.group(2)) + \
m.group(3)
# Recoll treats "Subject" as a "title" element
@@ -262,7 +262,7 @@ class PDFExtractor:
# We used to remove end-of-line hyphenation (and join
# lines), but but it's not clear that we should do
# this as pdftotext without the -layout option does it ?
- line = self.em.htmlescape(line)
+ line = rclexecm.htmlescape(line)
if re.search(b'', line):
inheader = True
@@ -275,7 +275,7 @@ class PDFExtractor:
def _metatag(self, nm, val):
return b""
+ rclexecm.htmlescape(rclexecm.makebytes(val)) + b"\">"
# metaheaders is a list of (nm, value) pairs
def _injectmeta(self, html, metaheaders):
@@ -409,7 +409,7 @@ class PDFExtractor:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename]
data = subprocess.check_output(cmd)
- html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
+ html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix
except Exception as e:
self.em.rclog("%s failed: %s" % (cmd, e))
pass
diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py
index ea282990..d7b0b0a6 100755
--- a/src/filters/rclppt.py
+++ b/src/filters/rclppt.py
@@ -23,7 +23,7 @@ class PPTProcessData:
b'content="text/html;charset=UTF-8">' + \
b'')
self.gotdata = True
- self.out.append(self.em.htmlescape(line))
+ self.out.append(rclexecm.htmlescape(line))
def wrapData(self):
return b'\n'.join(self.out) + b''''''
diff --git a/src/filters/rcltext.py b/src/filters/rcltext.py
index 9cd24c65..91d8be13 100755
--- a/src/filters/rcltext.py
+++ b/src/filters/rcltext.py
@@ -32,7 +32,7 @@ class TxtDump(RclBaseHandler):
# No charset, so recoll will have to use its config to guess it
html = b''
with open(fn, "rb") as f:
- html += self.em.htmlescape(f.read())
+ html += rclexecm.htmlescape(f.read())
html += b''
return html
diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py
index 29c217cb..b38e3e60 100755
--- a/src/filters/rclxls.py
+++ b/src/filters/rclxls.py
@@ -40,7 +40,7 @@ class XLSProcessData:
return b'\n'.join(self.out)
handler = xlsxmltocsv.XlsXmlHandler()
xml.sax.parseString(b'\n'.join(self.xmldata), handler)
- self.out.append(self.em.htmlescape(b'\n'.join(handler.output)))
+ self.out.append(rclexecm.htmlescape(b'\n'.join(handler.output)))
return b'\n'.join(self.out) + b''
class XLSFilter: