Check for newer pdftotext version to avoid double HTML escaping. Fixes issue #318.
This commit is contained in:
parent
4adf6925b8
commit
d6b230043c
@ -88,6 +88,20 @@ class PDFExtractor:
|
|||||||
if not self.pdftotext:
|
if not self.pdftotext:
|
||||||
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
||||||
|
|
||||||
|
# Check if we need to escape portions of text where old
|
||||||
|
# versions of pdftotext output raw HTML special characters.
|
||||||
|
self.needescape = True
|
||||||
|
try:
|
||||||
|
version = subprocess.check_output([self.pdftotext, "-v"],
|
||||||
|
stderr=subprocess.STDOUT)
|
||||||
|
major,minor,rev = version.split()[2].split('.')
|
||||||
|
# Don't know exactly when this changed but it's fixed in
|
||||||
|
# Debian jessie (poppler 0.26.5)
|
||||||
|
if int(major) > 0 or int(minor) >= 26:
|
||||||
|
self.needescape = False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# See if we'll try to perform OCR. Need the commands and
|
# See if we'll try to perform OCR. Need the commands and
|
||||||
# either the presence of a file in the config dir (historical)
|
# either the presence of a file in the config dir (historical)
|
||||||
# or a set config variable.
|
# or a set config variable.
|
||||||
@ -255,13 +269,13 @@ class PDFExtractor:
|
|||||||
output += b'<meta http-equiv="Content-Type"' + \
|
output += b'<meta http-equiv="Content-Type"' + \
|
||||||
b'content="text/html; charset=UTF-8">\n'
|
b'content="text/html; charset=UTF-8">\n'
|
||||||
didcs = True
|
didcs = True
|
||||||
|
if self.needescape:
|
||||||
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
|
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
|
||||||
if not m:
|
if not m:
|
||||||
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
|
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
|
||||||
if m:
|
if m:
|
||||||
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
||||||
m.group(3)
|
m.group(3)
|
||||||
|
|
||||||
# Recoll treats "Subject" as a "title" element
|
# Recoll treats "Subject" as a "title" element
|
||||||
# (based on emails). The PDF "Subject" metadata
|
# (based on emails). The PDF "Subject" metadata
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user