From d6b230043c9360b5f5c2d33523a6c02ce4b9f942 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 5 Aug 2016 08:51:34 +0200 Subject: [PATCH] Check for newer pdftotext version to avoid double HTML escaping. fixes issue #318 --- src/filters/rclpdf.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index d95d474c..9fe11246 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -88,6 +88,20 @@ class PDFExtractor: if not self.pdftotext: self.pdftotext = rclexecm.which("poppler/pdftotext") + # Check if we need to escape portions of text where old + # versions of pdftotext output raw HTML special characters. + self.needescape = True + try: + version = subprocess.check_output([self.pdftotext, "-v"], + stderr=subprocess.STDOUT) + major,minor,rev = version.split()[2].split('.') + # Don't know exactly when this changed but it's fixed in + # jessie 0.26.5 + if int(major) > 0 or int(minor) >= 26: + self.needescape = False + except: + pass + # See if we'll try to perform OCR. Need the commands and the # either the presence of a file in the config dir (historical) # or a set config variable. @@ -255,13 +269,13 @@ class PDFExtractor: output += b'\n' didcs = True - - m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line) - if not m: - m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line) - if m: - line = m.group(1) + self.em.htmlescape(m.group(2)) + \ - m.group(3) + if self.needescape: + m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line) + if not m: + m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line) + if m: + line = m.group(1) + self.em.htmlescape(m.group(2)) + \ + m.group(3) # Recoll treats "Subject" as a "title" element # (based on emails). The PDF "Subject" metadata