Check for newer pdftotext version to avoid double HTML escaping. Fixes issue #318.

2016-08-05 08:51:34 +02:00 · 2016-08-05 08:51:34 +02:00 · d6b230043c
commit d6b230043c
parent 4adf6925b8
1 changed files with 21 additions and 7 deletions
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -88,6 +88,20 @@ class PDFExtractor:
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")
        # Check if we need to escape portions of text where old
        # versions of pdftotext output raw HTML special characters.
        self.needescape = True
        try:
            version = subprocess.check_output([self.pdftotext, "-v"],
                                              stderr=subprocess.STDOUT)
            major,minor,rev = version.split()[2].split('.')
            # Don't know exactly when this changed but it's fixed in
            # jessie 0.26.5
            if int(major) > 0 or int(minor) >= 26:
                self.needescape = False
        except:
            pass
        # See if we'll try to perform OCR. Need the commands and
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
@ -255,13 +269,13 @@ class PDFExtractor:
                    output += b'<meta http-equiv="Content-Type"' + \
                              b'content="text/html; charset=UTF-8">\n'
                    didcs = True
-
+                if self.needescape:
-                m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
+                    m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
-                if not m:
+                    if not m:
-                    m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
+                        m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
-                if m:
+                    if m:
-                    line = m.group(1) + self.em.htmlescape(m.group(2)) + \
+                        line = m.group(1) + self.em.htmlescape(m.group(2)) + \
-                           m.group(3)
+                               m.group(3)
                # Recoll treats "Subject" as a "title" element
                # (based on emails). The PDF "Subject" metadata