diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index d95d474c..9fe11246 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -88,6 +88,20 @@ class PDFExtractor: if not self.pdftotext: self.pdftotext = rclexecm.which("poppler/pdftotext") + # Check if we need to escape portions of text where old + # versions of pdftotext output raw HTML special characters. + self.needescape = True + try: + version = subprocess.check_output([self.pdftotext, "-v"], + stderr=subprocess.STDOUT) + major,minor,rev = version.split()[2].split('.') + # Don't know exactly when this changed but it's fixed in + # jessie 0.26.5 + if int(major) > 0 or int(minor) >= 26: + self.needescape = False + except: + pass + # See if we'll try to perform OCR. Need the commands and the # either the presence of a file in the config dir (historical) # or a set config variable. @@ -255,13 +269,13 @@ class PDFExtractor: output += b'\n' didcs = True - - m = re.search(b'''(.*