def _patch_meta(self, html):
    '''Replace the <head> content of the HTML with metadata from pdfinfo.

    This fixes https://gitlab.freedesktop.org/poppler/poppler/-/issues/136

    _selfdoc calls this on the raw pdftotext output, before _fixhtml.
    The patch is best-effort: whenever the metadata cannot be obtained
    or the document has no <head>...</head> section, the input is
    returned unchanged instead of failing the whole extraction.

    html: bytes, the HTML document produced by pdftotext.
    Returns: bytes, the document with everything between <head> and
    </head> replaced by a <title> element (when pdfinfo reports a
    non-empty Title) followed by one <meta name=... content=...>
    element per "Key: value" line printed by pdfinfo.
    '''
    # Bind the name unconditionally: it used to be assigned only inside
    # the non-Windows branch, so the test below raised UnboundLocalError
    # on Windows. The lookup itself is still skipped there.
    pdfinfo = None
    if not _mswindows:
        pdfinfo = (rclexecm.which("pdfinfo")
                   or rclexecm.which("poppler/pdfinfo"))
    if not pdfinfo:
        return html

    try:
        info = subprocess.check_output([pdfinfo, '-isodates', self.filename])
    except (OSError, subprocess.CalledProcessError) as exc:
        # pdfinfo missing/unrunnable or exiting non-zero must not prevent
        # the text itself from being indexed.
        self.em.rclog("pdfinfo failed: %s" % exc)
        return html

    def escape(raw):
        # The previous .replace(b'"', b'\"') was a no-op (b'\"' == b'"').
        # Values end up inside a double-quoted attribute and inside
        # <title>, so escape the HTML-significant bytes ('&' first, so
        # the entities produced below are not escaped a second time).
        return (raw.replace(b'&', b'&amp;')
                   .replace(b'<', b'&lt;')
                   .replace(b'>', b'&gt;')
                   .replace(b'"', b'&quot;'))

    meta = {}
    for line in info.split(b'\n'):
        # Split on the first colon only: values such as ISO dates
        # contain colons themselves. Lines without a colon are ignored.
        key, sep, value = line.strip().partition(b':')
        if not sep:
            continue
        meta[escape(key.strip())] = escape(value.strip())

    if b'CreationDate' in meta:
        meta[b'date'] = meta[b'CreationDate']

    # Keys are bytes; looking up the str 'Title' never matched.
    title = meta.get(b'Title')
    head = [b'<title>%s</title>' % title] if title else []
    head += [b'<meta name="%s" content="%s">' % item
             for item in meta.items()]

    # Locate the existing head section; leave the document alone if it
    # is absent or malformed rather than raising ValueError.
    start = html.find(b'<head>')
    end = html.find(b'</head>')
    if start < 0 or end < 0:
        return html
    start += len(b'<head>')
    if end < start:
        return html
    return html[:start] + b"".join(head) + html[end:]