From ccc039815517ae82af99da1d035a1e283c706fa2 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 15 May 2017 12:35:59 +0200 Subject: [PATCH] Handle a unicode conversion issue. Avoid returning None as document for an empty document --- src/filters/rclpdf.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index b0c275e3..2ac62196 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -356,17 +356,17 @@ class PDFExtractor: return output, isempty def _metatag(self, nm, val): - return "" # metaheaders is a list of (nm, value) pairs def _injectmeta(self, html, metaheaders): - metatxt = '' + metatxt = b'' for nm, val in metaheaders: - metatxt += self._metatag(nm, val) + '\n' + metatxt += self._metatag(nm, val) + b'\n' if not metatxt: return html - res = self.re_head.sub('\n' + metatxt, html) + res = self.re_head.sub(b'\n' + metatxt, html) #self.em.rclog("Substituted html: [%s]"%res) if res: return res @@ -385,7 +385,7 @@ class PDFExtractor: def _setextrameta(self, html): if not self.pdfinfo: - return + return html all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) @@ -418,7 +418,7 @@ class PDFExtractor: # define the required namespace. continue if elt is not None: - text = self._xmltreetext(elt) + text = self._xmltreetext(elt).encode('UTF-8') if text: # Should we set empty values ? # Can't use setfield as it only works for @@ -426,7 +426,9 @@ class PDFExtractor: metaheaders.append((rclnm, text)) if metaheaders: return self._injectmeta(html, metaheaders) - + else: + return html + def _selfdoc(self): '''Extract the text from the pdf doc (as opposed to attachment)''' self.em.setmimetype('text/html') @@ -435,13 +437,13 @@ class PDFExtractor: eof = rclexecm.RclExecM.eofnext else: eof = rclexecm.RclExecM.noteof - + html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc", "UTF-8", "-eol", "unix", "-q", self.filename, "-"]) html, isempty = self._fixhtml(html) - #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) + #self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html)) if isempty and self.ocrpossible: html = self.ocrpdf()