Handle a Unicode conversion issue. Avoid returning None as the document for an empty document.

This commit is contained in:
Jean-Francois Dockes 2017-05-15 12:35:59 +02:00
parent 9673775c4e
commit ccc0398155

View File

@ -356,17 +356,17 @@ class PDFExtractor:
return output, isempty return output, isempty
def _metatag(self, nm, val): def _metatag(self, nm, val):
return "<meta name=\"" + nm + "\" content=\"" + \ return b"<meta name=\"" + nm + "\" content=\"" + \
self.em.htmlescape(val) + "\">" self.em.htmlescape(val) + "\">"
# metaheaders is a list of (nm, value) pairs # metaheaders is a list of (nm, value) pairs
def _injectmeta(self, html, metaheaders): def _injectmeta(self, html, metaheaders):
metatxt = '' metatxt = b''
for nm, val in metaheaders: for nm, val in metaheaders:
metatxt += self._metatag(nm, val) + '\n' metatxt += self._metatag(nm, val) + b'\n'
if not metatxt: if not metatxt:
return html return html
res = self.re_head.sub('<head>\n' + metatxt, html) res = self.re_head.sub(b'<head>\n' + metatxt, html)
#self.em.rclog("Substituted html: [%s]"%res) #self.em.rclog("Substituted html: [%s]"%res)
if res: if res:
return res return res
@ -385,7 +385,7 @@ class PDFExtractor:
def _setextrameta(self, html): def _setextrameta(self, html):
if not self.pdfinfo: if not self.pdfinfo:
return return html
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
@ -418,7 +418,7 @@ class PDFExtractor:
# define the required namespace. # define the required namespace.
continue continue
if elt is not None: if elt is not None:
text = self._xmltreetext(elt) text = self._xmltreetext(elt).encode('UTF-8')
if text: if text:
# Should we set empty values ? # Should we set empty values ?
# Can't use setfield as it only works for # Can't use setfield as it only works for
@ -426,6 +426,8 @@ class PDFExtractor:
metaheaders.append((rclnm, text)) metaheaders.append((rclnm, text))
if metaheaders: if metaheaders:
return self._injectmeta(html, metaheaders) return self._injectmeta(html, metaheaders)
else:
return html
def _selfdoc(self): def _selfdoc(self):
'''Extract the text from the pdf doc (as opposed to attachment)''' '''Extract the text from the pdf doc (as opposed to attachment)'''
@ -441,7 +443,7 @@ class PDFExtractor:
self.filename, "-"]) self.filename, "-"])
html, isempty = self._fixhtml(html) html, isempty = self._fixhtml(html)
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) #self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html))
if isempty and self.ocrpossible: if isempty and self.ocrpossible:
html = self.ocrpdf() html = self.ocrpdf()