Handle a Unicode conversion issue. Avoid returning None as the document for an empty document.
This commit is contained in:
parent
9673775c4e
commit
ccc0398155
@ -356,17 +356,17 @@ class PDFExtractor:
|
|||||||
return output, isempty
|
return output, isempty
|
||||||
|
|
||||||
def _metatag(self, nm, val):
|
def _metatag(self, nm, val):
|
||||||
return "<meta name=\"" + nm + "\" content=\"" + \
|
return b"<meta name=\"" + nm + "\" content=\"" + \
|
||||||
self.em.htmlescape(val) + "\">"
|
self.em.htmlescape(val) + "\">"
|
||||||
|
|
||||||
# metaheaders is a list of (nm, value) pairs
|
# metaheaders is a list of (nm, value) pairs
|
||||||
def _injectmeta(self, html, metaheaders):
|
def _injectmeta(self, html, metaheaders):
|
||||||
metatxt = ''
|
metatxt = b''
|
||||||
for nm, val in metaheaders:
|
for nm, val in metaheaders:
|
||||||
metatxt += self._metatag(nm, val) + '\n'
|
metatxt += self._metatag(nm, val) + b'\n'
|
||||||
if not metatxt:
|
if not metatxt:
|
||||||
return html
|
return html
|
||||||
res = self.re_head.sub('<head>\n' + metatxt, html)
|
res = self.re_head.sub(b'<head>\n' + metatxt, html)
|
||||||
#self.em.rclog("Substituted html: [%s]"%res)
|
#self.em.rclog("Substituted html: [%s]"%res)
|
||||||
if res:
|
if res:
|
||||||
return res
|
return res
|
||||||
@ -385,7 +385,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
def _setextrameta(self, html):
|
def _setextrameta(self, html):
|
||||||
if not self.pdfinfo:
|
if not self.pdfinfo:
|
||||||
return
|
return html
|
||||||
|
|
||||||
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
||||||
|
|
||||||
@ -418,7 +418,7 @@ class PDFExtractor:
|
|||||||
# define the required namespace.
|
# define the required namespace.
|
||||||
continue
|
continue
|
||||||
if elt is not None:
|
if elt is not None:
|
||||||
text = self._xmltreetext(elt)
|
text = self._xmltreetext(elt).encode('UTF-8')
|
||||||
if text:
|
if text:
|
||||||
# Should we set empty values ?
|
# Should we set empty values ?
|
||||||
# Can't use setfield as it only works for
|
# Can't use setfield as it only works for
|
||||||
@ -426,6 +426,8 @@ class PDFExtractor:
|
|||||||
metaheaders.append((rclnm, text))
|
metaheaders.append((rclnm, text))
|
||||||
if metaheaders:
|
if metaheaders:
|
||||||
return self._injectmeta(html, metaheaders)
|
return self._injectmeta(html, metaheaders)
|
||||||
|
else:
|
||||||
|
return html
|
||||||
|
|
||||||
def _selfdoc(self):
|
def _selfdoc(self):
|
||||||
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
||||||
@ -441,7 +443,7 @@ class PDFExtractor:
|
|||||||
self.filename, "-"])
|
self.filename, "-"])
|
||||||
|
|
||||||
html, isempty = self._fixhtml(html)
|
html, isempty = self._fixhtml(html)
|
||||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
#self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html))
|
||||||
|
|
||||||
if isempty and self.ocrpossible:
|
if isempty and self.ocrpossible:
|
||||||
html = self.ocrpdf()
|
html = self.ocrpdf()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user