PDF: fix the XMP metadata extraction code for python3 and other issues. Also get metadata from XML attributes

This commit is contained in:
Jean-Francois Dockes 2019-06-12 19:21:37 +02:00
parent b759490559
commit b895980e95

View File

@ -43,6 +43,7 @@ import atexit
import signal
import rclconfig
import glob
import traceback
tmpdir = None
@ -174,9 +175,9 @@ class PDFExtractor:
self.pdfinfo = None
return
self.re_head = re.compile(r'<head>', re.IGNORECASE)
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
r'(.*)' + r'<\?xpacket[ ]+end',
self.re_head = re.compile(br'<head>', re.IGNORECASE)
self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' +
br'(.*)' + br'<\?xpacket[ ]+end',
flags = re.DOTALL)
# Extract all attachments if any into temporary directory
@ -268,7 +269,7 @@ class PDFExtractor:
global tmpdir
if not tmpdir:
return ""
return b""
tesseractlang = self.guesstesseractlang()
# self.em.rclog("tesseractlang %s" % tesseractlang)
@ -283,7 +284,7 @@ class PDFExtractor:
tmpfile])
except Exception as e:
self.em.rclog("pdftoppm failed: %s" % e)
return ""
return b""
files = glob.glob(tmpfile + "*")
for f in files:
@ -300,17 +301,17 @@ class PDFExtractor:
# Concatenate the result files
files = glob.glob(tmpfile + "*" + ".txt")
data = ""
data = b""
for f in files:
data += open(f, "r").read()
data += open(f, "rb").read()
if not data:
return ""
return '''<html><head>
return b""
return b'''<html><head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
</head><body><pre>''' + \
self.em.htmlescape(data) + \
'''</pre></body></html>'''
b'''</pre></body></html>'''
# pdftotext (used to?) badly escape text inside the header
@ -349,7 +350,7 @@ class PDFExtractor:
elif inbody:
s = line[0:1]
if s != "\x0c" and s != "<":
if s != b"\x0c" and s != b"<":
isempty = False
# We used to remove end-of-line hyphenation (and join
# lines), but but it's not clear that we should do
@ -366,17 +367,17 @@ class PDFExtractor:
return b'\n'.join(output), isempty
def _metatag(self, nm, val):
return "<meta name=\"" + nm + "\" content=\"" + \
self.em.htmlescape(val) + "\">"
return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \
self.em.htmlescape(rclexecm.makebytes(val)) + b"\">"
# metaheaders is a list of (nm, value) pairs
def _injectmeta(self, html, metaheaders):
metatxt = ''
metatxt = b''
for nm, val in metaheaders:
metatxt += self._metatag(nm, val) + '\n'
metatxt += self._metatag(nm, val) + b'\n'
if not metatxt:
return html
res = self.re_head.sub('<head>\n' + metatxt, html)
res = self.re_head.sub(b'<head>\n' + metatxt, html)
#self.em.rclog("Substituted html: [%s]"%res)
if res:
return res
@ -395,7 +396,7 @@ class PDFExtractor:
def _setextrameta(self, html):
if not self.pdfinfo:
return
return html
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
@ -417,8 +418,9 @@ class PDFExtractor:
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
rdf = root.find("rdf:RDF", namespaces)
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
if rdf is None:
return html
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
for metanm,rclnm in self.extrameta:
for rdfdesc in rdfdesclist:
try:
@ -427,16 +429,29 @@ class PDFExtractor:
# We get an exception when this rdf:Description does not
# define the required namespace.
continue
text = None
if elt is not None:
text = self._xmltreetext(elt)
if text:
# Should we set empty values ?
# Can't use setfield as it only works for
# text/plain output at the moment.
metaheaders.append((rclnm, text))
else:
# Some docs define the values as attributes. don't
# know if this is valid but anyway...
try:
prefix,nm = metanm.split(":")
fullnm = "{%s}%s" % (rdfdesc.nsmap[prefix], nm)
except:
fullnm = nm
text = rdfdesc.get(fullnm)
if text:
# Should we set empty values ?
# Can't use setfield as it only works for
# text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text))
if metaheaders:
return self._injectmeta(html, metaheaders)
else:
return html
def _selfdoc(self):
'''Extract the text from the pdf doc (as opposed to attachment)'''
self.em.setmimetype('text/html')
@ -460,7 +475,7 @@ class PDFExtractor:
try:
html = self._setextrameta(html)
except Exception as err:
self.em.rclog("Metadata extraction failed: %s" % err)
self.em.rclog("Metadata extraction failed: %s %s" % (err, traceback.format_exc()))
return (True, html, "", eof)