PDF: fix the XMP metadata extraction code for python3 and other issues. Also get metadata from XML attributes

This commit is contained in:
Jean-Francois Dockes 2019-06-12 19:21:37 +02:00
parent b759490559
commit b895980e95

View File

@ -43,6 +43,7 @@ import atexit
import signal import signal
import rclconfig import rclconfig
import glob import glob
import traceback
tmpdir = None tmpdir = None
@ -174,9 +175,9 @@ class PDFExtractor:
self.pdfinfo = None self.pdfinfo = None
return return
self.re_head = re.compile(r'<head>', re.IGNORECASE) self.re_head = re.compile(br'<head>', re.IGNORECASE)
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' + self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' +
r'(.*)' + r'<\?xpacket[ ]+end', br'(.*)' + br'<\?xpacket[ ]+end',
flags = re.DOTALL) flags = re.DOTALL)
# Extract all attachments if any into temporary directory # Extract all attachments if any into temporary directory
@ -268,7 +269,7 @@ class PDFExtractor:
global tmpdir global tmpdir
if not tmpdir: if not tmpdir:
return "" return b""
tesseractlang = self.guesstesseractlang() tesseractlang = self.guesstesseractlang()
# self.em.rclog("tesseractlang %s" % tesseractlang) # self.em.rclog("tesseractlang %s" % tesseractlang)
@ -283,7 +284,7 @@ class PDFExtractor:
tmpfile]) tmpfile])
except Exception as e: except Exception as e:
self.em.rclog("pdftoppm failed: %s" % e) self.em.rclog("pdftoppm failed: %s" % e)
return "" return b""
files = glob.glob(tmpfile + "*") files = glob.glob(tmpfile + "*")
for f in files: for f in files:
@ -300,17 +301,17 @@ class PDFExtractor:
# Concatenate the result files # Concatenate the result files
files = glob.glob(tmpfile + "*" + ".txt") files = glob.glob(tmpfile + "*" + ".txt")
data = "" data = b""
for f in files: for f in files:
data += open(f, "r").read() data += open(f, "rb").read()
if not data: if not data:
return "" return b""
return '''<html><head> return b'''<html><head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
</head><body><pre>''' + \ </head><body><pre>''' + \
self.em.htmlescape(data) + \ self.em.htmlescape(data) + \
'''</pre></body></html>''' b'''</pre></body></html>'''
# pdftotext (used to?) badly escape text inside the header # pdftotext (used to?) badly escape text inside the header
@ -349,7 +350,7 @@ class PDFExtractor:
elif inbody: elif inbody:
s = line[0:1] s = line[0:1]
if s != "\x0c" and s != "<": if s != b"\x0c" and s != b"<":
isempty = False isempty = False
# We used to remove end-of-line hyphenation (and join # We used to remove end-of-line hyphenation (and join
# lines), but but it's not clear that we should do # lines), but but it's not clear that we should do
@ -366,17 +367,17 @@ class PDFExtractor:
return b'\n'.join(output), isempty return b'\n'.join(output), isempty
def _metatag(self, nm, val): def _metatag(self, nm, val):
return "<meta name=\"" + nm + "\" content=\"" + \ return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \
self.em.htmlescape(val) + "\">" self.em.htmlescape(rclexecm.makebytes(val)) + b"\">"
# metaheaders is a list of (nm, value) pairs # metaheaders is a list of (nm, value) pairs
def _injectmeta(self, html, metaheaders): def _injectmeta(self, html, metaheaders):
metatxt = '' metatxt = b''
for nm, val in metaheaders: for nm, val in metaheaders:
metatxt += self._metatag(nm, val) + '\n' metatxt += self._metatag(nm, val) + b'\n'
if not metatxt: if not metatxt:
return html return html
res = self.re_head.sub('<head>\n' + metatxt, html) res = self.re_head.sub(b'<head>\n' + metatxt, html)
#self.em.rclog("Substituted html: [%s]"%res) #self.em.rclog("Substituted html: [%s]"%res)
if res: if res:
return res return res
@ -395,7 +396,7 @@ class PDFExtractor:
def _setextrameta(self, html): def _setextrameta(self, html):
if not self.pdfinfo: if not self.pdfinfo:
return return html
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
@ -417,8 +418,9 @@ class PDFExtractor:
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
rdf = root.find("rdf:RDF", namespaces) rdf = root.find("rdf:RDF", namespaces)
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap) #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
if rdf is None:
return html
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap) rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
for metanm,rclnm in self.extrameta: for metanm,rclnm in self.extrameta:
for rdfdesc in rdfdesclist: for rdfdesc in rdfdesclist:
try: try:
@ -427,15 +429,28 @@ class PDFExtractor:
# We get an exception when this rdf:Description does not # We get an exception when this rdf:Description does not
# define the required namespace. # define the required namespace.
continue continue
text = None
if elt is not None: if elt is not None:
text = self._xmltreetext(elt) text = self._xmltreetext(elt)
if text: else:
# Should we set empty values ? # Some docs define the values as attributes. don't
# Can't use setfield as it only works for # know if this is valid but anyway...
# text/plain output at the moment. try:
metaheaders.append((rclnm, text)) prefix,nm = metanm.split(":")
fullnm = "{%s}%s" % (rdfdesc.nsmap[prefix], nm)
except:
fullnm = nm
text = rdfdesc.get(fullnm)
if text:
# Should we set empty values ?
# Can't use setfield as it only works for
# text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text))
if metaheaders: if metaheaders:
return self._injectmeta(html, metaheaders) return self._injectmeta(html, metaheaders)
else:
return html
def _selfdoc(self): def _selfdoc(self):
'''Extract the text from the pdf doc (as opposed to attachment)''' '''Extract the text from the pdf doc (as opposed to attachment)'''
@ -460,7 +475,7 @@ class PDFExtractor:
try: try:
html = self._setextrameta(html) html = self._setextrameta(html)
except Exception as err: except Exception as err:
self.em.rclog("Metadata extraction failed: %s" % err) self.em.rclog("Metadata extraction failed: %s %s" % (err, traceback.format_exc()))
return (True, html, "", eof) return (True, html, "", eof)