PDF: fix the XMP metadata extraction code for Python 3, along with other issues. Also get metadata from XML attributes.
This commit is contained in:
parent
b759490559
commit
b895980e95
@ -43,6 +43,7 @@ import atexit
|
|||||||
import signal
|
import signal
|
||||||
import rclconfig
|
import rclconfig
|
||||||
import glob
|
import glob
|
||||||
|
import traceback
|
||||||
|
|
||||||
tmpdir = None
|
tmpdir = None
|
||||||
|
|
||||||
@ -174,9 +175,9 @@ class PDFExtractor:
|
|||||||
self.pdfinfo = None
|
self.pdfinfo = None
|
||||||
return
|
return
|
||||||
|
|
||||||
self.re_head = re.compile(r'<head>', re.IGNORECASE)
|
self.re_head = re.compile(br'<head>', re.IGNORECASE)
|
||||||
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
|
self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' +
|
||||||
r'(.*)' + r'<\?xpacket[ ]+end',
|
br'(.*)' + br'<\?xpacket[ ]+end',
|
||||||
flags = re.DOTALL)
|
flags = re.DOTALL)
|
||||||
|
|
||||||
# Extract all attachments if any into temporary directory
|
# Extract all attachments if any into temporary directory
|
||||||
@ -268,7 +269,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
global tmpdir
|
global tmpdir
|
||||||
if not tmpdir:
|
if not tmpdir:
|
||||||
return ""
|
return b""
|
||||||
|
|
||||||
tesseractlang = self.guesstesseractlang()
|
tesseractlang = self.guesstesseractlang()
|
||||||
# self.em.rclog("tesseractlang %s" % tesseractlang)
|
# self.em.rclog("tesseractlang %s" % tesseractlang)
|
||||||
@ -283,7 +284,7 @@ class PDFExtractor:
|
|||||||
tmpfile])
|
tmpfile])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.em.rclog("pdftoppm failed: %s" % e)
|
self.em.rclog("pdftoppm failed: %s" % e)
|
||||||
return ""
|
return b""
|
||||||
|
|
||||||
files = glob.glob(tmpfile + "*")
|
files = glob.glob(tmpfile + "*")
|
||||||
for f in files:
|
for f in files:
|
||||||
@ -300,17 +301,17 @@ class PDFExtractor:
|
|||||||
|
|
||||||
# Concatenate the result files
|
# Concatenate the result files
|
||||||
files = glob.glob(tmpfile + "*" + ".txt")
|
files = glob.glob(tmpfile + "*" + ".txt")
|
||||||
data = ""
|
data = b""
|
||||||
for f in files:
|
for f in files:
|
||||||
data += open(f, "r").read()
|
data += open(f, "rb").read()
|
||||||
|
|
||||||
if not data:
|
if not data:
|
||||||
return ""
|
return b""
|
||||||
return '''<html><head>
|
return b'''<html><head>
|
||||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
||||||
</head><body><pre>''' + \
|
</head><body><pre>''' + \
|
||||||
self.em.htmlescape(data) + \
|
self.em.htmlescape(data) + \
|
||||||
'''</pre></body></html>'''
|
b'''</pre></body></html>'''
|
||||||
|
|
||||||
|
|
||||||
# pdftotext (used to?) badly escape text inside the header
|
# pdftotext (used to?) badly escape text inside the header
|
||||||
@ -349,7 +350,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
elif inbody:
|
elif inbody:
|
||||||
s = line[0:1]
|
s = line[0:1]
|
||||||
if s != "\x0c" and s != "<":
|
if s != b"\x0c" and s != b"<":
|
||||||
isempty = False
|
isempty = False
|
||||||
# We used to remove end-of-line hyphenation (and join
|
# We used to remove end-of-line hyphenation (and join
|
||||||
# lines), but it's not clear that we should do
|
# lines), but it's not clear that we should do
|
||||||
@ -366,17 +367,17 @@ class PDFExtractor:
|
|||||||
return b'\n'.join(output), isempty
|
return b'\n'.join(output), isempty
|
||||||
|
|
||||||
def _metatag(self, nm, val):
|
def _metatag(self, nm, val):
|
||||||
return "<meta name=\"" + nm + "\" content=\"" + \
|
return b"<meta name=\"" + rclexecm.makebytes(nm) + b"\" content=\"" + \
|
||||||
self.em.htmlescape(val) + "\">"
|
self.em.htmlescape(rclexecm.makebytes(val)) + b"\">"
|
||||||
|
|
||||||
# metaheaders is a list of (nm, value) pairs
|
# metaheaders is a list of (nm, value) pairs
|
||||||
def _injectmeta(self, html, metaheaders):
|
def _injectmeta(self, html, metaheaders):
|
||||||
metatxt = ''
|
metatxt = b''
|
||||||
for nm, val in metaheaders:
|
for nm, val in metaheaders:
|
||||||
metatxt += self._metatag(nm, val) + '\n'
|
metatxt += self._metatag(nm, val) + b'\n'
|
||||||
if not metatxt:
|
if not metatxt:
|
||||||
return html
|
return html
|
||||||
res = self.re_head.sub('<head>\n' + metatxt, html)
|
res = self.re_head.sub(b'<head>\n' + metatxt, html)
|
||||||
#self.em.rclog("Substituted html: [%s]"%res)
|
#self.em.rclog("Substituted html: [%s]"%res)
|
||||||
if res:
|
if res:
|
||||||
return res
|
return res
|
||||||
@ -395,7 +396,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
def _setextrameta(self, html):
|
def _setextrameta(self, html):
|
||||||
if not self.pdfinfo:
|
if not self.pdfinfo:
|
||||||
return
|
return html
|
||||||
|
|
||||||
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
||||||
|
|
||||||
@ -417,8 +418,9 @@ class PDFExtractor:
|
|||||||
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
|
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
|
||||||
rdf = root.find("rdf:RDF", namespaces)
|
rdf = root.find("rdf:RDF", namespaces)
|
||||||
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
|
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
|
||||||
|
if rdf is None:
|
||||||
|
return html
|
||||||
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
|
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
|
||||||
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
|
|
||||||
for metanm,rclnm in self.extrameta:
|
for metanm,rclnm in self.extrameta:
|
||||||
for rdfdesc in rdfdesclist:
|
for rdfdesc in rdfdesclist:
|
||||||
try:
|
try:
|
||||||
@ -427,15 +429,28 @@ class PDFExtractor:
|
|||||||
# We get an exception when this rdf:Description does not
|
# We get an exception when this rdf:Description does not
|
||||||
# define the required namespace.
|
# define the required namespace.
|
||||||
continue
|
continue
|
||||||
|
text = None
|
||||||
if elt is not None:
|
if elt is not None:
|
||||||
text = self._xmltreetext(elt)
|
text = self._xmltreetext(elt)
|
||||||
if text:
|
else:
|
||||||
# Should we set empty values ?
|
# Some docs define the values as attributes. We don't
|
||||||
# Can't use setfield as it only works for
|
# know if this is valid, but we handle it anyway.
|
||||||
# text/plain output at the moment.
|
try:
|
||||||
metaheaders.append((rclnm, text))
|
prefix,nm = metanm.split(":")
|
||||||
|
fullnm = "{%s}%s" % (rdfdesc.nsmap[prefix], nm)
|
||||||
|
except:
|
||||||
|
fullnm = nm
|
||||||
|
text = rdfdesc.get(fullnm)
|
||||||
|
if text:
|
||||||
|
# Should we set empty values ?
|
||||||
|
# Can't use setfield as it only works for
|
||||||
|
# text/plain output at the moment.
|
||||||
|
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
|
||||||
|
metaheaders.append((rclnm, text))
|
||||||
if metaheaders:
|
if metaheaders:
|
||||||
return self._injectmeta(html, metaheaders)
|
return self._injectmeta(html, metaheaders)
|
||||||
|
else:
|
||||||
|
return html
|
||||||
|
|
||||||
def _selfdoc(self):
|
def _selfdoc(self):
|
||||||
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
||||||
@ -460,7 +475,7 @@ class PDFExtractor:
|
|||||||
try:
|
try:
|
||||||
html = self._setextrameta(html)
|
html = self._setextrameta(html)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("Metadata extraction failed: %s" % err)
|
self.em.rclog("Metadata extraction failed: %s %s" % (err, traceback.format_exc()))
|
||||||
|
|
||||||
return (True, html, "", eof)
|
return (True, html, "", eof)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user