pdf xmp: pdfextrametafix: add a method which takes the XML element as its argument instead of the element's text content

This commit is contained in:
Jean-Francois Dockes 2019-11-14 18:19:33 +01:00
parent 3b82aa7927
commit b43d1b3287
2 changed files with 34 additions and 12 deletions

View File

@ -439,14 +439,35 @@ class PDFExtractor:
for metanm,rclnm in self.extrameta:
for rdfdesc in rdfdesclist:
try:
elt = rdfdesc.find(metanm, rdfdesc.nsmap)
elts = rdfdesc.findall(metanm, rdfdesc.nsmap)
except:
# We get an exception when this rdf:Description does not
# define the required namespace.
continue
text = None
if elt is not None:
text = self._xmltreetext(elt)
if elts:
for elt in elts:
text = None
try:
# First try to get text from a custom element handler
text = emf.metafixelt(metanm, elt)
except:
pass
if text is None:
# still nothing here, read the element text
text = self._xmltreetext(elt)
try:
# try to run metafix
text = emf.metafix(metanm, text)
except:
pass
if text:
# Can't use setfield as it only works for
# text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text))
else:
# Some docs define the values as attributes. don't
# know if this is valid but anyway...
@ -456,17 +477,13 @@ class PDFExtractor:
except:
fullnm = metanm
text = rdfdesc.get(fullnm)
# Should we set empty values ?
if text:
if emf:
if text:
try:
# try to run metafix
text = emf.metafix(metanm, text)
except:
pass
# Can't use setfield as it only works for
# text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text))
metaheaders.append((rclnm, text))
if metaheaders:
if emf:
try:

View File

@ -1,12 +1,17 @@
import sys
import re
import lxml.etree as ET
class MetaFixer(object):
def __init__(self):
pass
# def metafixelt(self, nm, xml):
# print("MetaFixer: metafixelt: %s" % ET.tostring(xml), file=sys.stderr)
# return "hello"
def metafix(self, nm, txt):
#print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr)
print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr)
if nm == 'pdf:Producer':
txt += " metafixerunique"
elif nm == 'someothername':