pdf xmp: pdfextrametafix: add a method which takes the XML element as its argument instead of the text content

This commit is contained in:
Jean-Francois Dockes 2019-11-14 18:19:33 +01:00
parent 3b82aa7927
commit b43d1b3287
2 changed files with 34 additions and 12 deletions

View File

@ -439,14 +439,35 @@ class PDFExtractor:
for metanm,rclnm in self.extrameta: for metanm,rclnm in self.extrameta:
for rdfdesc in rdfdesclist: for rdfdesc in rdfdesclist:
try: try:
elt = rdfdesc.find(metanm, rdfdesc.nsmap) elts = rdfdesc.findall(metanm, rdfdesc.nsmap)
except: except:
# We get an exception when this rdf:Description does not # We get an exception when this rdf:Description does not
# define the required namespace. # define the required namespace.
continue continue
text = None
if elt is not None: if elts:
text = self._xmltreetext(elt) for elt in elts:
text = None
try:
# First try to get text from a custom element handler
text = emf.metafixelt(metanm, elt)
except:
pass
if text is None:
# still nothing here, read the element text
text = self._xmltreetext(elt)
try:
# try to run metafix
text = emf.metafix(metanm, text)
except:
pass
if text:
# Can't use setfield as it only works for
# text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text))
else: else:
# Some docs define the values as attributes. don't # Some docs define the values as attributes. don't
# know if this is valid but anyway... # know if this is valid but anyway...
@ -456,17 +477,13 @@ class PDFExtractor:
except: except:
fullnm = metanm fullnm = metanm
text = rdfdesc.get(fullnm) text = rdfdesc.get(fullnm)
# Should we set empty values ? if text:
if text:
if emf:
try: try:
# try to run metafix
text = emf.metafix(metanm, text) text = emf.metafix(metanm, text)
except: except:
pass pass
# Can't use setfield as it only works for metaheaders.append((rclnm, text))
# text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text))
if metaheaders: if metaheaders:
if emf: if emf:
try: try:

View File

@ -1,12 +1,17 @@
import sys import sys
import re import re
import lxml.etree as ET
class MetaFixer(object): class MetaFixer(object):
def __init__(self): def __init__(self):
pass pass
# def metafixelt(self, nm, xml):
# print("MetaFixer: metafixelt: %s" % ET.tostring(xml), file=sys.stderr)
# return "hello"
def metafix(self, nm, txt): def metafix(self, nm, txt):
#print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr) print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr)
if nm == 'pdf:Producer': if nm == 'pdf:Producer':
txt += " metafixerunique" txt += " metafixerunique"
elif nm == 'someothername': elif nm == 'someothername':