pdf xmp: pdfextrametafix: add a method which takes the XML element as its argument instead of the text content
This commit is contained in:
parent
3b82aa7927
commit
b43d1b3287
@ -439,14 +439,35 @@ class PDFExtractor:
|
|||||||
for metanm,rclnm in self.extrameta:
|
for metanm,rclnm in self.extrameta:
|
||||||
for rdfdesc in rdfdesclist:
|
for rdfdesc in rdfdesclist:
|
||||||
try:
|
try:
|
||||||
elt = rdfdesc.find(metanm, rdfdesc.nsmap)
|
elts = rdfdesc.findall(metanm, rdfdesc.nsmap)
|
||||||
except:
|
except:
|
||||||
# We get an exception when this rdf:Description does not
|
# We get an exception when this rdf:Description does not
|
||||||
# define the required namespace.
|
# define the required namespace.
|
||||||
continue
|
continue
|
||||||
text = None
|
|
||||||
if elt is not None:
|
if elts:
|
||||||
text = self._xmltreetext(elt)
|
for elt in elts:
|
||||||
|
text = None
|
||||||
|
try:
|
||||||
|
# First try to get text from a custom element handler
|
||||||
|
text = emf.metafixelt(metanm, elt)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if text is None:
|
||||||
|
# still nothing here, read the element text
|
||||||
|
text = self._xmltreetext(elt)
|
||||||
|
try:
|
||||||
|
# try to run metafix
|
||||||
|
text = emf.metafix(metanm, text)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if text:
|
||||||
|
# Can't use setfield as it only works for
|
||||||
|
# text/plain output at the moment.
|
||||||
|
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
|
||||||
|
metaheaders.append((rclnm, text))
|
||||||
else:
|
else:
|
||||||
# Some docs define the values as attributes. don't
|
# Some docs define the values as attributes. don't
|
||||||
# know if this is valid but anyway...
|
# know if this is valid but anyway...
|
||||||
@ -456,17 +477,13 @@ class PDFExtractor:
|
|||||||
except:
|
except:
|
||||||
fullnm = metanm
|
fullnm = metanm
|
||||||
text = rdfdesc.get(fullnm)
|
text = rdfdesc.get(fullnm)
|
||||||
# Should we set empty values ?
|
if text:
|
||||||
if text:
|
|
||||||
if emf:
|
|
||||||
try:
|
try:
|
||||||
|
# try to run metafix
|
||||||
text = emf.metafix(metanm, text)
|
text = emf.metafix(metanm, text)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
# Can't use setfield as it only works for
|
metaheaders.append((rclnm, text))
|
||||||
# text/plain output at the moment.
|
|
||||||
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
|
|
||||||
metaheaders.append((rclnm, text))
|
|
||||||
if metaheaders:
|
if metaheaders:
|
||||||
if emf:
|
if emf:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -1,12 +1,17 @@
|
|||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
|
import lxml.etree as ET
|
||||||
|
|
||||||
class MetaFixer(object):
|
class MetaFixer(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# def metafixelt(self, nm, xml):
|
||||||
|
# print("MetaFixer: metafixelt: %s" % ET.tostring(xml), file=sys.stderr)
|
||||||
|
# return "hello"
|
||||||
|
|
||||||
def metafix(self, nm, txt):
|
def metafix(self, nm, txt):
|
||||||
#print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr)
|
print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr)
|
||||||
if nm == 'pdf:Producer':
|
if nm == 'pdf:Producer':
|
||||||
txt += " metafixerunique"
|
txt += " metafixerunique"
|
||||||
elif nm == 'someothername':
|
elif nm == 'someothername':
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user