diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index aba9ee1f..2b8d5108 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -439,14 +439,35 @@ class PDFExtractor: for metanm,rclnm in self.extrameta: for rdfdesc in rdfdesclist: try: - elt = rdfdesc.find(metanm, rdfdesc.nsmap) + elts = rdfdesc.findall(metanm, rdfdesc.nsmap) except: # We get an exception when this rdf:Description does not # define the required namespace. continue - text = None - if elt is not None: - text = self._xmltreetext(elt) + + if elts: + for elt in elts: + text = None + try: + # First try to get text from a custom element handler + text = emf.metafixelt(metanm, elt) + except: + pass + + if text is None: + # still nothing here, read the element text + text = self._xmltreetext(elt) + try: + # try to run metafix + text = emf.metafix(metanm, text) + except: + pass + + if text: + # Can't use setfield as it only works for + # text/plain output at the moment. + #self.em.rclog("Appending: (%s,%s)"%(rclnm,text)) + metaheaders.append((rclnm, text)) else: # Some docs define the values as attributes. don't # know if this is valid but anyway... @@ -456,17 +477,13 @@ class PDFExtractor: except: fullnm = metanm text = rdfdesc.get(fullnm) - # Should we set empty values ? - if text: - if emf: + if text: try: + # try to run metafix text = emf.metafix(metanm, text) except: pass - # Can't use setfield as it only works for - # text/plain output at the moment. - #self.em.rclog("Appending: (%s,%s)"%(rclnm,text)) - metaheaders.append((rclnm, text)) + metaheaders.append((rclnm, text)) if metaheaders: if emf: try: diff --git a/tests/config/pdfemf.py b/tests/config/pdfemf.py index 3c973b93..d8f7e239 100644 --- a/tests/config/pdfemf.py +++ b/tests/config/pdfemf.py @@ -1,12 +1,17 @@ import sys import re +import lxml.etree as ET class MetaFixer(object): def __init__(self): pass +# def metafixelt(self, nm, xml): +# print("MetaFixer: metafixelt: %s" % ET.tostring(xml), file=sys.stderr) +# return "hello" + def metafix(self, nm, txt): - #print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr) + print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr) if nm == 'pdf:Producer': txt += " metafixerunique" elif nm == 'someothername':