From 9e046187da6f1d72d7abbe6381a9f4708b4f1ac5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 16 May 2017 03:20:57 +0200 Subject: [PATCH] pdf xmp metadata: handle the case where the x:xmpmeta node is omitted and the XML root is rdf:RDF --- src/filters/rclpdf.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 889a2954..04007d0a 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -412,18 +412,26 @@ class PDFExtractor: xml = '' if res: xml = res.group(1) - # self.em.rclog("extrameta: XML: [%s]" % xml) + #self.em.rclog("extrameta: XML: [%s]" % xml) if not xml: return html - metaheaders = [] # The namespace thing is a drag. Can't do it from the top. See # the stackoverflow ref above. Maybe we'd be better off just # walking the full tree and building the namespaces dict. root = ET.fromstring(xml) - #self.em.rclog("NSMAP: %s"% root.nsmap) - namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} - rdf = root.find("rdf:RDF", namespaces) + + # Sometimes the root tag is x:xmpmeta, sometimes rdf:RDF + if root.tag.endswith('RDF'): + rdf = root + else: + namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} + rdf = root.find("rdf:RDF", namespaces) + if rdf is None: + self.em.rclog("No rdf:RDF node"); + return html + + metaheaders = [] #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap) rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap) #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)