From 9e046187da6f1d72d7abbe6381a9f4708b4f1ac5 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Tue, 16 May 2017 03:20:57 +0200
Subject: [PATCH] pdf xmp metadata: handle the case where the x:xmpmeta node is
 omitted and the XML root is rdf:RDF

---
 src/filters/rclpdf.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py
index 889a2954..04007d0a 100755
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -412,18 +412,26 @@ class PDFExtractor:
         xml = ''
         if res:
             xml = res.group(1)
-        # self.em.rclog("extrameta: XML: [%s]" % xml)
+        #self.em.rclog("extrameta: XML: [%s]" % xml)
         if not xml:
             return html
 
-        metaheaders = []
         # The namespace thing is a drag. Can't do it from the top. See
         # the stackoverflow ref above. Maybe we'd be better off just
         # walking the full tree and building the namespaces dict.
         root = ET.fromstring(xml)
-        #self.em.rclog("NSMAP: %s"% root.nsmap)
-        namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
-        rdf = root.find("rdf:RDF", namespaces)
+
+        # The root is either <x:xmpmeta> wrapping <rdf:RDF>, or a bare <rdf:RDF>
+        if root.tag.endswith('RDF'):
+            rdf = root
+        else:
+            namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
+            rdf = root.find("rdf:RDF", namespaces)
+        if rdf is None:
+            self.em.rclog("No rdf:RDF node");
+            return html
+
+        metaheaders = []
         #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
         rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
         #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)