pdf: add and document MetaFixer::wrapup() method

2017-05-17 08:32:23 +02:00 · 2017-05-17 08:32:23 +02:00 · 123d5b36ad
commit 123d5b36ad
parent 41eb89bbe0
3 changed files with 46 additions and 20 deletions
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@ -20,8 +20,8 @@ alink="#0000FF">
    <div class="titlepage">
      <div>
        <div>
-          <h1 class="title"><a name="idp35245072" id=
-          "idp35245072"></a>Recoll user manual</h1>
+          <h1 class="title"><a name="idp56557776" id=
+          "idp56557776"></a>Recoll user manual</h1>
        </div>

        <div>
@ -109,13 +109,13 @@ alink="#0000FF">
                multiple indexes</a></span></dt>

                <dt><span class="sect2">2.1.3. <a href=
-                "#idp40818624">Document types</a></span></dt>
+                "#idp62130176">Document types</a></span></dt>

                <dt><span class="sect2">2.1.4. <a href=
-                "#idp40843200">Indexing failures</a></span></dt>
+                "#idp62154272">Indexing failures</a></span></dt>

                <dt><span class="sect2">2.1.5. <a href=
-                "#idp40850208">Recovery</a></span></dt>
+                "#idp62161280">Recovery</a></span></dt>
              </dl>
            </dd>

@ -1017,8 +1017,8 @@ alink="#0000FF">
          <div class="titlepage">
            <div>
              <div>
-                <h3 class="title"><a name="idp40818624" id=
-                "idp40818624"></a>2.1.3.&nbsp;Document types</h3>
+                <h3 class="title"><a name="idp62130176" id=
+                "idp62130176"></a>2.1.3.&nbsp;Document types</h3>
              </div>
            </div>
          </div>
@ -1131,8 +1131,8 @@ indexedmimetypes = application/pdf
          <div class="titlepage">
            <div>
              <div>
-                <h3 class="title"><a name="idp40843200" id=
-                "idp40843200"></a>2.1.4.&nbsp;Indexing
+                <h3 class="title"><a name="idp62154272" id=
+                "idp62154272"></a>2.1.4.&nbsp;Indexing
                failures</h3>
              </div>
            </div>
@ -1172,8 +1172,8 @@ indexedmimetypes = application/pdf
          <div class="titlepage">
            <div>
              <div>
-                <h3 class="title"><a name="idp40850208" id=
-                "idp40850208"></a>2.1.5.&nbsp;Recovery</h3>
+                <h3 class="title"><a name="idp62161280" id=
+                "idp62161280"></a>2.1.5.&nbsp;Recovery</h3>
              </div>
            </div>
          </div>
@ -2042,8 +2042,20 @@ class MetaFixer(object):
            pass
    
        return txt
+    def wrapup(self, metaheaders):
+        pass
        
 </pre>
+
+          <p>If the 'metafix()' method is defined, it is called for
+          each metadata field. A new MetaFixer object is created
+          for each PDF document, so the object can keep state
+          (for example, to eliminate duplicate values). If the
+          'wrapup()' method is defined, it is called at the end of
+          XMP field processing with the whole metadata as its
+          parameter, a list of '(nm, val)' pairs, allowing an
+          alternate approach for editing, adding, or deleting
+          fields.</p>
        </div>

        <div class="sect2">
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -1174,8 +1174,17 @@ class MetaFixer(object):
            pass
    
        return txt
+    def wrapup(self, metaheaders):
+        pass
        </programlisting>

+        <para>If the 'metafix()' method is defined, it is called for each
+        metadata field. A new MetaFixer object is created for each PDF
+        document, so the object can keep state (for example, to
+        eliminate duplicate values). If the 'wrapup()' method is defined, it
+        is called at the end of XMP field processing with the whole
+        metadata as its parameter, a list of '(nm, val)' pairs, allowing
+        an alternate approach for editing, adding, or deleting fields.</para>
        
        <!-- <para> There is a <ulink url="&WIKI;PDFXMP.wiki">complete example of XMP
        tags setup</ulink>, including a  nice result list paragraph format in the 
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -400,23 +400,20 @@ class PDFExtractor:

        emf = EMF.MetaFixer() if EMF else None

+        # Execute pdfinfo and extract the XML packet
        all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
-
-        # Extract the XML packet
        res = self.re_xmlpacket.search(all)
-        xml = ''
-        if res:
-            xml = res.group(1)
+        xml = res.group(1) if res else ''
        #self.em.rclog("extrameta: XML: [%s]" % xml)
        if not xml:
            return html

+        # Process the XML data
+        root = ET.fromstring(xml)
+        # Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
        # The namespace thing is a drag. Can't do it from the top. See
        # the stackoverflow ref above. Maybe we'd be better off just
        # walking the full tree and building the namespaces dict.
-        root = ET.fromstring(xml)
-
-        # Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
        if root.tag.endswith('RDF'):
            rdf = root
        else:
@ -441,13 +438,21 @@ class PDFExtractor:
                if elt is not None:
                    text = self._xmltreetext(elt).encode('UTF-8')
                    if emf:
-                        text = emf.metafix(metanm, text)
+                        try:
+                            text = emf.metafix(metanm, text)
+                        except:
+                            pass
                    # Should we set empty values ?
                    if text:
                        # Can't use setfield as it only works for
                        # text/plain output at the moment.
                        metaheaders.append((rclnm, text))
        if metaheaders:
+            if emf:
+                try:
+                    emf.wrapup(metaheaders)
+                except:
+                    pass
            return self._injectmeta(html, metaheaders)
        else:
            return html