pdf: add and document MetaFixer::wrapup() method

2017-05-17 08:32:23 +02:00 · 2017-05-17 08:32:23 +02:00 · 123d5b36ad
commit 123d5b36ad
parent 41eb89bbe0
3 changed files with 46 additions and 20 deletions
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@ -20,8 +20,8 @@ alink="#0000FF">
    <div class="titlepage">
      <div>
        <div>
-          <h1 class="title"><a name="idp35245072" id=
+          <h1 class="title"><a name="idp56557776" id=
-          "idp35245072"></a>Recoll user manual</h1>
+          "idp56557776"></a>Recoll user manual</h1>
        </div>
        <div>
@ -109,13 +109,13 @@ alink="#0000FF">
                multiple indexes</a></span></dt>
                <dt><span class="sect2">2.1.3. <a href=
-                "#idp40818624">Document types</a></span></dt>
+                "#idp62130176">Document types</a></span></dt>
                <dt><span class="sect2">2.1.4. <a href=
-                "#idp40843200">Indexing failures</a></span></dt>
+                "#idp62154272">Indexing failures</a></span></dt>
                <dt><span class="sect2">2.1.5. <a href=
-                "#idp40850208">Recovery</a></span></dt>
+                "#idp62161280">Recovery</a></span></dt>
              </dl>
            </dd>
@ -1017,8 +1017,8 @@ alink="#0000FF">
          <div class="titlepage">
            <div>
              <div>
-                <h3 class="title"><a name="idp40818624" id=
+                <h3 class="title"><a name="idp62130176" id=
-                "idp40818624"></a>2.1.3.&nbsp;Document types</h3>
+                "idp62130176"></a>2.1.3.&nbsp;Document types</h3>
              </div>
            </div>
          </div>
@ -1131,8 +1131,8 @@ indexedmimetypes = application/pdf
          <div class="titlepage">
            <div>
              <div>
-                <h3 class="title"><a name="idp40843200" id=
+                <h3 class="title"><a name="idp62154272" id=
-                "idp40843200"></a>2.1.4.&nbsp;Indexing
+                "idp62154272"></a>2.1.4.&nbsp;Indexing
                failures</h3>
              </div>
            </div>
@ -1172,8 +1172,8 @@ indexedmimetypes = application/pdf
          <div class="titlepage">
            <div>
              <div>
-                <h3 class="title"><a name="idp40850208" id=
+                <h3 class="title"><a name="idp62161280" id=
-                "idp40850208"></a>2.1.5.&nbsp;Recovery</h3>
+                "idp62161280"></a>2.1.5.&nbsp;Recovery</h3>
              </div>
            </div>
          </div>
@ -2042,8 +2042,20 @@ class MetaFixer(object):
            pass
        return txt
    def wrapup(self, metaheaders):
        pass
 </pre>
          <p>If the 'metafix()' method is defined, it is called for
          each metadata field. A new MetaFixer object is created
          for each PDF document, so the object can keep state
          (for example, to eliminate duplicate values). If the
          'wrapup()' method is defined, it is called at the end of
          XMP field processing with the whole metadata as its
          parameter, a list of '(nm, val)' pairs, allowing an
          alternative approach for editing, adding or deleting
          fields.</p>
        </div>
        <div class="sect2">
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -1174,8 +1174,17 @@ class MetaFixer(object):
            pass
        return txt
    def wrapup(self, metaheaders):
        pass
        </programlisting>
        <para>If the 'metafix()' method is defined, it is called for each
        metadata field. A new MetaFixer object is created for each PDF
        document, so the object can keep state (for example, to
        eliminate duplicate values). If the 'wrapup()' method is defined, it
        is called at the end of XMP field processing with the whole
        metadata as its parameter, a list of '(nm, val)' pairs, allowing
        an alternative approach for editing, adding or deleting fields.</para>
        <!-- <para> There is a <ulink url="&WIKI;PDFXMP.wiki">complete example of XMP
        tags setup</ulink>, including a  nice result list paragraph format in the 
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -400,23 +400,20 @@ class PDFExtractor:
        emf = EMF.MetaFixer() if EMF else None
        # Execute pdfinfo and extract the XML packet
        all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
        # Extract the XML packet
        res = self.re_xmlpacket.search(all)
-        xml = ''
+        xml = res.group(1) if res else ''
        if res:
            xml = res.group(1)
        #self.em.rclog("extrameta: XML: [%s]" % xml)
        if not xml:
            return html
        # Process the XML data
        root = ET.fromstring(xml)
        # Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
        # The namespace thing is a drag. Can't do it from the top. See
        # the stackoverflow ref above. Maybe we'd be better off just
        # walking the full tree and building the namespaces dict.
        root = ET.fromstring(xml)
        # Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
        if root.tag.endswith('RDF'):
            rdf = root
        else:
@ -441,13 +438,21 @@ class PDFExtractor:
                if elt is not None:
                    text = self._xmltreetext(elt).encode('UTF-8')
                    if emf:
-                        text = emf.metafix(metanm, text)
+                        try:
                            text = emf.metafix(metanm, text)
                        except:
                            pass
                    # Should we set empty values ?
                    if text:
                        # Can't use setfield as it only works for
                        # text/plain output at the moment.
                        metaheaders.append((rclnm, text))
        if metaheaders:
            if emf:
                try:
                    emf.wrapup(metaheaders)
                except:
                    pass
            return self._injectmeta(html, metaheaders)
        else:
            return html