pdf: add and document MetaFixer::wrapup() method

This commit is contained in:
Jean-Francois Dockes 2017-05-17 08:32:23 +02:00
parent 41eb89bbe0
commit 123d5b36ad
3 changed files with 46 additions and 20 deletions

View File

@ -20,8 +20,8 @@ alink="#0000FF">
<div class="titlepage"> <div class="titlepage">
<div> <div>
<div> <div>
<h1 class="title"><a name="idp35245072" id= <h1 class="title"><a name="idp56557776" id=
"idp35245072"></a>Recoll user manual</h1> "idp56557776"></a>Recoll user manual</h1>
</div> </div>
<div> <div>
@ -109,13 +109,13 @@ alink="#0000FF">
multiple indexes</a></span></dt> multiple indexes</a></span></dt>
<dt><span class="sect2">2.1.3. <a href= <dt><span class="sect2">2.1.3. <a href=
"#idp40818624">Document types</a></span></dt> "#idp62130176">Document types</a></span></dt>
<dt><span class="sect2">2.1.4. <a href= <dt><span class="sect2">2.1.4. <a href=
"#idp40843200">Indexing failures</a></span></dt> "#idp62154272">Indexing failures</a></span></dt>
<dt><span class="sect2">2.1.5. <a href= <dt><span class="sect2">2.1.5. <a href=
"#idp40850208">Recovery</a></span></dt> "#idp62161280">Recovery</a></span></dt>
</dl> </dl>
</dd> </dd>
@ -1017,8 +1017,8 @@ alink="#0000FF">
<div class="titlepage"> <div class="titlepage">
<div> <div>
<div> <div>
<h3 class="title"><a name="idp40818624" id= <h3 class="title"><a name="idp62130176" id=
"idp40818624"></a>2.1.3.&nbsp;Document types</h3> "idp62130176"></a>2.1.3.&nbsp;Document types</h3>
</div> </div>
</div> </div>
</div> </div>
@ -1131,8 +1131,8 @@ indexedmimetypes = application/pdf
<div class="titlepage"> <div class="titlepage">
<div> <div>
<div> <div>
<h3 class="title"><a name="idp40843200" id= <h3 class="title"><a name="idp62154272" id=
"idp40843200"></a>2.1.4.&nbsp;Indexing "idp62154272"></a>2.1.4.&nbsp;Indexing
failures</h3> failures</h3>
</div> </div>
</div> </div>
@ -1172,8 +1172,8 @@ indexedmimetypes = application/pdf
<div class="titlepage"> <div class="titlepage">
<div> <div>
<div> <div>
<h3 class="title"><a name="idp40850208" id= <h3 class="title"><a name="idp62161280" id=
"idp40850208"></a>2.1.5.&nbsp;Recovery</h3> "idp62161280"></a>2.1.5.&nbsp;Recovery</h3>
</div> </div>
</div> </div>
</div> </div>
@ -2042,8 +2042,20 @@ class MetaFixer(object):
pass pass
return txt return txt
def wrapup(self, metaheaders):
pass
</pre> </pre>
<p>If the 'metafix()' method is defined, it is called for
each metadata field. A new MetaFixer object is created
for each PDF document, so the object can keep state (for
example, to eliminate duplicate values). If the
'wrapup()' method is defined, it is called at the end of
XMP fields processing with the whole metadata as
parameter, as a list of '(nm, val)' pairs which it can
modify in place, allowing an alternate approach for
editing or adding/deleting fields.</p>
</div> </div>
<div class="sect2"> <div class="sect2">

View File

@ -1174,8 +1174,17 @@ class MetaFixer(object):
pass pass
return txt return txt
def wrapup(self, metaheaders):
pass
</programlisting> </programlisting>
<para>If the 'metafix()' method is defined, it is called for each
metadata field. A new MetaFixer object is created for each PDF
document, so the object can keep state (for example, to eliminate
duplicate values). If the 'wrapup()' method is defined, it
is called at the end of XMP fields processing with the whole
metadata as parameter, as a list of '(nm, val)' pairs which it can
modify in place, allowing an alternate approach for editing or
adding/deleting fields.</para>
<!-- <para> There is a <ulink url="&WIKI;PDFXMP.wiki">complete example of XMP <!-- <para> There is a <ulink url="&WIKI;PDFXMP.wiki">complete example of XMP
tags setup</ulink>, including a nice result list paragraph format in the tags setup</ulink>, including a nice result list paragraph format in the

View File

@ -400,23 +400,20 @@ class PDFExtractor:
emf = EMF.MetaFixer() if EMF else None emf = EMF.MetaFixer() if EMF else None
# Execute pdfinfo and extract the XML packet
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
# Extract the XML packet
res = self.re_xmlpacket.search(all) res = self.re_xmlpacket.search(all)
xml = '' xml = res.group(1) if res else ''
if res:
xml = res.group(1)
#self.em.rclog("extrameta: XML: [%s]" % xml) #self.em.rclog("extrameta: XML: [%s]" % xml)
if not xml: if not xml:
return html return html
# Process the XML data
root = ET.fromstring(xml)
# Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
# The namespace thing is a drag. Can't do it from the top. See # The namespace thing is a drag. Can't do it from the top. See
# the stackoverflow ref above. Maybe we'd be better off just # the stackoverflow ref above. Maybe we'd be better off just
# walking the full tree and building the namespaces dict. # walking the full tree and building the namespaces dict.
root = ET.fromstring(xml)
# Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
if root.tag.endswith('RDF'): if root.tag.endswith('RDF'):
rdf = root rdf = root
else: else:
@ -441,13 +438,21 @@ class PDFExtractor:
if elt is not None: if elt is not None:
text = self._xmltreetext(elt).encode('UTF-8') text = self._xmltreetext(elt).encode('UTF-8')
if emf: if emf:
text = emf.metafix(metanm, text) try:
text = emf.metafix(metanm, text)
except:
pass
# Should we set empty values ? # Should we set empty values ?
if text: if text:
# Can't use setfield as it only works for # Can't use setfield as it only works for
# text/plain output at the moment. # text/plain output at the moment.
metaheaders.append((rclnm, text)) metaheaders.append((rclnm, text))
if metaheaders: if metaheaders:
if emf:
try:
emf.wrapup(metaheaders)
except:
pass
return self._injectmeta(html, metaheaders) return self._injectmeta(html, metaheaders)
else: else:
return html return html