pdf: add and document MetaFixer::wrapup() method
This commit is contained in:
parent
41eb89bbe0
commit
123d5b36ad
@ -20,8 +20,8 @@ alink="#0000FF">
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h1 class="title"><a name="idp35245072" id=
|
<h1 class="title"><a name="idp56557776" id=
|
||||||
"idp35245072"></a>Recoll user manual</h1>
|
"idp56557776"></a>Recoll user manual</h1>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
@ -109,13 +109,13 @@ alink="#0000FF">
|
|||||||
multiple indexes</a></span></dt>
|
multiple indexes</a></span></dt>
|
||||||
|
|
||||||
<dt><span class="sect2">2.1.3. <a href=
|
<dt><span class="sect2">2.1.3. <a href=
|
||||||
"#idp40818624">Document types</a></span></dt>
|
"#idp62130176">Document types</a></span></dt>
|
||||||
|
|
||||||
<dt><span class="sect2">2.1.4. <a href=
|
<dt><span class="sect2">2.1.4. <a href=
|
||||||
"#idp40843200">Indexing failures</a></span></dt>
|
"#idp62154272">Indexing failures</a></span></dt>
|
||||||
|
|
||||||
<dt><span class="sect2">2.1.5. <a href=
|
<dt><span class="sect2">2.1.5. <a href=
|
||||||
"#idp40850208">Recovery</a></span></dt>
|
"#idp62161280">Recovery</a></span></dt>
|
||||||
</dl>
|
</dl>
|
||||||
</dd>
|
</dd>
|
||||||
|
|
||||||
@ -1017,8 +1017,8 @@ alink="#0000FF">
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h3 class="title"><a name="idp40818624" id=
|
<h3 class="title"><a name="idp62130176" id=
|
||||||
"idp40818624"></a>2.1.3. Document types</h3>
|
"idp62130176"></a>2.1.3. Document types</h3>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -1131,8 +1131,8 @@ indexedmimetypes = application/pdf
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h3 class="title"><a name="idp40843200" id=
|
<h3 class="title"><a name="idp62154272" id=
|
||||||
"idp40843200"></a>2.1.4. Indexing
|
"idp62154272"></a>2.1.4. Indexing
|
||||||
failures</h3>
|
failures</h3>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -1172,8 +1172,8 @@ indexedmimetypes = application/pdf
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h3 class="title"><a name="idp40850208" id=
|
<h3 class="title"><a name="idp62161280" id=
|
||||||
"idp40850208"></a>2.1.5. Recovery</h3>
|
"idp62161280"></a>2.1.5. Recovery</h3>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -2042,8 +2042,20 @@ class MetaFixer(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
return txt
|
return txt
|
||||||
|
def wrapup(self, metaheaders):
|
||||||
|
pass
|
||||||
|
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
|
<p>If the 'metafix()' method is defined, it is called for
|
||||||
|
each metadata field. A new MetaFixer object is created
|
||||||
|
for each PDF document (so that the object can keep state,
|
||||||
|
for example to eliminate duplicate values). If the
|
||||||
|
'wrapup()' method is defined, it is called at the end of
|
||||||
|
XMP field processing, with the whole metadata as its
|
||||||
|
parameter, as a list of '(nm, val)' pairs, allowing an
|
||||||
|
alternate approach for editing or adding/deleting
|
||||||
|
fields.</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="sect2">
|
<div class="sect2">
|
||||||
|
|||||||
@ -1174,8 +1174,17 @@ class MetaFixer(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
return txt
|
return txt
|
||||||
|
def wrapup(self, metaheaders):
|
||||||
|
pass
|
||||||
</programlisting>
|
</programlisting>
|
||||||
|
|
||||||
|
<para>If the 'metafix()' method is defined, it is called for each
|
||||||
|
metadata field. A new MetaFixer object is created for each PDF
|
||||||
|
document (so that the object can keep state, for example for
|
||||||
|
eliminating duplicate values). If the 'wrapup()' method is defined, it
|
||||||
|
is called at the end of XMP field processing, with the whole
|
||||||
|
metadata as its parameter, as a list of '(nm, val)' pairs, allowing
|
||||||
|
an alternate approach for editing or adding/deleting fields.</para>
|
||||||
|
|
||||||
<!-- <para> There is a <ulink url="&WIKI;PDFXMP.wiki">complete example of XMP
|
<!-- <para> There is a <ulink url="&WIKI;PDFXMP.wiki">complete example of XMP
|
||||||
tags setup</ulink>, including a nice result list paragraph format in the
|
tags setup</ulink>, including a nice result list paragraph format in the
|
||||||
|
|||||||
@ -400,23 +400,20 @@ class PDFExtractor:
|
|||||||
|
|
||||||
emf = EMF.MetaFixer() if EMF else None
|
emf = EMF.MetaFixer() if EMF else None
|
||||||
|
|
||||||
|
# Execute pdfinfo and extract the XML packet
|
||||||
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
||||||
|
|
||||||
# Extract the XML packet
|
|
||||||
res = self.re_xmlpacket.search(all)
|
res = self.re_xmlpacket.search(all)
|
||||||
xml = ''
|
xml = res.group(1) if res else ''
|
||||||
if res:
|
|
||||||
xml = res.group(1)
|
|
||||||
#self.em.rclog("extrameta: XML: [%s]" % xml)
|
#self.em.rclog("extrameta: XML: [%s]" % xml)
|
||||||
if not xml:
|
if not xml:
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
# Process the XML data
|
||||||
|
root = ET.fromstring(xml)
|
||||||
|
# Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
|
||||||
# The namespace thing is a drag. Can't do it from the top. See
|
# The namespace thing is a drag. Can't do it from the top. See
|
||||||
# the stackoverflow ref above. Maybe we'd be better off just
|
# the stackoverflow ref above. Maybe we'd be better off just
|
||||||
# walking the full tree and building the namespaces dict.
|
# walking the full tree and building the namespaces dict.
|
||||||
root = ET.fromstring(xml)
|
|
||||||
|
|
||||||
# Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
|
|
||||||
if root.tag.endswith('RDF'):
|
if root.tag.endswith('RDF'):
|
||||||
rdf = root
|
rdf = root
|
||||||
else:
|
else:
|
||||||
@ -441,13 +438,21 @@ class PDFExtractor:
|
|||||||
if elt is not None:
|
if elt is not None:
|
||||||
text = self._xmltreetext(elt).encode('UTF-8')
|
text = self._xmltreetext(elt).encode('UTF-8')
|
||||||
if emf:
|
if emf:
|
||||||
text = emf.metafix(metanm, text)
|
try:
|
||||||
|
text = emf.metafix(metanm, text)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
# Should we set empty values ?
|
# Should we set empty values ?
|
||||||
if text:
|
if text:
|
||||||
# Can't use setfield as it only works for
|
# Can't use setfield as it only works for
|
||||||
# text/plain output at the moment.
|
# text/plain output at the moment.
|
||||||
metaheaders.append((rclnm, text))
|
metaheaders.append((rclnm, text))
|
||||||
if metaheaders:
|
if metaheaders:
|
||||||
|
if emf:
|
||||||
|
try:
|
||||||
|
emf.wrapup(metaheaders)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
return self._injectmeta(html, metaheaders)
|
return self._injectmeta(html, metaheaders)
|
||||||
else:
|
else:
|
||||||
return html
|
return html
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user