rclpdf: restore pdfextrametafix function and add test

This commit is contained in:
Jean-Francois Dockes 2019-09-04 09:38:11 +02:00
parent c1d593b104
commit 2e801812fe
4 changed files with 56 additions and 9 deletions

View File

@ -100,6 +100,7 @@ class PDFExtractor:
# (xmltag,rcltag) pairs # (xmltag,rcltag) pairs
self.extrameta = self.config.getConfParam("pdfextrameta") self.extrameta = self.config.getConfParam("pdfextrameta")
if self.extrameta: if self.extrameta:
self.extrametafix = self.config.getConfParam("pdfextrametafix")
self._initextrameta() self._initextrameta()
# Check if we need to escape portions of text where old # Check if we need to escape portions of text where old
@ -147,8 +148,8 @@ class PDFExtractor:
self.extrameta = None self.extrameta = None
return return
# extrameta is like "samename metanm|rclnm ..." # extrameta is like "metanm|rclnm ...", where |rclnm may be absent (keep
# we turn it into a list of pairs # original name). Parse into a list of pairs.
l = self.extrameta.split() l = self.extrameta.split()
self.extrameta = [] self.extrameta = []
for e in l: for e in l:
@ -178,6 +179,18 @@ class PDFExtractor:
self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' + self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' +
br'(.*)' + br'<\?xpacket[ ]+end', br'(.*)' + br'<\?xpacket[ ]+end',
flags = re.DOTALL) flags = re.DOTALL)
global EMF
EMF = None
if self.extrametafix:
try:
import importlib.util
spec = importlib.util.spec_from_file_location(
'pdfextrametafix', self.extrametafix)
EMF = importlib.util.module_from_spec(spec)
spec.loader.exec_module(EMF)
except Exception as err:
self.em.rclog("Import extrametafix failed: %s" % err)
pass
# Extract all attachments if any into temporary directory # Extract all attachments if any into temporary directory
def extractAttach(self): def extractAttach(self):
@ -396,13 +409,12 @@ class PDFExtractor:
if not self.pdfinfo: if not self.pdfinfo:
return html return html
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) emf = EMF.MetaFixer() if EMF else None
# Extract the XML packet # Execute pdfinfo and extract the XML packet
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
res = self.re_xmlpacket.search(all) res = self.re_xmlpacket.search(all)
xml = '' xml = res.group(1) if res else ''
if res:
xml = res.group(1)
# self.em.rclog("extrameta: XML: [%s]" % xml) # self.em.rclog("extrameta: XML: [%s]" % xml)
if not xml: if not xml:
return html return html
@ -439,13 +451,23 @@ class PDFExtractor:
except: except:
fullnm = nm fullnm = nm
text = rdfdesc.get(fullnm) text = rdfdesc.get(fullnm)
# Should we set empty values ?
if text: if text:
# Should we set empty values ? if emf:
try:
text = emf.metafix(metanm, text)
except:
pass
# Can't use setfield as it only works for # Can't use setfield as it only works for
# text/plain output at the moment. # text/plain output at the moment.
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text)) #self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
metaheaders.append((rclnm, text)) metaheaders.append((rclnm, text))
if metaheaders: if metaheaders:
if emf:
try:
emf.wrapup(metaheaders)
except:
pass
return self._injectmeta(html, metaheaders) return self._injectmeta(html, metaheaders)
else: else:
return html return html

23
tests/config/pdfemf.py Normal file
View File

@ -0,0 +1,23 @@
import sys
import re
class MetaFixer(object):
    """Sample metadata fixer used as a test fixture.

    Exposes the two hooks a fixer module is expected to provide:
    ``metafix`` to adjust a single metadata value and ``wrapup`` to
    post-process the collected list once all fields have been seen.
    """

    def __init__(self):
        """Create the fixer; this sample keeps no state."""

    def metafix(self, nm, txt):
        """Return the (possibly modified) value for one metadata field.

        nm is the metadata field name and txt its current value. Only
        'pdf:Producer' is altered: a fixed marker word is appended to it.
        Every other field is returned unchanged.
        """
        if nm == 'pdf:Producer':
            # Tag the value with a recognizable marker word.
            return txt + " metafixerunique"
        if nm in ('someothername', 'stillanother'):
            # Placeholders showing where handling for further field
            # names would go; nothing is changed for them here.
            return txt
        return txt

    def wrapup(self, metaheaders):
        """Final hook receiving the collected metadata; a no-op here."""
        return None

View File

@ -39,6 +39,8 @@ daemSkippedPaths = \
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
pdfextrameta = pdf:Producer dc:identifier pdfextrameta = pdf:Producer dc:identifier
pdfextrametafix = /home/dockes/projets/fulltext/recoll/tests/config/pdfemf.py
[/home/dockes/projets/fulltext/testrecoll/pdf] [/home/dockes/projets/fulltext/testrecoll/pdf]
pdfocr = 1 pdfocr = 1
pdfocrlang = eng pdfocrlang = eng

View File

@ -11,7 +11,7 @@ initvariables $0
# defaults field file, and this can't be overruled afaics, so # defaults field file, and this can't be overruled afaics, so
# url is prefixed for the dc:identifier search to work # url is prefixed for the dc:identifier search to work
recollq dc:identifier:10.12345/sampledoi recollq dc:identifier:10.12345/sampledoi
recollq 'pdf:Producer:"GPL Ghostscript 9.18"' recollq 'pdf:Producer:"GPL Ghostscript 9.18" metafixerunique'
recollq '"bubbleupnp server to simulate openhome"' recollq '"bubbleupnp server to simulate openhome"'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout