rclpdf: restore pdfextrametafix function and add test
This commit is contained in:
parent
c1d593b104
commit
2e801812fe
@ -100,6 +100,7 @@ class PDFExtractor:
|
||||
# (xmltag,rcltag) pairs
|
||||
self.extrameta = self.config.getConfParam("pdfextrameta")
|
||||
if self.extrameta:
|
||||
self.extrametafix = self.config.getConfParam("pdfextrametafix")
|
||||
self._initextrameta()
|
||||
|
||||
# Check if we need to escape portions of text where old
|
||||
@ -147,8 +148,8 @@ class PDFExtractor:
|
||||
self.extrameta = None
|
||||
return
|
||||
|
||||
# extrameta is like "samename metanm|rclnm ..."
|
||||
# we turn it into a list of pairs
|
||||
# extrameta is like "metanm|rclnm ...", where |rclnm may be absent (keep
|
||||
# original name). Parse into a list of pairs.
|
||||
l = self.extrameta.split()
|
||||
self.extrameta = []
|
||||
for e in l:
|
||||
@ -178,6 +179,18 @@ class PDFExtractor:
|
||||
self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' +
|
||||
br'(.*)' + br'<\?xpacket[ ]+end',
|
||||
flags = re.DOTALL)
|
||||
global EMF
|
||||
EMF = None
|
||||
if self.extrametafix:
|
||||
try:
|
||||
import importlib.util
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
'pdfextrametafix', self.extrametafix)
|
||||
EMF = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(EMF)
|
||||
except Exception as err:
|
||||
self.em.rclog("Import extrametafix failed: %s" % err)
|
||||
pass
|
||||
|
||||
# Extract all attachments if any into temporary directory
|
||||
def extractAttach(self):
|
||||
@ -396,13 +409,12 @@ class PDFExtractor:
|
||||
if not self.pdfinfo:
|
||||
return html
|
||||
|
||||
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
||||
emf = EMF.MetaFixer() if EMF else None
|
||||
|
||||
# Extract the XML packet
|
||||
# Execute pdfinfo and extract the XML packet
|
||||
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
||||
res = self.re_xmlpacket.search(all)
|
||||
xml = ''
|
||||
if res:
|
||||
xml = res.group(1)
|
||||
xml = res.group(1) if res else ''
|
||||
# self.em.rclog("extrameta: XML: [%s]" % xml)
|
||||
if not xml:
|
||||
return html
|
||||
@ -439,13 +451,23 @@ class PDFExtractor:
|
||||
except:
|
||||
fullnm = nm
|
||||
text = rdfdesc.get(fullnm)
|
||||
if text:
|
||||
# Should we set empty values ?
|
||||
if text:
|
||||
if emf:
|
||||
try:
|
||||
text = emf.metafix(metanm, text)
|
||||
except:
|
||||
pass
|
||||
# Can't use setfield as it only works for
|
||||
# text/plain output at the moment.
|
||||
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text))
|
||||
metaheaders.append((rclnm, text))
|
||||
if metaheaders:
|
||||
if emf:
|
||||
try:
|
||||
emf.wrapup(metaheaders)
|
||||
except:
|
||||
pass
|
||||
return self._injectmeta(html, metaheaders)
|
||||
else:
|
||||
return html
|
||||
|
||||
23
tests/config/pdfemf.py
Normal file
23
tests/config/pdfemf.py
Normal file
@ -0,0 +1,23 @@
|
||||
import sys
|
||||
import re
|
||||
|
||||
class MetaFixer(object):
    """Sample metadata fixer used by the PDF extra-metadata test.

    The PDF handler creates one instance per document, calls metafix()
    for each extracted metadata value, then calls wrapup() once with the
    complete list of (name, value) pairs.
    """

    def __init__(self):
        """Nothing to set up: this sample fixer keeps no state."""

    def metafix(self, nm, txt):
        """Return the (possibly altered) value txt for metadata field nm.

        Only 'pdf:Producer' is modified: a unique marker word is appended
        so that the test query can verify that the fixer actually ran.
        Any other field name is returned unchanged.
        """
        #print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr)
        if nm != 'pdf:Producer':
            # Other names (e.g. 'someothername', 'stillanother') could get
            # their own processing here; this sample leaves them alone.
            return txt
        txt += " metafixerunique"
        return txt

    def wrapup(self, metaheaders):
        """Final hook receiving the whole metadata list; no-op here."""
        #print("Metafixer: wrapup: %s" % metaheaders, file=sys.stderr)
        return None
|
||||
@ -39,6 +39,8 @@ daemSkippedPaths = \
|
||||
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
|
||||
|
||||
pdfextrameta = pdf:Producer dc:identifier
|
||||
pdfextrametafix = /home/dockes/projets/fulltext/recoll/tests/config/pdfemf.py
|
||||
|
||||
[/home/dockes/projets/fulltext/testrecoll/pdf]
|
||||
pdfocr = 1
|
||||
pdfocrlang = eng
|
||||
|
||||
@ -11,7 +11,7 @@ initvariables $0
|
||||
# defaults field file, and this can't be overruled afaics, so
|
||||
# url is prefixed for the dc:identifier search to work
|
||||
recollq dc:identifier:10.12345/sampledoi
|
||||
recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
|
||||
recollq 'pdf:Producer:"GPL Ghostscript 9.18" metafixerunique'
|
||||
recollq '"bubbleupnp server to simulate openhome"'
|
||||
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user