From 2e801812fec260392356968df006d604c6923566 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 4 Sep 2019 09:38:11 +0200 Subject: [PATCH] rclpdf: restore pdfextrametafix function and add test --- src/filters/rclpdf.py | 38 ++++++++++++++++++++++++++++++-------- tests/config/pdfemf.py | 23 +++++++++++++++++++++++ tests/config/recoll.conf | 2 ++ tests/pdf/pdf.sh | 2 +- 4 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 tests/config/pdfemf.py diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 16d91be1..27ed7af9 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -100,6 +100,7 @@ class PDFExtractor: # (xmltag,rcltag) pairs self.extrameta = self.config.getConfParam("pdfextrameta") if self.extrameta: + self.extrametafix = self.config.getConfParam("pdfextrametafix") self._initextrameta() # Check if we need to escape portions of text where old @@ -147,8 +148,8 @@ class PDFExtractor: self.extrameta = None return - # extrameta is like "samename metanm|rclnm ..." - # we turn it into a list of pairs + # extrameta is like "metanm|rclnm ...", where |rclnm may be absent (keep + # original name). Parse into a list of pairs. 
l = self.extrameta.split() self.extrameta = [] for e in l: @@ -178,6 +179,18 @@ class PDFExtractor: self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' + br'(.*)' + br'<\?xpacket[ ]+end', flags = re.DOTALL) + global EMF + EMF = None + if self.extrametafix: + try: + import importlib.util + spec = importlib.util.spec_from_file_location( + 'pdfextrametafix', self.extrametafix) + EMF = importlib.util.module_from_spec(spec) + spec.loader.exec_module(EMF) + except Exception as err: + self.em.rclog("Import extrametafix failed: %s" % err) + pass # Extract all attachments if any into temporary directory def extractAttach(self): @@ -396,13 +409,12 @@ class PDFExtractor: if not self.pdfinfo: return html - all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) + emf = EMF.MetaFixer() if EMF else None - # Extract the XML packet + # Execute pdfinfo and extract the XML packet + all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) res = self.re_xmlpacket.search(all) - xml = '' - if res: - xml = res.group(1) + xml = res.group(1) if res else '' # self.em.rclog("extrameta: XML: [%s]" % xml) if not xml: return html @@ -439,13 +451,23 @@ class PDFExtractor: except: fullnm = nm text = rdfdesc.get(fullnm) + # Should we set empty values ? if text: - # Should we set empty values ? + if emf: + try: + text = emf.metafix(metanm, text) + except: + pass # Can't use setfield as it only works for # text/plain output at the moment. 
#self.em.rclog("Appending: (%s,%s)"%(rclnm,text)) metaheaders.append((rclnm, text)) if metaheaders: + if emf: + try: + emf.wrapup(metaheaders) + except: + pass return self._injectmeta(html, metaheaders) else: return html diff --git a/tests/config/pdfemf.py b/tests/config/pdfemf.py new file mode 100644 index 00000000..3c973b93 --- /dev/null +++ b/tests/config/pdfemf.py @@ -0,0 +1,23 @@ +import sys +import re + +class MetaFixer(object): + def __init__(self): + pass + + def metafix(self, nm, txt): + #print("Metafixer: mfix: nm [%s] txt [%s]" % (nm, txt), file=sys.stderr) + if nm == 'pdf:Producer': + txt += " metafixerunique" + elif nm == 'someothername': + # do something else + pass + elif nm == 'stillanother': + # etc. + pass + + return txt + + def wrapup(self, metaheaders): + #print("Metafixer: wrapup: %s" % metaheaders, file=sys.stderr) + pass diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf index 40c141c6..680af438 100644 --- a/tests/config/recoll.conf +++ b/tests/config/recoll.conf @@ -39,6 +39,8 @@ daemSkippedPaths = \ unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl pdfextrameta = pdf:Producer dc:identifier +pdfextrametafix = /home/dockes/projets/fulltext/recoll/tests/config/pdfemf.py + [/home/dockes/projets/fulltext/testrecoll/pdf] pdfocr = 1 pdfocrlang = eng diff --git a/tests/pdf/pdf.sh b/tests/pdf/pdf.sh index fe5a1ce6..80d95a29 100755 --- a/tests/pdf/pdf.sh +++ b/tests/pdf/pdf.sh @@ -11,7 +11,7 @@ initvariables $0 # defaults field file, and this can't be overruled afaics, so # url is prefixed for the dc:identifier search to work recollq dc:identifier:10.12345/sampledoi - recollq 'pdf:Producer:"GPL Ghostscript 9.18"' + recollq 'pdf:Producer:"GPL Ghostscript 9.18" metafixerunique' recollq '"bubbleupnp server to simulate openhome"' ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout