From 6f44dce46694fa433b2f47857844b11b56a9b462 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Mon, 15 May 2017 14:04:55 +0200
Subject: [PATCH] pdf: Added field-fixing method for Xml metadata

---
Review note (this area is ignored by git am): this patch was recovered
from a copy that had been collapsed onto a single line. The line breaks
and the leading whitespace of the context lines are reconstructed and
should be checked against src/filters/rclpdf.py before applying.
_extrametafix was changed in review to accept both bytes and text
values (the caller passes the UTF-8 encoded value, which a text regexp
pattern rejects on Python 3). The hunk headers and the diffstat were
updated for that change; the post-image hash on the "index" line was not.

 src/filters/rclpdf.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py
index 2ac62196..889a2954 100755
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -382,7 +382,37 @@ class PDFExtractor:
         return text.strip()
         # or: return reduce((lambda t,p : t+p+' '),
         #     [e.text for e in elt.iter() if e.text]).strip()
+
+
+    def _extrametafix(self, nm, txt):
+        """Return the value of metadata field nm after local fixes.
+
+        This can be used for local field editing. For now you need to
+        change the program source. Maybe we'll make it more dynamic one
+        day.
+
+        nm is the (original) field name. txt is the text value: the
+        caller passes it UTF-8 encoded, but an unencoded string is
+        accepted too. The possibly modified value is returned, with
+        the same type as txt.
+        """
+        if nm == 'bibtex:pages':
+            # Replace each '--' with '-'. The arguments of replace()
+            # must have the same type as txt (bytes or text).
+            if isinstance(txt, bytes):
+                txt = txt.replace(b'--', b'-')
+            else:
+                txt = txt.replace('--', '-')
+        elif nm == 'someothername':
+            # do something else
+            pass
+        elif nm == 'stillanother':
+            # etc.
+            pass
+        return txt
+
+
 
     def _setextrameta(self, html):
         if not self.pdfinfo:
             return html
@@ -419,8 +449,9 @@ class PDFExtractor:
                     continue
                 if elt is not None:
                     text = self._xmltreetext(elt).encode('UTF-8')
+                    # Should we set empty values ?
                     if text:
-                        # Should we set empty values ?
+                        text = self._extrametafix(metanm, text)
                         # Can't use setfield as it only works for
                         # text/plain output at the moment.
                         metaheaders.append((rclnm, text))