diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 603e1d30..1e6852ea 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -86,9 +86,10 @@ class PDFExtractor: self.pdftotext = rclexecm.which("pdftotext") if not self.pdftotext: self.pdftotext = rclexecm.which("poppler/pdftotext") - # No need for anything else. openfile() will return an - # error at once - return + if not self.pdftotext: + # No need for anything else. openfile() will return an + # error at once + return cf = rclconfig.RclConfig() self.confdir = cf.getConfDir() @@ -98,7 +99,6 @@ class PDFExtractor: # (xmltag,rcltag) pairs self.extrameta = cf.getConfParam("pdfextrameta") if self.extrameta: - self.extrametafix = cf.getConfParam("pdfextrametafix") self._initextrameta() # Check if we need to escape portions of text where old @@ -179,16 +179,7 @@ class PDFExtractor: self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' + r'(.*)' + r'<\?xpacket[ ]+end', flags = re.DOTALL) - global EMF - EMF = None - if self.extrametafix: - try: - import imp - EMF = imp.load_source('pdfextrametafix', self.extrametafix) - except Exception as err: - self.em.rclog("Import extrametafix failed: %s" % err) - pass - + # Extract all attachments if any into temporary directory def extractAttach(self): if self.attextractdone: @@ -366,17 +357,17 @@ class PDFExtractor: return output, isempty def _metatag(self, nm, val): - return b"" # metaheaders is a list of (nm, value) pairs def _injectmeta(self, html, metaheaders): - metatxt = b'' + metatxt = '' for nm, val in metaheaders: - metatxt += self._metatag(nm, val) + b'\n' + metatxt += self._metatag(nm, val) + '\n' if not metatxt: return html - res = self.re_head.sub(b'
\n' + metatxt, html) + res = self.re_head.sub('\n' + metatxt, html) #self.em.rclog("Substituted html: [%s]"%res) if res: return res @@ -392,38 +383,30 @@ class PDFExtractor: return text.strip() # or: return reduce((lambda t,p : t+p+' '), # [e.text for e in elt.iter() if e.text]).strip() - - + def _setextrameta(self, html): if not self.pdfinfo: - return html + return - emf = EMF.MetaFixer() if EMF else None - - # Execute pdfinfo and extract the XML packet all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) + + # Extract the XML packet res = self.re_xmlpacket.search(all) - xml = res.group(1) if res else '' - #self.em.rclog("extrameta: XML: [%s]" % xml) + xml = '' + if res: + xml = res.group(1) + # self.em.rclog("extrameta: XML: [%s]" % xml) if not xml: return html - # Process the XML data - root = ET.fromstring(xml) - # Sometimes the root tag is