diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 603e1d30..1e6852ea 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -86,9 +86,10 @@ class PDFExtractor: self.pdftotext = rclexecm.which("pdftotext") if not self.pdftotext: self.pdftotext = rclexecm.which("poppler/pdftotext") - # No need for anything else. openfile() will return an - # error at once - return + if not self.pdftotext: + # No need for anything else. openfile() will return an + # error at once + return cf = rclconfig.RclConfig() self.confdir = cf.getConfDir() @@ -98,7 +99,6 @@ class PDFExtractor: # (xmltag,rcltag) pairs self.extrameta = cf.getConfParam("pdfextrameta") if self.extrameta: - self.extrametafix = cf.getConfParam("pdfextrametafix") self._initextrameta() # Check if we need to escape portions of text where old @@ -179,16 +179,7 @@ class PDFExtractor: self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' + r'(.*)' + r'<\?xpacket[ ]+end', flags = re.DOTALL) - global EMF - EMF = None - if self.extrametafix: - try: - import imp - EMF = imp.load_source('pdfextrametafix', self.extrametafix) - except Exception as err: - self.em.rclog("Import extrametafix failed: %s" % err) - pass - + # Extract all attachments if any into temporary directory def extractAttach(self): if self.attextractdone: @@ -366,17 +357,17 @@ class PDFExtractor: return output, isempty def _metatag(self, nm, val): - return b"" # metaheaders is a list of (nm, value) pairs def _injectmeta(self, html, metaheaders): - metatxt = b'' + metatxt = '' for nm, val in metaheaders: - metatxt += self._metatag(nm, val) + b'\n' + metatxt += self._metatag(nm, val) + '\n' if not metatxt: return html - res = self.re_head.sub(b'\n' + metatxt, html) + res = self.re_head.sub('\n' + metatxt, html) #self.em.rclog("Substituted html: [%s]"%res) if res: return res @@ -392,38 +383,30 @@ class PDFExtractor: return text.strip() # or: return reduce((lambda t,p : t+p+' '), # [e.text for e in elt.iter() if e.text]).strip() - - + def _setextrameta(self, html): if not self.pdfinfo: - return html + return - emf = EMF.MetaFixer() if EMF else None - - # Execute pdfinfo and extract the XML packet all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) + + # Extract the XML packet res = self.re_xmlpacket.search(all) - xml = res.group(1) if res else '' - #self.em.rclog("extrameta: XML: [%s]" % xml) + xml = '' + if res: + xml = res.group(1) + # self.em.rclog("extrameta: XML: [%s]" % xml) if not xml: return html - # Process the XML data - root = ET.fromstring(xml) - # Sometimes the root tag is , sometimes + metaheaders = [] # The namespace thing is a drag. Can't do it from the top. See # the stackoverflow ref above. Maybe we'd be better off just # walking the full tree and building the namespaces dict. - if root.tag.endswith('RDF'): - rdf = root - else: - namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} - rdf = root.find("rdf:RDF", namespaces) - if rdf is None: - self.em.rclog("No rdf:RDF node"); - return html - - metaheaders = [] + root = ET.fromstring(xml) + #self.em.rclog("NSMAP: %s"% root.nsmap) + namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} + rdf = root.find("rdf:RDF", namespaces) #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap) rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap) #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap) @@ -436,27 +419,15 @@ class PDFExtractor: # define the required namespace. continue if elt is not None: - text = self._xmltreetext(elt).encode('UTF-8') - if emf: - try: - text = emf.metafix(metanm, text) - except: - pass - # Should we set empty values ? + text = self._xmltreetext(elt) if text: + # Should we set empty values ? # Can't use setfield as it only works for # text/plain output at the moment. metaheaders.append((rclnm, text)) if metaheaders: - if emf: - try: - emf.wrapup(metaheaders) - except: - pass return self._injectmeta(html, metaheaders) - else: - return html - + def _selfdoc(self): '''Extract the text from the pdf doc (as opposed to attachment)''' self.em.setmimetype('text/html') @@ -465,13 +436,13 @@ class PDFExtractor: eof = rclexecm.RclExecM.eofnext else: eof = rclexecm.RclExecM.noteof - + html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc", "UTF-8", "-eol", "unix", "-q", self.filename, "-"]) html, isempty = self._fixhtml(html) - #self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html)) + #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) if isempty and self.ocrpossible: html = self.ocrpdf()