diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index af92d057..616cacfa 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -43,6 +43,7 @@ import atexit import signal import rclconfig import glob +import traceback tmpdir = None @@ -174,9 +175,9 @@ class PDFExtractor: self.pdfinfo = None return - self.re_head = re.compile(r'', re.IGNORECASE) - self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' + - r'(.*)' + r'<\?xpacket[ ]+end', + self.re_head = re.compile(br'', re.IGNORECASE) + self.re_xmlpacket = re.compile(br'<\?xpacket[ ]+begin.*\?>' + + br'(.*)' + br'<\?xpacket[ ]+end', flags = re.DOTALL) # Extract all attachments if any into temporary directory @@ -268,7 +269,7 @@ class PDFExtractor: global tmpdir if not tmpdir: - return "" + return b"" tesseractlang = self.guesstesseractlang() # self.em.rclog("tesseractlang %s" % tesseractlang) @@ -283,7 +284,7 @@ class PDFExtractor: tmpfile]) except Exception as e: self.em.rclog("pdftoppm failed: %s" % e) - return "" + return b"" files = glob.glob(tmpfile + "*") for f in files: @@ -300,17 +301,17 @@ class PDFExtractor: # Concatenate the result files files = glob.glob(tmpfile + "*" + ".txt") - data = "" + data = b"" for f in files: - data += open(f, "r").read() + data += open(f, "rb").read() if not data: - return "" - return ''' + return b"" + return b'''
''' + \
         self.em.htmlescape(data) + \
-        '''
''' + b'''''' # pdftotext (used to?) badly escape text inside the header @@ -349,7 +350,7 @@ class PDFExtractor: elif inbody: s = line[0:1] - if s != "\x0c" and s != "<": + if s != b"\x0c" and s != b"<": isempty = False # We used to remove end-of-line hyphenation (and join # lines), but but it's not clear that we should do @@ -366,17 +367,17 @@ class PDFExtractor: return b'\n'.join(output), isempty def _metatag(self, nm, val): - return "" + return b"" # metaheaders is a list of (nm, value) pairs def _injectmeta(self, html, metaheaders): - metatxt = '' + metatxt = b'' for nm, val in metaheaders: - metatxt += self._metatag(nm, val) + '\n' + metatxt += self._metatag(nm, val) + b'\n' if not metatxt: return html - res = self.re_head.sub('\n' + metatxt, html) + res = self.re_head.sub(b'\n' + metatxt, html) #self.em.rclog("Substituted html: [%s]"%res) if res: return res @@ -395,7 +396,7 @@ class PDFExtractor: def _setextrameta(self, html): if not self.pdfinfo: - return + return html all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) @@ -417,8 +418,9 @@ class PDFExtractor: namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} rdf = root.find("rdf:RDF", namespaces) #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap) + if rdf is None: + return html rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap) - #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap) for metanm,rclnm in self.extrameta: for rdfdesc in rdfdesclist: try: @@ -427,16 +429,29 @@ class PDFExtractor: # We get an exception when this rdf:Description does not # define the required namespace. continue + text = None if elt is not None: text = self._xmltreetext(elt) - if text: - # Should we set empty values ? - # Can't use setfield as it only works for - # text/plain output at the moment. - metaheaders.append((rclnm, text)) + else: + # Some docs define the values as attributes. don't + # know if this is valid but anyway... + try: + prefix,nm = metanm.split(":") + fullnm = "{%s}%s" % (rdfdesc.nsmap[prefix], nm) + except: + fullnm = nm + text = rdfdesc.get(fullnm) + if text: + # Should we set empty values ? + # Can't use setfield as it only works for + # text/plain output at the moment. + #self.em.rclog("Appending: (%s,%s)"%(rclnm,text)) + metaheaders.append((rclnm, text)) if metaheaders: return self._injectmeta(html, metaheaders) - + else: + return html + def _selfdoc(self): '''Extract the text from the pdf doc (as opposed to attachment)''' self.em.setmimetype('text/html') @@ -460,7 +475,7 @@ class PDFExtractor: try: html = self._setextrameta(html) except Exception as err: - self.em.rclog("Metadata extraction failed: %s" % err) + self.em.rclog("Metadata extraction failed: %s %s" % (err, traceback.format_exc())) return (True, html, "", eof)