Fix Windows PDF indexing. The successful test for poppler/pdftotext was not acknowledged, so PDF indexing always failed.

This commit is contained in:
Jean-Francois Dockes 2018-01-19 13:17:15 +01:00
parent e1662b655d
commit f778274b42

View File

@ -86,9 +86,10 @@ class PDFExtractor:
self.pdftotext = rclexecm.which("pdftotext") self.pdftotext = rclexecm.which("pdftotext")
if not self.pdftotext: if not self.pdftotext:
self.pdftotext = rclexecm.which("poppler/pdftotext") self.pdftotext = rclexecm.which("poppler/pdftotext")
# No need for anything else. openfile() will return an if not self.pdftotext:
# error at once # No need for anything else. openfile() will return an
return # error at once
return
cf = rclconfig.RclConfig() cf = rclconfig.RclConfig()
self.confdir = cf.getConfDir() self.confdir = cf.getConfDir()
@ -98,7 +99,6 @@ class PDFExtractor:
# (xmltag,rcltag) pairs # (xmltag,rcltag) pairs
self.extrameta = cf.getConfParam("pdfextrameta") self.extrameta = cf.getConfParam("pdfextrameta")
if self.extrameta: if self.extrameta:
self.extrametafix = cf.getConfParam("pdfextrametafix")
self._initextrameta() self._initextrameta()
# Check if we need to escape portions of text where old # Check if we need to escape portions of text where old
@ -179,16 +179,7 @@ class PDFExtractor:
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' + self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
r'(.*)' + r'<\?xpacket[ ]+end', r'(.*)' + r'<\?xpacket[ ]+end',
flags = re.DOTALL) flags = re.DOTALL)
global EMF
EMF = None
if self.extrametafix:
try:
import imp
EMF = imp.load_source('pdfextrametafix', self.extrametafix)
except Exception as err:
self.em.rclog("Import extrametafix failed: %s" % err)
pass
# Extract all attachments if any into temporary directory # Extract all attachments if any into temporary directory
def extractAttach(self): def extractAttach(self):
if self.attextractdone: if self.attextractdone:
@ -366,17 +357,17 @@ class PDFExtractor:
return output, isempty return output, isempty
def _metatag(self, nm, val): def _metatag(self, nm, val):
return b"<meta name=\"" + nm + "\" content=\"" + \ return "<meta name=\"" + nm + "\" content=\"" + \
self.em.htmlescape(val) + "\">" self.em.htmlescape(val) + "\">"
# metaheaders is a list of (nm, value) pairs # metaheaders is a list of (nm, value) pairs
def _injectmeta(self, html, metaheaders): def _injectmeta(self, html, metaheaders):
metatxt = b'' metatxt = ''
for nm, val in metaheaders: for nm, val in metaheaders:
metatxt += self._metatag(nm, val) + b'\n' metatxt += self._metatag(nm, val) + '\n'
if not metatxt: if not metatxt:
return html return html
res = self.re_head.sub(b'<head>\n' + metatxt, html) res = self.re_head.sub('<head>\n' + metatxt, html)
#self.em.rclog("Substituted html: [%s]"%res) #self.em.rclog("Substituted html: [%s]"%res)
if res: if res:
return res return res
@ -392,38 +383,30 @@ class PDFExtractor:
return text.strip() return text.strip()
# or: return reduce((lambda t,p : t+p+' '), # or: return reduce((lambda t,p : t+p+' '),
# [e.text for e in elt.iter() if e.text]).strip() # [e.text for e in elt.iter() if e.text]).strip()
def _setextrameta(self, html): def _setextrameta(self, html):
if not self.pdfinfo: if not self.pdfinfo:
return html return
emf = EMF.MetaFixer() if EMF else None
# Execute pdfinfo and extract the XML packet
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
# Extract the XML packet
res = self.re_xmlpacket.search(all) res = self.re_xmlpacket.search(all)
xml = res.group(1) if res else '' xml = ''
#self.em.rclog("extrameta: XML: [%s]" % xml) if res:
xml = res.group(1)
# self.em.rclog("extrameta: XML: [%s]" % xml)
if not xml: if not xml:
return html return html
# Process the XML data metaheaders = []
root = ET.fromstring(xml)
# Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
# The namespace thing is a drag. Can't do it from the top. See # The namespace thing is a drag. Can't do it from the top. See
# the stackoverflow ref above. Maybe we'd be better off just # the stackoverflow ref above. Maybe we'd be better off just
# walking the full tree and building the namespaces dict. # walking the full tree and building the namespaces dict.
if root.tag.endswith('RDF'): root = ET.fromstring(xml)
rdf = root #self.em.rclog("NSMAP: %s"% root.nsmap)
else: namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} rdf = root.find("rdf:RDF", namespaces)
rdf = root.find("rdf:RDF", namespaces)
if rdf is None:
self.em.rclog("No rdf:RDF node");
return html
metaheaders = []
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap) #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap) rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap) #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
@ -436,27 +419,15 @@ class PDFExtractor:
# define the required namespace. # define the required namespace.
continue continue
if elt is not None: if elt is not None:
text = self._xmltreetext(elt).encode('UTF-8') text = self._xmltreetext(elt)
if emf:
try:
text = emf.metafix(metanm, text)
except:
pass
# Should we set empty values ?
if text: if text:
# Should we set empty values ?
# Can't use setfield as it only works for # Can't use setfield as it only works for
# text/plain output at the moment. # text/plain output at the moment.
metaheaders.append((rclnm, text)) metaheaders.append((rclnm, text))
if metaheaders: if metaheaders:
if emf:
try:
emf.wrapup(metaheaders)
except:
pass
return self._injectmeta(html, metaheaders) return self._injectmeta(html, metaheaders)
else:
return html
def _selfdoc(self): def _selfdoc(self):
'''Extract the text from the pdf doc (as opposed to attachment)''' '''Extract the text from the pdf doc (as opposed to attachment)'''
self.em.setmimetype('text/html') self.em.setmimetype('text/html')
@ -465,13 +436,13 @@ class PDFExtractor:
eof = rclexecm.RclExecM.eofnext eof = rclexecm.RclExecM.eofnext
else: else:
eof = rclexecm.RclExecM.noteof eof = rclexecm.RclExecM.noteof
html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc", html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc",
"UTF-8", "-eol", "unix", "-q", "UTF-8", "-eol", "unix", "-q",
self.filename, "-"]) self.filename, "-"])
html, isempty = self._fixhtml(html) html, isempty = self._fixhtml(html)
#self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html)) #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
if isempty and self.ocrpossible: if isempty and self.ocrpossible:
html = self.ocrpdf() html = self.ocrpdf()