Fix Windows PDF indexing. The successful test for poppler/pdftotext was not acknowledged, so PDF indexing always failed.
This commit is contained in:
parent
e1662b655d
commit
f778274b42
@ -86,9 +86,10 @@ class PDFExtractor:
|
|||||||
self.pdftotext = rclexecm.which("pdftotext")
|
self.pdftotext = rclexecm.which("pdftotext")
|
||||||
if not self.pdftotext:
|
if not self.pdftotext:
|
||||||
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
||||||
# No need for anything else. openfile() will return an
|
if not self.pdftotext:
|
||||||
# error at once
|
# No need for anything else. openfile() will return an
|
||||||
return
|
# error at once
|
||||||
|
return
|
||||||
|
|
||||||
cf = rclconfig.RclConfig()
|
cf = rclconfig.RclConfig()
|
||||||
self.confdir = cf.getConfDir()
|
self.confdir = cf.getConfDir()
|
||||||
@ -98,7 +99,6 @@ class PDFExtractor:
|
|||||||
# (xmltag,rcltag) pairs
|
# (xmltag,rcltag) pairs
|
||||||
self.extrameta = cf.getConfParam("pdfextrameta")
|
self.extrameta = cf.getConfParam("pdfextrameta")
|
||||||
if self.extrameta:
|
if self.extrameta:
|
||||||
self.extrametafix = cf.getConfParam("pdfextrametafix")
|
|
||||||
self._initextrameta()
|
self._initextrameta()
|
||||||
|
|
||||||
# Check if we need to escape portions of text where old
|
# Check if we need to escape portions of text where old
|
||||||
@ -179,16 +179,7 @@ class PDFExtractor:
|
|||||||
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
|
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
|
||||||
r'(.*)' + r'<\?xpacket[ ]+end',
|
r'(.*)' + r'<\?xpacket[ ]+end',
|
||||||
flags = re.DOTALL)
|
flags = re.DOTALL)
|
||||||
global EMF
|
|
||||||
EMF = None
|
|
||||||
if self.extrametafix:
|
|
||||||
try:
|
|
||||||
import imp
|
|
||||||
EMF = imp.load_source('pdfextrametafix', self.extrametafix)
|
|
||||||
except Exception as err:
|
|
||||||
self.em.rclog("Import extrametafix failed: %s" % err)
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Extract all attachments if any into temporary directory
|
# Extract all attachments if any into temporary directory
|
||||||
def extractAttach(self):
|
def extractAttach(self):
|
||||||
if self.attextractdone:
|
if self.attextractdone:
|
||||||
@ -366,17 +357,17 @@ class PDFExtractor:
|
|||||||
return output, isempty
|
return output, isempty
|
||||||
|
|
||||||
def _metatag(self, nm, val):
|
def _metatag(self, nm, val):
|
||||||
return b"<meta name=\"" + nm + "\" content=\"" + \
|
return "<meta name=\"" + nm + "\" content=\"" + \
|
||||||
self.em.htmlescape(val) + "\">"
|
self.em.htmlescape(val) + "\">"
|
||||||
|
|
||||||
# metaheaders is a list of (nm, value) pairs
|
# metaheaders is a list of (nm, value) pairs
|
||||||
def _injectmeta(self, html, metaheaders):
|
def _injectmeta(self, html, metaheaders):
|
||||||
metatxt = b''
|
metatxt = ''
|
||||||
for nm, val in metaheaders:
|
for nm, val in metaheaders:
|
||||||
metatxt += self._metatag(nm, val) + b'\n'
|
metatxt += self._metatag(nm, val) + '\n'
|
||||||
if not metatxt:
|
if not metatxt:
|
||||||
return html
|
return html
|
||||||
res = self.re_head.sub(b'<head>\n' + metatxt, html)
|
res = self.re_head.sub('<head>\n' + metatxt, html)
|
||||||
#self.em.rclog("Substituted html: [%s]"%res)
|
#self.em.rclog("Substituted html: [%s]"%res)
|
||||||
if res:
|
if res:
|
||||||
return res
|
return res
|
||||||
@ -392,38 +383,30 @@ class PDFExtractor:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
# or: return reduce((lambda t,p : t+p+' '),
|
# or: return reduce((lambda t,p : t+p+' '),
|
||||||
# [e.text for e in elt.iter() if e.text]).strip()
|
# [e.text for e in elt.iter() if e.text]).strip()
|
||||||
|
|
||||||
|
|
||||||
def _setextrameta(self, html):
|
def _setextrameta(self, html):
|
||||||
if not self.pdfinfo:
|
if not self.pdfinfo:
|
||||||
return html
|
return
|
||||||
|
|
||||||
emf = EMF.MetaFixer() if EMF else None
|
|
||||||
|
|
||||||
# Execute pdfinfo and extract the XML packet
|
|
||||||
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
||||||
|
|
||||||
|
# Extract the XML packet
|
||||||
res = self.re_xmlpacket.search(all)
|
res = self.re_xmlpacket.search(all)
|
||||||
xml = res.group(1) if res else ''
|
xml = ''
|
||||||
#self.em.rclog("extrameta: XML: [%s]" % xml)
|
if res:
|
||||||
|
xml = res.group(1)
|
||||||
|
# self.em.rclog("extrameta: XML: [%s]" % xml)
|
||||||
if not xml:
|
if not xml:
|
||||||
return html
|
return html
|
||||||
|
|
||||||
# Process the XML data
|
metaheaders = []
|
||||||
root = ET.fromstring(xml)
|
|
||||||
# Sometimes the root tag is <x:xmpmeta>, sometimes <rdf:RDF>
|
|
||||||
# The namespace thing is a drag. Can't do it from the top. See
|
# The namespace thing is a drag. Can't do it from the top. See
|
||||||
# the stackoverflow ref above. Maybe we'd be better off just
|
# the stackoverflow ref above. Maybe we'd be better off just
|
||||||
# walking the full tree and building the namespaces dict.
|
# walking the full tree and building the namespaces dict.
|
||||||
if root.tag.endswith('RDF'):
|
root = ET.fromstring(xml)
|
||||||
rdf = root
|
#self.em.rclog("NSMAP: %s"% root.nsmap)
|
||||||
else:
|
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
|
||||||
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
|
rdf = root.find("rdf:RDF", namespaces)
|
||||||
rdf = root.find("rdf:RDF", namespaces)
|
|
||||||
if rdf is None:
|
|
||||||
self.em.rclog("No rdf:RDF node");
|
|
||||||
return html
|
|
||||||
|
|
||||||
metaheaders = []
|
|
||||||
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
|
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
|
||||||
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
|
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
|
||||||
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
|
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
|
||||||
@ -436,27 +419,15 @@ class PDFExtractor:
|
|||||||
# define the required namespace.
|
# define the required namespace.
|
||||||
continue
|
continue
|
||||||
if elt is not None:
|
if elt is not None:
|
||||||
text = self._xmltreetext(elt).encode('UTF-8')
|
text = self._xmltreetext(elt)
|
||||||
if emf:
|
|
||||||
try:
|
|
||||||
text = emf.metafix(metanm, text)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
# Should we set empty values ?
|
|
||||||
if text:
|
if text:
|
||||||
|
# Should we set empty values ?
|
||||||
# Can't use setfield as it only works for
|
# Can't use setfield as it only works for
|
||||||
# text/plain output at the moment.
|
# text/plain output at the moment.
|
||||||
metaheaders.append((rclnm, text))
|
metaheaders.append((rclnm, text))
|
||||||
if metaheaders:
|
if metaheaders:
|
||||||
if emf:
|
|
||||||
try:
|
|
||||||
emf.wrapup(metaheaders)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
return self._injectmeta(html, metaheaders)
|
return self._injectmeta(html, metaheaders)
|
||||||
else:
|
|
||||||
return html
|
|
||||||
|
|
||||||
def _selfdoc(self):
|
def _selfdoc(self):
|
||||||
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
||||||
self.em.setmimetype('text/html')
|
self.em.setmimetype('text/html')
|
||||||
@ -465,13 +436,13 @@ class PDFExtractor:
|
|||||||
eof = rclexecm.RclExecM.eofnext
|
eof = rclexecm.RclExecM.eofnext
|
||||||
else:
|
else:
|
||||||
eof = rclexecm.RclExecM.noteof
|
eof = rclexecm.RclExecM.noteof
|
||||||
|
|
||||||
html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc",
|
html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc",
|
||||||
"UTF-8", "-eol", "unix", "-q",
|
"UTF-8", "-eol", "unix", "-q",
|
||||||
self.filename, "-"])
|
self.filename, "-"])
|
||||||
|
|
||||||
html, isempty = self._fixhtml(html)
|
html, isempty = self._fixhtml(html)
|
||||||
#self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html))
|
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||||
|
|
||||||
if isempty and self.ocrpossible:
|
if isempty and self.ocrpossible:
|
||||||
html = self.ocrpdf()
|
html = self.ocrpdf()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user