pdf: try to extract annotation text if the python3 poppler-glib binding is available

This commit is contained in:
Jean-Francois Dockes 2020-09-03 16:16:54 +02:00
parent 4df71c4a54
commit d62bb9016a

View File

@ -23,22 +23,34 @@
# If pdftotext produces no text and the configuration allows it, we may try to
# perform OCR.
from __future__ import print_function
import os
import sys
import re
import rclexecm
import urllib.request
import subprocess
import tempfile
import atexit
import signal
import rclconfig
import glob
import traceback
import atexit
import signal
import rclexecm
import rclconfig
_mswindows = (sys.platform == "win32")
# Can we access the poppler-glib python3 bindings ? This would allow extracting
# text from annotations. On Ubuntu, this comes with package gir1.2-poppler-0.18
# (actual versions may differ of course).
havepopplerglib = False
try:
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler
havepopplerglib = True
except:
pass
tmpdir = None
_htmlprefix =b'''<html><head>
@ -204,23 +216,6 @@ class PDFExtractor:
# Return true anyway, pdf attachments are no big deal
return True
def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath)
if not self.attextractdone:
if not self.extractAttach():
return (False, "", "", rclexecm.RclExecM.eofnow)
path = os.path.join(tmpdir, ipath)
if os.path.isfile(path):
f = open(path, "rb")
docdata = f.read();
f.close()
if self.currentindex == len(self.attachlist) - 1:
eof = rclexecm.RclExecM.eofnext
else:
eof = rclexecm.RclExecM.noteof
return (True, docdata, ipath, eof)
# pdftotext (used to?) badly escape text inside the header
# fields. We do it here. This is not an html parser, and depends a
# lot on the actual format output by pdftotext.
@ -385,6 +380,72 @@ class PDFExtractor:
else:
return html
def maybemaketmpdir(self):
global tmpdir
if tmpdir:
if not vacuumdir(tmpdir):
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
return False
else:
tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
if self.pdftk and re.match("/snap/", self.pdftk):
# We know this is Unix (Ubuntu actually). Check that tmpdir
# belongs to the user as snap commands can't use /tmp to share
# files. Don't generate an error as this only affects
# attachment extraction
ok = False
if "TMPDIR" in os.environ:
st = os.stat(os.environ["TMPDIR"])
if st.st_uid == os.getuid():
ok = True
if not ok:
self.em.rclog(
"pdftk is a snap command and needs TMPDIR to be "
"a directory you own")
def _process_annotations(self, html):
doc = Poppler.Document.new_from_file(
'file://%s' %
urllib.request.pathname2url(os.path.abspath(self.filename)), None)
n_pages = doc.get_n_pages()
all_annots = 0
# output format
f = 'P.: {0}, {1:10}, {2:10}: {3}'
# Array of annotations indexed by page number. The page number
# here is the physical one (evince -i), not a page label (evince
# -p). This may be different for some documents.
abypage = {}
for i in range(n_pages):
page = doc.get_page(i)
pnum = i+1
annot_mappings = page.get_annot_mapping ()
num_annots = len(annot_mappings)
for annot_mapping in annot_mappings:
atype = annot_mapping.annot.get_annot_type().value_name
if atype != 'POPPLER_ANNOT_LINK':
atext = f.format(
pnum,
annot_mapping.annot.get_modified(),
annot_mapping.annot.get_annot_type().value_nick,
annot_mapping.annot.get_contents()) + "\n"
if pnum in abypage:
abypage[pnum] += atext
else:
abypage[pnum] = atext
#self.em.rclog("Annotations: %s" % abypage)
pagevec = html.split(b"\f")
html = b""
pagenum = 1
for page in pagevec:
html += page
if pagenum in abypage:
html += abypage[pagenum].encode('utf-8')
html += b"\f"
pagenum += 1
return html
def _selfdoc(self):
'''Extract the text from the pdf doc (as opposed to attachment)'''
self.em.setmimetype('text/html')
@ -421,32 +482,31 @@ class PDFExtractor:
self.em.rclog("Metadata extraction failed: %s %s" %
(err, traceback.format_exc()))
if havepopplerglib:
try:
html = self._process_annotations(html)
except Exception as err:
self.em.rclog("Annotation extraction failed: %s %s" %
(err, traceback.format_exc()))
return (True, html, "", eof)
def maybemaketmpdir(self):
global tmpdir
if tmpdir:
if not vacuumdir(tmpdir):
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
return False
def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath)
if not self.attextractdone:
if not self.extractAttach():
return (False, "", "", rclexecm.RclExecM.eofnow)
path = os.path.join(tmpdir, ipath)
if os.path.isfile(path):
f = open(path, "rb")
docdata = f.read();
f.close()
if self.currentindex == len(self.attachlist) - 1:
eof = rclexecm.RclExecM.eofnext
else:
tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
if self.pdftk and re.match("/snap/", self.pdftk):
# We know this is Unix (Ubuntu actually). Check that tmpdir
# belongs to the user as snap commands can't use /tmp to share
# files. Don't generate an error as this only affects
# attachment extraction
ok = False
if "TMPDIR" in os.environ:
st = os.stat(os.environ["TMPDIR"])
if st.st_uid == os.getuid():
ok = True
if not ok:
self.em.rclog(
"pdftk is a snap command and needs TMPDIR to be "
"a directory you own")
eof = rclexecm.RclExecM.noteof
return (True, docdata, ipath, eof)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):