pdf: try to extract annotation text if the python3 poppler-glib binding is available

2020-09-03 16:16:54 +02:00 · 2020-09-03 16:16:54 +02:00 · d62bb9016a
commit d62bb9016a
parent 4df71c4a54
1 changed files with 106 additions and 46 deletions
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -23,22 +23,34 @@
 # If pdftotext produces no text and the configuration allows it, we may try to
 # perform OCR.
 from __future__ import print_function
 import os
 import sys
 import re
-import rclexecm
+import urllib.request
 import subprocess
 import tempfile
 import atexit
 import signal
 import rclconfig
 import glob
 import traceback
 import atexit
 import signal
 import rclexecm
 import rclconfig
 # True when running on MS Windows.
 _mswindows = (sys.platform == "win32")
 # Can we access the poppler-glib python3 bindings ? This would allow extracting
 # text from annotations. On Ubuntu, this comes with package gir1.2-poppler-0.18
 # (actual versions may differ of course).
 havepopplerglib = False
 try:
    import gi
    gi.require_version('Poppler', '0.18')
    from gi.repository import Poppler
    havepopplerglib = True
 except Exception:
    # The binding is optional: any failure to load it just leaves
    # havepopplerglib False. Exception (rather than a bare except) is
    # caught so that SystemExit and KeyboardInterrupt still propagate.
    pass
 # Module-level temporary directory path, None until one is created.
 tmpdir = None
 _htmlprefix =b'''<html><head>
@ -204,23 +216,6 @@ class PDFExtractor:
            # Return true anyway, pdf attachments are no big deal
            return True
    def extractone(self, ipath):
        '''Return the data for the attachment file named by ipath.

        The attachments are extracted first if this was not done yet. The
        file is then read from the module-level tmpdir.

        Returns a (ok, data, ipath, eof) tuple:
         - (False, "", "", eofnow) if the attachment extraction failed.
         - (False, "", ipath, eof) if there is no file for ipath in tmpdir
           (the original code raised UnboundLocalError in this case).
         - (True, data, ipath, eof) otherwise, data being the file contents
           as bytes.
        eof is eofnext when currentindex designates the last entry of
        attachlist, else noteof.
        '''
        #self.em.rclog("extractone: [%s]" % ipath)
        if not self.attextractdone:
            if not self.extractAttach():
                return (False, "", "", rclexecm.RclExecM.eofnow)
        path = os.path.join(tmpdir, ipath)
        docdata = None
        if os.path.isfile(path):
            # The context manager closes the file even if read() fails
            with open(path, "rb") as f:
                docdata = f.read()
        if self.currentindex == len(self.attachlist) - 1:
            eof = rclexecm.RclExecM.eofnext
        else:
            eof = rclexecm.RclExecM.noteof
        if docdata is None:
            # Report the failure for this entry only, keeping the computed
            # eof status so that the caller can decide how to go on.
            self.em.rclog("extractone: no attachment file for [%s]" % path)
            return (False, "", ipath, eof)
        return (True, docdata, ipath, eof)
    # pdftotext (used to?) badly escape text inside the header
    # fields. We do it here. This is not an html parser, and depends a
    # lot on the actual format output by pdftotext.
@ -385,6 +380,72 @@ class PDFExtractor:
        else:
            return html
    def maybemaketmpdir(self):
        '''Make sure that the module-level tmpdir is ready for use.

        If tmpdir is already set, it is emptied with vacuumdir(), and False
        is returned (after logging) if this fails. Else a new directory is
        created with tempfile.mkdtemp(). In this second case, if the pdftk
        path begins with /snap/, a message is logged unless the TMPDIR
        environment variable names a directory owned by the current user.

        Returns False if emptying the existing directory failed, None in
        all other cases.
        '''
        global tmpdir

        if tmpdir:
            # Existing directory: just clean it up.
            if vacuumdir(tmpdir):
                return
            self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
            return False

        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')

        if not (self.pdftk and re.match("/snap/", self.pdftk)):
            return

        # We know this is Unix (Ubuntu actually). Check that tmpdir
        # belongs to the user as snap commands can't use /tmp to share
        # files. Don't generate an error as this only affects
        # attachment extraction
        owned = ("TMPDIR" in os.environ and
                 os.stat(os.environ["TMPDIR"]).st_uid == os.getuid())
        if not owned:
            self.em.rclog(
                "pdftk is a snap command and needs TMPDIR to be "
                "a directory you own")
    def _process_annotations(self, html):
        '''Add the text of the document annotations to the page texts.

        The document named by self.filename is opened with Poppler, and
        one line of text is generated for each annotation which is not a
        link. html (bytes) is then split into pages on form feed
        characters, and the lines for a given page are appended to the
        end of the corresponding page chunk.

        Returns the modified html as bytes. It is identical to the input
        when no annotation text was found. Annotations for page numbers
        beyond the number of form feed separated chunks are dropped.

        Errors from Poppler are not caught here: the caller is expected
        to handle them.
        '''
        # Local import of the function only: the 'html' parameter hides
        # the name of the standard module inside this method.
        from html import escape as html_escape

        doc = Poppler.Document.new_from_file(
            'file://%s' %
            urllib.request.pathname2url(os.path.abspath(self.filename)), None)

        # output format
        fmt = 'P.: {0}, {1:10}, {2:10}: {3}'

        # Lists of annotation lines indexed by page number. The page number
        # here is the physical one (evince -i), not a page label (evince
        # -p). This may be different for some documents.
        abypage = {}
        for i in range(doc.get_n_pages()):
            pnum = i + 1
            for annot_mapping in doc.get_page(i).get_annot_mapping():
                annot = annot_mapping.annot
                atype = annot.get_annot_type()
                if atype.value_name == 'POPPLER_ANNOT_LINK':
                    continue
                # The modification date and the contents may be None when
                # not set in the document. None can't be used with a width
                # format spec (TypeError), and would yield a "None" text
                # for the contents, so use empty strings instead.
                line = fmt.format(
                    pnum,
                    annot.get_modified() or '',
                    atype.value_nick or '',
                    annot.get_contents() or '')
                # The text is inserted into HTML output: escape the markup
                # characters it may contain.
                abypage.setdefault(pnum, []).append(
                    html_escape(line, quote=False) + "\n")
        #self.em.rclog("Annotations: %s" % abypage)

        pagevec = html.split(b"\f")
        for pnum, lines in abypage.items():
            if pnum <= len(pagevec):
                pagevec[pnum - 1] += "".join(lines).encode('utf-8')
        # Joining restores exactly the original separators (no additional
        # form feed at the end of the data).
        return b"\f".join(pagevec)
    def _selfdoc(self):
        '''Extract the text from the pdf doc (as opposed to attachment)'''
        self.em.setmimetype('text/html')
@ -421,31 +482,30 @@ class PDFExtractor:
                self.em.rclog("Metadata extraction failed: %s %s" %
                              (err, traceback.format_exc()))
        if havepopplerglib:
            try:
                html = self._process_annotations(html)
            except Exception as err:
                self.em.rclog("Annotation extraction failed: %s %s" %
                              (err, traceback.format_exc()))
        return (True, html, "", eof)
-    def maybemaketmpdir(self):
+    def extractone(self, ipath):
-        global tmpdir
+        #self.em.rclog("extractone: [%s]" % ipath)
-        if tmpdir:
+        if not self.attextractdone:
-            if not vacuumdir(tmpdir):
+            if not self.extractAttach():
-                self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
+                return (False, "", "", rclexecm.RclExecM.eofnow)
-                return False
+        path = os.path.join(tmpdir, ipath)
        if os.path.isfile(path):
            f = open(path, "rb")
            docdata = f.read();
            f.close()
        if self.currentindex == len(self.attachlist) - 1:
            eof = rclexecm.RclExecM.eofnext
        else:
-            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
+            eof = rclexecm.RclExecM.noteof
-            if self.pdftk and re.match("/snap/", self.pdftk):
+        return (True, docdata, ipath, eof)
                # We know this is Unix (Ubuntu actually). Check that tmpdir
                # belongs to the user as snap commands can't use /tmp to share
                # files. Don't generate an error as this only affects
                # attachment extraction
                ok = False
                if "TMPDIR" in os.environ:
                    st = os.stat(os.environ["TMPDIR"])
                    if st.st_uid == os.getuid():
                        ok = True
                if not ok:
                    self.em.rclog(
                        "pdftk is a snap command and needs TMPDIR to be "
                        "a directory you own")
    ###### File type handler api, used by rclexecm ---------->