pdf: try to extract annotation text if the python3 poppler-glib binding is available

2020-09-03 16:16:54 +02:00 · 2020-09-03 16:16:54 +02:00 · d62bb9016a
commit d62bb9016a
parent 4df71c4a54
1 changed files with 106 additions and 46 deletions
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -23,22 +23,34 @@
 # If pdftotext produces no text and the configuration allows it, we may try to
 # perform OCR.

-from __future__ import print_function
-
 import os
 import sys
 import re
-import rclexecm
+import urllib.request
 import subprocess
 import tempfile
-import atexit
-import signal
-import rclconfig
 import glob
 import traceback
+import atexit
+import signal
+
+import rclexecm
+import rclconfig

 _mswindows = (sys.platform == "win32")
-    
+
+# Can we access the poppler-glib python3 bindings ? This would allow extracting
+# text from annotations. On Ubuntu, this comes with package gir1.2-poppler-0.18
+# (actual versions may differ of course).
+havepopplerglib = False
+try:
+    import gi
+    gi.require_version('Poppler', '0.18')
+    from gi.repository import Poppler
+    havepopplerglib = True
+except:
+    pass
+
 tmpdir = None

 _htmlprefix =b'''<html><head>
@ -204,23 +216,6 @@ class PDFExtractor:
            # Return true anyway, pdf attachments are no big deal
            return True

-    def extractone(self, ipath):
-        #self.em.rclog("extractone: [%s]" % ipath)
-        if not self.attextractdone:
-            if not self.extractAttach():
-                return (False, "", "", rclexecm.RclExecM.eofnow)
-        path = os.path.join(tmpdir, ipath)
-        if os.path.isfile(path):
-            f = open(path, "rb")
-            docdata = f.read();
-            f.close()
-        if self.currentindex == len(self.attachlist) - 1:
-            eof = rclexecm.RclExecM.eofnext
-        else:
-            eof = rclexecm.RclExecM.noteof
-        return (True, docdata, ipath, eof)
-
-
    # pdftotext (used to?) badly escape text inside the header
    # fields. We do it here. This is not an html parser, and depends a
    # lot on the actual format output by pdftotext.
@ -385,6 +380,72 @@ class PDFExtractor:
        else:
            return html

+    def maybemaketmpdir(self):
+        global tmpdir
+        if tmpdir:
+            if not vacuumdir(tmpdir):
+                self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
+                return False
+        else:
+            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
+            if self.pdftk and re.match("/snap/", self.pdftk):
+                # We know this is Unix (Ubuntu actually). Check that tmpdir
+                # belongs to the user as snap commands can't use /tmp to share
+                # files. Don't generate an error as this only affects
+                # attachment extraction
+                ok = False
+                if "TMPDIR" in os.environ:
+                    st = os.stat(os.environ["TMPDIR"])
+                    if st.st_uid == os.getuid():
+                        ok = True
+                if not ok:
+                    self.em.rclog(
+                        "pdftk is a snap command and needs TMPDIR to be "
+                        "a directory you own")
+
+    def _process_annotations(self, html):
+        doc = Poppler.Document.new_from_file(
+            'file://%s' %
+            urllib.request.pathname2url(os.path.abspath(self.filename)), None)
+        n_pages = doc.get_n_pages()
+        all_annots = 0
+
+        # output format
+        f = 'P.: {0}, {1:10}, {2:10}: {3}'
+
+        # Array of annotations indexed by page number. The page number
+        # here is the physical one (evince -i), not a page label (evince
+        # -p). This may be different for some documents.
+        abypage = {}
+        for i in range(n_pages):
+            page = doc.get_page(i)
+            pnum = i+1
+            annot_mappings = page.get_annot_mapping ()
+            num_annots = len(annot_mappings)
+            for annot_mapping in annot_mappings:
+                atype = annot_mapping.annot.get_annot_type().value_name
+                if atype  != 'POPPLER_ANNOT_LINK':
+                    atext = f.format(
+                        pnum,
+                        annot_mapping.annot.get_modified(),
+                        annot_mapping.annot.get_annot_type().value_nick,
+                        annot_mapping.annot.get_contents()) + "\n"
+                    if pnum in abypage:
+                        abypage[pnum] += atext
+                    else:
+                        abypage[pnum] = atext
+        #self.em.rclog("Annotations: %s" % abypage)
+        pagevec = html.split(b"\f")
+        html = b""
+        pagenum = 1
+        for page in pagevec:
+            html += page
+            if pagenum in abypage:
+                html += abypage[pagenum].encode('utf-8')
+            html += b"\f"
+            pagenum += 1
+        return html
+    
    def _selfdoc(self):
        '''Extract the text from the pdf doc (as opposed to attachment)'''
        self.em.setmimetype('text/html')
@ -421,32 +482,31 @@ class PDFExtractor:
                self.em.rclog("Metadata extraction failed: %s %s" %
                              (err, traceback.format_exc()))

+        if havepopplerglib:
+            try:
+                html = self._process_annotations(html)
+            except Exception as err:
+                self.em.rclog("Annotation extraction failed: %s %s" %
+                              (err, traceback.format_exc()))
        return (True, html, "", eof)


-    def maybemaketmpdir(self):
-        global tmpdir
-        if tmpdir:
-            if not vacuumdir(tmpdir):
-                self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
-                return False
+    def extractone(self, ipath):
+        #self.em.rclog("extractone: [%s]" % ipath)
+        if not self.attextractdone:
+            if not self.extractAttach():
+                return (False, "", "", rclexecm.RclExecM.eofnow)
+        path = os.path.join(tmpdir, ipath)
+        if os.path.isfile(path):
+            f = open(path, "rb")
+            docdata = f.read();
+            f.close()
+        if self.currentindex == len(self.attachlist) - 1:
+            eof = rclexecm.RclExecM.eofnext
        else:
-            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
-            if self.pdftk and re.match("/snap/", self.pdftk):
-                # We know this is Unix (Ubuntu actually). Check that tmpdir
-                # belongs to the user as snap commands can't use /tmp to share
-                # files. Don't generate an error as this only affects
-                # attachment extraction
-                ok = False
-                if "TMPDIR" in os.environ:
-                    st = os.stat(os.environ["TMPDIR"])
-                    if st.st_uid == os.getuid():
-                        ok = True
-                if not ok:
-                    self.em.rclog(
-                        "pdftk is a snap command and needs TMPDIR to be "
-                        "a directory you own")
-                                      
+            eof = rclexecm.RclExecM.noteof
+        return (True, docdata, ipath, eof)
+
        
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):