diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 692f1778..dddc38dd 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -443,13 +443,17 @@ class PDFExtractor: #self.em.rclog("Annotations: %s" % abypage) pagevec = html.split(b"\f") html = b"" + annotsfield = b"" pagenum = 1 for page in pagevec: html += page if pagenum in abypage: html += abypage[pagenum].encode('utf-8') + annotsfield += abypage[pagenum].encode('utf-8') + b" - " html += b"\f" pagenum += 1 + if annotsfield: + self.em.setfield("pdfannot", annotsfield) return html def _selfdoc(self): diff --git a/src/sampleconf/fields b/src/sampleconf/fields index b537f113..ba9fb6d0 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -71,7 +71,7 @@ rclUnsplitFN = XSFS xapyear = Y recipient = XTO rclbes = XB ; noterms = 1 - +annotation = XA [values] ########### @@ -106,6 +106,7 @@ keywords= rclaptg= rclbes= recipient= +annotation= [aliases] ########################## @@ -132,6 +133,7 @@ keywords = keyword xesam:keyword tag tags dc:subject xesam:subject \ mtype = mime mimetype xesam:mimetype contenttype xesam:contenttype dc:format recipient = to xesam:recipient url = dc:identifier xesam:url +annotation = pdfannot ################## # The queryaliases section defines aliases which are used exclusively at @@ -140,6 +142,7 @@ url = dc:identifier xesam:url [queryaliases] filename = fn containerfilename = cfn +annotation = annot pa [xattrtofields] ###################### diff --git a/tests/pdf-annots/pdf-annots.sh b/tests/pdf-annots/pdf-annots.sh new file mode 100755 index 00000000..83daa9af --- /dev/null +++ b/tests/pdf-annots/pdf-annots.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +topdir=`dirname $0`/.. +. $topdir/shared.sh + +initvariables $0 + +( + recollq '"new test JF annotation using Adobe Acrobat X"' + + # This supposes that the fields file is customized, which is not + # the case by default + echo + echo "Extracting the value for an annotation field:" + recollq -F annotation pdfannot:'"DAVID: Test of a highlight"' | \ + tail -1 | base64 -d + +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout + +diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 + +checkresult diff --git a/tests/pdf-annots/pdf-annots.txt b/tests/pdf-annots/pdf-annots.txt new file mode 100644 index 00000000..0e5dad9d --- /dev/null +++ b/tests/pdf-annots/pdf-annots.txt @@ -0,0 +1,5 @@ +1 results +application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf-annots/Présidentielle en Biélorussie .pdf] [Présidentielle en Biélorussie : la candidate de l’opposition, Svetlana Tikhanovskaïa, s’est réfugiée en Lituanie] 325430 bytes + +Extracting the value for an annotation field: +P.: 3, D:20200904094331+00'240', highlight : None P.: 3, D:20200904100158+00'240', highlight : DAVID: Test of a highlight that has a note attached to