pdf: ocr: small fixes, plus make pdfocr redefinable in subdirs

2019-06-13 09:47:25 +02:00 · 2019-06-13 09:47:25 +02:00 · 5ff1a92a51
commit 5ff1a92a51
parent 1991e132a7
9 changed files with 45 additions and 19 deletions
--- a/packaging/debian/debian/changelog
+++ b/packaging/debian/debian/changelog
@ -1,3 +1,9 @@
+recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Fix issues in pdf handler
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Thu, 13 Jun 2019 08:41:00 +0200
+
 recoll (1.25.18-1~ppaPPAVERS~SERIES1) SERIES; urgency=low

  * GUI: fixed webengine result list for newer qt versions (5.12 ok)
--- a/packaging/debian/debiankio/changelog
+++ b/packaging/debian/debiankio/changelog
@ -1,3 +1,9 @@
+kio-recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * keep kio in sync
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Thu, 13 Jun 2019 08:41:00 +0200
+
 kio-recoll (1.25.17-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
                
  * New release 1.25.17
--- a/src/doc/user/recoll.conf.xml
+++ b/src/doc/user/recoll.conf.xml
@ -686,8 +686,8 @@ with possibly meaning-altering missing words.</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">
 <term><varname>pdfocr</varname></term>
 <listitem><para>Attempt OCR of PDF files with no text content if both tesseract and
-pdftoppm are installed. The default is off because OCR is so
-very slow.</para></listitem></varlistentry>
+pdftoppm are installed. This can be defined in subdirectories. The default is off because
+OCR is so very slow.</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCRLANG">
 <term><varname>pdfocrlang</varname></term>
 <listitem><para>Language to assume for PDF OCR. This is very important for having a reasonable rate of errors
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@ -9551,7 +9551,8 @@ for i in range(nres):
                "RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR"></a><span class="term"><code class="varname">pdfocr</code></span></dt>
                <dd>
                  <p>Attempt OCR of PDF files with no text content
-                  if both tesseract and pdftoppm are installed. The
+                  if both tesseract and pdftoppm are installed.
+                  This can be defined in subdirectories. The
                  default is off because OCR is so very slow.</p>
                </dd>
                <dt><a name=
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -22,12 +22,13 @@
 #
 # If pdftotext produces no text and tesseract is available, we try to
 # perform OCR. As this can be very slow and the result not always
-# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
+# good, we only do this if it is enabled by the configuration
 #
 # We guess the OCR language in order of preference:
 #  - From the content of a ".ocrpdflang" file if it exists in the same
 #    directory as the PDF
-#  - From an RECOLL_TESSERACT_LANG environment variable
+#  - Else from the pdfocrlang variable in recoll.conf
+#  - Else from a RECOLL_TESSERACT_LANG environment variable
 #  - From the content of $RECOLL_CONFDIR/ocrpdf
 #  - Default to "eng"

@ -119,14 +120,12 @@ class PDFExtractor:
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
        self.ocrpossible = False
-        cf_doocr = self.config.getConfParam("pdfocr")
-        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
-            self.tesseract = rclexecm.which("tesseract")
-            if self.tesseract:
-                self.pdftoppm = rclexecm.which("pdftoppm")
-                if self.pdftoppm:
-                    self.ocrpossible = True
-                    self.maybemaketmpdir()
+        self.tesseract = rclexecm.which("tesseract")
+        if self.tesseract:
+            self.pdftoppm = rclexecm.which("pdftoppm")
+            if self.pdftoppm:
+                self.ocrpossible = True
+                self.maybemaketmpdir()
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

        # Pdftk is optionally used to extract attachments. This takes
@ -288,6 +287,7 @@ class PDFExtractor:

        files = glob.glob(tmpfile + "*")
        for f in files:
+            out = b''
            try:
                out = subprocess.check_output([self.tesseract, f, f, "-l",
                                               tesseractlang],
@ -305,8 +305,6 @@ class PDFExtractor:
        for f in files:
            data += open(f, "rb").read()

-        if not data:
-            return b""
        return b'''<html><head>
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
        </head><body><pre>''' + \
@ -469,7 +467,10 @@ class PDFExtractor:
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

        if isempty and self.ocrpossible:
-            html = self.ocrpdf()
+            self.config.setKeyDir(os.path.dirname(self.filename))
+            cf_doocr = self.config.getConfParam("pdfocr")
+            if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
+                html = self.ocrpdf()

        if self.extrameta:
            try:
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@ -858,8 +858,9 @@ snippetMaxPosWalk = 1000000
 # <var name="pdfocr" type="bool">
 #
 # <brief>Attempt OCR of PDF files with no text content if both tesseract and
-# pdftoppm are installed.</brief><descr>The default is off because OCR is so
-# very slow.</descr></var>
+# pdftoppm are installed.</brief>
+# <descr>This can be defined in subdirectories. The default is off because
+# OCR is so very slow.</descr></var>
 #pdfocr = 0

 # <var name="pdfocrlang" type="string">
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@ -37,6 +37,9 @@ daemSkippedPaths =  \
    /home/dockes/projets/fulltext/testrecoll/config

 pdfextrameta = pdf:Producer dc:identifier
+[/home/dockes/projets/fulltext/testrecoll/pdf]
+pdfocr = 1
+pdfocrlang = eng

 unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE ﬁfi ﬂfl

--- a/tests/pdf/pdf.sh
+++ b/tests/pdf/pdf.sh
@ -12,6 +12,8 @@ initvariables $0
    # url is prefixed for the dc:identifier search to work
    recollq dc:identifier:10.12345/sampledoi
    recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
+    recollq '"bubbleupnp server to simulate openhome"'
+    
 )  2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/pdf/pdf.txt
+++ b/tests/pdf/pdf.txt
@ -1,3 +1,9 @@
-2 results 
+2 results
 application/pdf	[file:///home/dockes/projets/fulltext/testrecoll/pdf/ThinkingInPostScript.pdf]	[Thinking In PostScript]	846435	bytes	
 application/pdf	[file:///home/dockes/projets/fulltext/testrecoll/pdf/linux unicode.pdf]	[developerWorks: Linux | Unicode : Linux Unicode programming]	72669	bytes	
+1 results
+application/pdf	[10.12345/sampledoi]	[How to edit a recoll python script in order to extract data]	6426	bytes	
+1 results
+application/pdf	[10.12345/sampledoi]	[How to edit a recoll python script in order to extract data]	6426	bytes	
+1 results
+application/pdf	[file:///home/dockes/projets/fulltext/testrecoll/pdf/pdf-tesseract.pdf]	[pdf-tesseract.pdf]	226331	bytes