Add capability to run tesseract from rclpdf. Disabled by default, see comments at the top of rclpdf

2015-04-24 18:13:52 +02:00 · 2015-04-24 18:13:52 +02:00 · eaddefa7c5
commit eaddefa7c5
parent 25cecec484
2 changed files with 181 additions and 20 deletions
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@ -1,19 +1,28 @@
-#!/bin/sh
+#!/bin/bash
 #================================================================
-# Some parts are Copyright Estraier (GPL v2).
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-# Copyright (C) 2014 J.F. Dockes
+# Copyright (C) 2015 J.F. Dockes
+# There used to be Estraier content in there, but I quite believe that is not
+# the case any more.
 # This file is licensed under the GPL v2
 #================================================================
-#================================================================
 # Convert a pdf file to  HTML.
 #
 # We use pdftotext from the xpdf/poppler-utils package. 
 #
 # pdftotext sometimes outputs unescaped text inside HTML text sections.
 # We try to correct.
-
+#
+# If pdftotext produces no text and tesseract is available, we try to
+# perform OCR. As this can be very slow and the result not always
+# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
+#
+# We guess the OCR language in order of preference:
+#  - From the content of a ".ocrpdflang" file if it exists in the same
+#    directory as the PDF
+#  - From a RECOLL_TESSERACT_LANG environment variable
+#  - From the content of $RECOLL_CONFDIR/ocrpdf
+#  - Default to "eng"
+#
 # Uncomment the following if you get better results without. The
 # pdftotext manual says that the option is no longer recommended The
 # difference in output seems mostly the removal of soft-hyphens when
@ -21,8 +30,6 @@
 # optionraw=-raw

 # set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
 progname="rclpdf"
 filetype=pdf

@ -95,12 +102,36 @@ umask 77

 checkcmds pdftotext iconv awk

-# Run pdftotext and fix the result (add a charset tag and fix the html escaping
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
-iconv -f UTF-8 -t UTF-8 -c -s |
-awk 'BEGIN'\
+ocrpossible=0
+if iscmd tesseract; then
+    if iscmd pdftoppm; then
+        ocrpossible=1
+    fi
+fi
+confdir=${RECOLL_CONFDIR:-~/.recoll}
+test ! -f "$confdir/ocrpdf" && ocrpossible=0
+
+tmpdir=
+
+cleanup()
+{
+    # Note that we're using a constant part (rclpdftmp), that hopefully
+    # guarantees that we can't do big mistakes with the -rf here.
+    if test ! -z "$tmpdir"; then
+        rm -rf $tmpdir/rclpdftmp
+        rmdir $tmpdir
+    fi
+}
+    
+trap cleanup EXIT HUP QUIT INT TERM
+
+runpdftotext()
+{
+    # Run pdftotext and fix the result (add a charset tag and fix the
+    # html escaping).
+    pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
+    iconv -f UTF-8 -t UTF-8 -c -s |
+    awk 'BEGIN'\
 ' {
  doescape = 0
  cont = ""
@ -179,5 +210,125 @@ awk 'BEGIN'\
  }
  print $0
 }
-' 
+'
+}

+# If we're not equipped for ocr, just run pdftotext to stdout
+if test $ocrpossible -eq 0; then
+    runpdftotext
+    exit $?
+fi
+
+
+# tesseract is installed, prepare for running it.
+# We need to check the pdftotext output, but we don't want to run
+# it twice. Use a temporary file.
+# Base directory: RECOLL_TMPDIR if set and not empty, else TMPDIR,
+# else /tmp.
+ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
+# tmpdir is what the cleanup trap removes, so it is only assigned once
+# the directory has been created here: if mkdir fails because the path
+# already exists, cleanup must not touch it.
+newtmpdir="$ttdir/rclpdf_tmp$$"
+mkdir "$newtmpdir" || senderror mkdir "$newtmpdir" failed
+tmpdir=$newtmpdir
+mkdir "$tmpdir/rclpdftmp" || senderror mkdir "$tmpdir/rclpdftmp" failed
+
+# Run pdftotext into the temp file. The target is quoted: an unquoted
+# redirection to a path containing blanks is rejected by bash.
+pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
+runpdftotext > "$pdftxtfile"
+
+# If text is big, or small but not only tags and empty lines, output
+# it. Given the contents check which we perform, a file in which the
+# only text content is metadata (pdf description field), will be run
+# through OCR, which is not necessarily what we would want. It would
+# be possible to detect the situation if this proved an issue.
+# The size comes from wc rather than from parsing ls -l output, whose
+# column layout is not dependable.
+txtsize=$(wc -c < "$pdftxtfile")
+txtempty=0
+# Only look for regular text in small outputs. grep -v selects the
+# lines which are neither blank nor of the form "<...>" (starting
+# with < and ending with >, blanks aside); if it selects nothing,
+# there is no regular text. grep -E replaces the obsolescent egrep.
+if test "$txtsize" -lt 5000 ; then
+    if ! grep -Eqv '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' "$pdftxtfile"
+    then
+        txtempty=1
+    fi
+fi
+
+if test $txtempty -eq 0; then
+    # pdftotext produced actual output, use it. No OCR
+    cat "$pdftxtfile"
+    exit 0
+fi
+
+# PDF has no text content and tesseract is available. Give it a try
+pdflangfile=`dirname "$infile"`/.ocrpdflang
+if test -f "$pdflangfile"; then
+    tesseractlang=`cat "$pdflangfile"`
+fi
+
+# Choose the tesseract language when none has been set yet. The right
+# answer depends on the input file, but we have no general way to
+# determine it, so use configuration and environment, most explicit
+# source first:
+#  - the RECOLL_TESSERACT_LANG environment variable
+#  - the content of $confdir/ocrpdf
+#  - a guess from the locale in LANG
+#  - "eng"
+if test -z "$tesseractlang"; then
+    tesseractlang=${RECOLL_TESSERACT_LANG}
+    if test -z "$tesseractlang"; then
+        # Explicit configuration outranks the locale heuristic below
+        tesseractlang=`cat "$confdir/ocrpdf"`
+    fi
+    if test -z "$tesseractlang"; then
+        # Half assed trial to guess from LANG: keep what precedes the
+        # first "_" (e.g. "fr" for "fr_FR.UTF-8")
+        localelang=${LANG%%_*}
+        # echo localelang "$localelang" >&2
+        case "$localelang" in
+        en) tesseractlang=eng;;
+        de) tesseractlang=deu;;
+        fr) tesseractlang=fra;;
+        # Someone will have to add more tesseract language codes here.
+        esac
+    fi
+    # Nothing configured and no usable locale: default to english
+    test -z "$tesseractlang" && tesseractlang="eng"
+fi
+
+# echo tesseractlang "$tesseractlang" >&2
+
+TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
+TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
+
+# split pdf-pages
+ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
+if [ $? -ne 0 ] ; then
+    senderror "pdftoppm: $ERR_MSG"
+fi
+
+# Run tesseract on every non-empty file matching "$TMPFILE"*, using the
+# file name as output base too. Failures are reported on stderr and do
+# not stop the loop. Paths are quoted so that a temporary directory
+# whose name contains blanks still works.
+for i in "$TMPFILE"* ; do
+    # -s is also false for the unexpanded pattern when nothing matched
+    if [ -s "$i" ] ; then
+
+        # $tesseractlang is left unquoted so that stray blanks around
+        # a configured value are dropped.
+        tesseract "$i" "$i" -l $tesseractlang > "$TESSERRORFILE" 2>&1
+        TESSERR=$?
+        # ignore tesseract start message
+        LINECOUNT=$(wc -l < "$TESSERRORFILE")
+        if [ "$TESSERR" -ne 0 ] || [ "$LINECOUNT" -gt 1 ] ; then
+            echo "tesseract-error $TESSERR page $i in $infile" >&2
+            # sort "compacts" leptonica-output
+            sort -u "$TESSERRORFILE" >&2
+        fi
+        # else
+            # debugging purpose
+            # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
+            # echo "no pdftoppm in $infile cp to $SICFILE" >&2
+            # cp -a $infile $SICFILE
+        # fi
+    fi
+done
+
+# don't output "empty" HTML-Files
+CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m) 
+if [ "$CHARS" -gt 0 ] ; then
+    echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>" 
+    cat "$TMPFILE"*.txt | \
+        awk '{
+  gsub(/&/, "\\&amp;", $0)
+  gsub(/</, "\\&lt;", $0)
+  gsub(/>/, "\\&gt;", $0)
+  print $0
+}
+'
+    echo "</pre></body></html>"
+fi
--- a/website/features.html
+++ b/website/features.html
@ -162,12 +162,22 @@
        <li><span class="application">pdf</span> with the <span class=
        "command">pdftotext</span> command, which comes with 
          <a href="http://poppler.freedesktop.org/">poppler</a>,
-          (the package name is quite
-          often <tt>poppler-utils</tt>). 
-          <em>Note: the older <span class="command">pdftotext</span> command
+          (the package name is quite often <tt>poppler-utils</tt>). <br/>
+          Note: the older <span class="command">pdftotext</span> command
            which comes with <span class="application">xpdf</span> is
            not compatible with <span class="application">
-              Recoll</span></em>.</li> 
+              Recoll</span><br/>
+
+          <em>New in 1.21</em>: if the <span class="application">
+            tesseract</span> OCR application, and the 
+          <span class="command">pdftoppm</span> command are available
+          on the system, the <span class="command">rclpdf</span>
+          filter has the capability to run OCR. See the comments at
+          the top of <span class="command">rclpdf</span> (usually
+          found
+          in <span class="filename">/usr/share/recoll/filters</span>)
+          for how to enable this and configuration details.
+        </li> 

        <li><span class="application">msword</span> with <a href=
        "http://www.winfield.demon.nl/">antiword</a>.  It is also useful to