Add capability to run tesseract from rclpdf. Disabled by default, see comments at the top of rclpdf

2015-04-24 18:13:52 +02:00 · 2015-04-24 18:13:52 +02:00 · eaddefa7c5
commit eaddefa7c5
parent 25cecec484
2 changed files with 181 additions and 20 deletions
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@ -1,19 +1,28 @@
-#!/bin/sh
+#!/bin/bash
 #================================================================
-# Some parts are Copyright Estraier (GPL v2).
+# Copyright (C) 2015 J.F. Dockes
-# Estraier: a personal full-text search system
+# There used to be Estraier content in there, but I quite believe that is not
-# Copyright (C) 2003-2004 Mikio Hirabayashi
+# the case any more.
 # Copyright (C) 2014 J.F. Dockes
 # This file is licensed under the GPL v2
 #================================================================
 #================================================================
 # Convert a pdf file to  HTML.
 #
 # We use pdftotext from the xpdf/poppler-utils package. 
 #
 # pdftotext sometimes outputs unescaped text inside HTML text sections.
 # We try to correct.
-
+#
 # If pdftotext produces no text and tesseract is available, we try to
 # perform OCR. As this can be very slow and the result not always
 # good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
 #
 # We guess the OCR language in order of preference:
 #  - From the content of a ".ocrpdflang" file if it exists in the same
 #    directory as the PDF
 #  - From a RECOLL_TESSERACT_LANG environment variable
 #  - From the content of $RECOLL_CONFDIR/ocrpdf
 #  - Default to "eng"
 #
 # Uncomment the following if you get better results without. The
 # pdftotext manual says that the option is no longer recommended. The
 # difference in output seems mostly the removal of soft-hyphens when
@ -21,8 +30,6 @@
 # optionraw=-raw
 # Switch this script, and through export the commands it starts, to the
 # C locale; then record the filter name and the type of input it handles.
 export LANG=C
 export LC_ALL=C
 progname=rclpdf
 filetype=pdf
@ -95,12 +102,36 @@ umask 77
 checkcmds pdftotext iconv awk
-# Run pdftotext and fix the result (add a charset tag and fix the html escaping
+ocrpossible=0
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
+if iscmd tesseract; then
-# is an awk program
+    if iscmd pdftoppm; then
-pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
+        ocrpossible=1
-iconv -f UTF-8 -t UTF-8 -c -s |
+    fi
-awk 'BEGIN'\
+fi
 # Recoll configuration directory: $RECOLL_CONFDIR when set, else ~/.recoll
 confdir=${RECOLL_CONFDIR:-~/.recoll}
 # OCR is opt-in: without an "ocrpdf" file in that directory it is turned off
 if test ! -f "$confdir/ocrpdf"; then
    ocrpossible=0
 fi
 # Temporary directory name, left empty until a directory gets created
 tmpdir=
 # Remove the temporary work area, if one was created.
 # Globals: tmpdir (read) - directory to remove; empty means nothing to do.
 # Returns: status of the last removal command, 0 when there is nothing to do.
 cleanup()
 {
    # Only the constant "rclpdftmp" subdirectory is removed recursively,
    # which hopefully keeps a bad tmpdir value from causing big damage
    # with -rf. The expansions are quoted so that a path containing
    # blanks or glob characters is removed as one path instead of being
    # split into several unrelated ones.
    if test -n "$tmpdir"; then
        rm -rf -- "$tmpdir/rclpdftmp"
        rmdir -- "$tmpdir"
    fi
 }
 # Run the cleanup when the script exits and on the usual signals.
 trap cleanup EXIT HUP QUIT INT TERM
 runpdftotext()
 {
    # Run pdftotext and fix the result (add a charset tag and fix the
    # html escaping).
    pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
    iconv -f UTF-8 -t UTF-8 -c -s |
    awk 'BEGIN'\
 ' {
  doescape = 0
  cont = ""
@ -179,5 +210,125 @@ awk 'BEGIN'\
  }
  print $0
 }
-' 
+'
 }
 # Without OCR support there is nothing to decide: send the pdftotext
 # result to stdout and finish with the status of that run.
 if [ "$ocrpossible" -eq 0 ]; then
    runpdftotext
    exit
 fi
 # tesseract is installed, prepare for running it.
 # The pdftotext output has to be checked before deciding on OCR, and we
 # do not want to run pdftotext twice, so its output goes to a temporary
 # file. Base directory: RECOLL_TMPDIR, else TMPDIR, else /tmp.
 if test -n "$RECOLL_TMPDIR"; then
    ttdir=$RECOLL_TMPDIR
 elif test -n "$TMPDIR"; then
    ttdir=$TMPDIR
 else
    ttdir=/tmp
 fi
 # The expansions below are quoted so that a base directory containing
 # blanks yields a single directory (and a valid redirection target)
 # instead of several words.
 tmpdir=$ttdir/rclpdf_tmp$$
 mkdir "$tmpdir" || senderror mkdir "$tmpdir" failed
 mkdir "$tmpdir/rclpdftmp" || senderror mkdir "$tmpdir/rclpdftmp" failed
 # Run pdftotext into the temp file
 pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
 runpdftotext > "$pdftxtfile"
 # Decide whether pdftotext produced usable text.
 # If the output is big, or small but not made only of tags and empty
 # lines, it is printed as is and no OCR takes place. Given this check, a
 # file in which the only text content is metadata (pdf description
 # field) will be run through OCR, which is not necessarily what we would
 # want. It would be possible to detect the situation if this proved an
 # issue.
 # The size comes from "wc -c" rather than from a column of "ls -l",
 # which shifts when an owner or group name contains blanks.
 txtsize=$(wc -c < "$pdftxtfile")
 txtempty=0
 # Use grep to check if there is regular text in there. Only do it on
 # small outputs. $txtsize is left unquoted on purpose: some wc
 # implementations pad the count with blanks, which word splitting drops.
 if test $txtsize -lt 5000 ; then
    # "grep -E" is the current spelling of the obsolescent egrep command
    realtext=$(grep -E -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' "$pdftxtfile")
    test -z "$realtext" && txtempty=1
 fi
 if test "$txtempty" -eq 0; then
    # pdftotext produced actual output, use it. No OCR
    cat "$pdftxtfile"
    exit 0
 fi
 # PDF has no text content and tesseract is available. Give it a try
 #
 # Choose the tesseract language and store it in the global tesseractlang.
 # An already non-empty tesseractlang is kept, except that the content of
 # an existing .ocrpdflang file always replaces it. Otherwise the first
 # non-empty source wins:
 #   1. content of a ".ocrpdflang" file in the directory of $infile
 #   2. the RECOLL_TESSERACT_LANG environment variable
 #   3. a guess from the language part of $LANG (en, de and fr only)
 #   4. content of $confdir/ocrpdf
 #   5. "eng"
 # Globals: infile, confdir, LANG, RECOLL_TESSERACT_LANG (read);
 #          pdflangfile, localelang, tesseractlang (written).
 # Returns: always 0.
 guess_tesseract_lang()
 {
    pdflangfile=$(dirname "$infile")/.ocrpdflang
    if test -f "$pdflangfile"; then
        tesseractlang=$(cat "$pdflangfile")
    fi
    # The language should depend on the input file, but there is no
    # general way to determine it. So use the environment and hope for
    # the best.
    if test -z "$tesseractlang"; then
        tesseractlang=${RECOLL_TESSERACT_LANG}
    fi
    if test -z "$tesseractlang"; then
        # Half assed trial to guess from LANG then default to english.
        # NOTE(review): LANG is set to C near the top of this script, so
        # this guess looks unreachable unless that changes - confirm.
        localelang=$(echo "$LANG" | awk -F_ '{print $1}')
        # echo localelang "$localelang" >&2
        case "$localelang" in
        en) tesseractlang=eng;;
        de) tesseractlang=deu;;
        fr) tesseractlang=fra;;
        # Someone will have to add more tesseract language codes here.
        esac
        # Only read the ocrpdf file when the locale gave nothing, so that
        # a successful guess is not overwritten.
        test -z "$tesseractlang" && tesseractlang=$(cat "$confdir/ocrpdf")
        test -z "$tesseractlang" && tesseractlang="eng"
    fi
    return 0
 }
 guess_tesseract_lang
 # echo tesseractlang "$tesseractlang" >&2
 # Render the PDF pages to image files with pdftoppm, then run tesseract
 # on every non-empty one. Each image name is also passed to tesseract as
 # its output base; the resulting "*.txt" files are collected afterwards.
 # File names are quoted throughout so that a temporary directory or
 # input path containing blanks is handled as a single word.
 TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
 # Used as a literal file name prefix: nothing here expands the X's.
 TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
 # split pdf-pages
 ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
 if [ $? -ne 0 ] ; then
    senderror "pdftoppm: $ERR_MSG"
 fi
 for i in "$TMPFILE"* ; do
    if [ -s "$i" ] ; then
        # $tesseractlang stays unquoted on purpose, so that stray blanks
        # in a configured value are dropped instead of reaching tesseract.
        tesseract "$i" "$i" -l $tesseractlang > "$TESSERRORFILE" 2>&1
        TESSERR=$?
        # ignore tesseract start message
        LINECOUNT=$(wc -l < "$TESSERRORFILE")
        # Two tests joined by || replace the obsolescent -o operator.
        # $LINECOUNT is unquoted because some wc implementations pad the
        # count with blanks.
        if [ "$TESSERR" -ne 0 ] || [ $LINECOUNT -gt 1 ] ; then
            echo "tesseract-error $TESSERR page $i in $infile" >&2
            # sort "compacts" leptonica-output
            sort -u "$TESSERRORFILE" >&2
        fi
        # else
            # debugging purpose
            # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
            # echo "no pdftoppm in $infile cp to $SICFILE" >&2
            # cp -a $infile $SICFILE
        # fi
    fi
 done
 # Print the OCR text as a small HTML page, with &, < and > escaped.
 # Nothing at all is printed when the "*.txt" files hold no characters,
 # so that no "empty" HTML file is produced.
 CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
 if test "$CHARS" -gt 0; then
    echo '<HTML><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head><body><pre>'
    cat "$TMPFILE"*.txt |
        awk '{ gsub(/&/, "\\&amp;"); gsub(/</, "\\&lt;"); gsub(/>/, "\\&gt;"); print }'
    echo "</pre></body></html>"
 fi
--- a/website/features.html
+++ b/website/features.html
@ -162,12 +162,22 @@
        <li><span class="application">pdf</span> with the <span class=
        "command">pdftotext</span> command, which comes with 
          <a href="http://poppler.freedesktop.org/">poppler</a>,
-          (the package name is quite
+          (the package name is quite often <tt>poppler-utils</tt>). <br/>
-          often <tt>poppler-utils</tt>). 
+          Note: the older <span class="command">pdftotext</span> command
          <em>Note: the older <span class="command">pdftotext</span> command
            which comes with <span class="application">xpdf</span> is
            not compatible with <span class="application">
-              Recoll</span></em>.</li> 
+              Recoll</span><br/>
          <em>New in 1.21</em>: if the <span class="application">
            tesseract</span> OCR application, and the 
          <span class="command">pdftoppm</span> command are available
          on the system, the <span class="command">rclpdf</span>
          filter has the capability to run OCR. See the comments at
          the top of <span class="command">rclpdf</span> (usually
          found
          in <span class="filename">/usr/share/recoll/filters</span>)
          for how to enable this and configuration details.
        </li> 
        <li><span class="application">msword</span> with <a href=
        "http://www.winfield.demon.nl/">antiword</a>.  It is also useful to