diff --git a/src/filters/rclpdf b/src/filters/rclpdf index 7e187958..e44b3093 100755 --- a/src/filters/rclpdf +++ b/src/filters/rclpdf @@ -1,19 +1,28 @@ -#!/bin/sh +#!/bin/bash #================================================================ -# Some parts are Copyright Estraier (GPL v2). -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -# Copyright (C) 2014 J.F. Dockes +# Copyright (C) 2015 J.F. Dockes +# There used to be Estraier content in there, but I quite believe that is not +# the case any more. # This file is licensed under the GPL v2 #================================================================ -#================================================================ # Convert a pdf file to HTML. # # We use pdftotext from the xpdf/poppler-utils package. # # pdftotext sometimes outputs unescaped text inside HTML text sections. # We try to correct. - +# +# If pdftotext produces no text and tesseract is available, we try to +# perform OCR. As this can be very slow and the result not always +# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists +# +# We guess the OCR language in order of preference: +# - From the content of a ".ocrpdflang" file if it exists in the same +# directory as the PDF +# - From an RECOLL_TESSERACT_LANG environment variable +# - From the content of $RECOLL_CONFDIR/ocrpdf +# - Default to "eng" +# # Uncomment the following if you get better results without. 
The # pdftotext manual says that the option is no longer recommended The # difference in output seems mostly the removal of soft-hyphens when @@ -21,8 +30,6 @@ # optionraw=-raw # set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL progname="rclpdf" filetype=pdf @@ -95,12 +102,36 @@ umask 77 checkcmds pdftotext iconv awk -# Run pdftotext and fix the result (add a charset tag and fix the html escaping -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | -iconv -f UTF-8 -t UTF-8 -c -s | -awk 'BEGIN'\ +ocrpossible=0 +if iscmd tesseract; then + if iscmd pdftoppm; then + ocrpossible=1 + fi +fi +confdir=${RECOLL_CONFDIR:-~/.recoll} +test ! -f "$confdir/ocrpdf" && ocrpossible=0 + +tmpdir= + +cleanup() +{ + # Note that we're using a constant part (rclpdftmp), that hopefully + # guarantees that we can't do big mistakes with the -rf here. + if test ! -z "$tmpdir"; then + rm -rf $tmpdir/rclpdftmp + rmdir $tmpdir + fi +} + +trap cleanup EXIT HUP QUIT INT TERM + +runpdftotext() +{ + # Run pdftotext and fix the result (add a charset tag and fix the + # html escaping. + pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | + iconv -f UTF-8 -t UTF-8 -c -s | + awk 'BEGIN'\ ' { doescape = 0 cont = "" @@ -179,5 +210,125 @@ awk 'BEGIN'\ } print $0 } -' +' +} +# If we're not equipped for ocr, just run pdftotext to stdout +if test $ocrpossible -eq 0; then + runpdftotext + exit $? +fi + + +# tesseract is installed, prepare for running it. +# We need to check the pdftotext output, but we don't want to run +# it twice. Use a temporary file. 
+if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi +tmpdir=$ttdir/rclpdf_tmp$$ +mkdir $tmpdir || senderror mkdir $tmpdir failed +mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed + +# Run pdftotext into the temp file +pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile +runpdftotext > $pdftxtfile + +# If text is big, or small but not only tags and empty lines, output +# it. Given the contents check which we perform, a file in which the +# only text content is metadata (pdf description field), will be run +# through OCR, which is not necessarily what we would want. It would +# be possible to detect the situation if this proved an issue. +txtsize=`ls -l $pdftxtfile | awk '{print $5}'` +txtempty=0 +# Use grep to check if there is regular text in there. Only do it on +# small outputs +if test $txtsize -lt 5000 ; then + realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile` + test -z "$realtext" && txtempty=1 +fi + +if test $txtempty -eq 0; then + # pdftotext produced actual output, use it. No OCR + cat $pdftxtfile + exit 0 +fi + +# PDF has no text content and tesseract is available. Give it a try +pdflangfile=`dirname "$infile"`/.ocrpdflang +if test -f "$pdflangfile"; then + tesseractlang=`cat "$pdflangfile"` +fi + +# Try to guess tesseract language. This should depend on the input +# file, but we have no general way to determine it. So use the +# environment and hope for the best. +if test -z "$tesseractlang"; then + tesseractlang=${RECOLL_TESSERACT_LANG} + if test -z "$tesseractlang"; then + # Half assed trial to guess from LANG then default to english + localelang=`echo $LANG | awk -F_ '{print $1}'` + # echo localelang "$localelang" >&2 + case "$localelang" in + en) tesseractlang=eng;; + de) tesseractlang=deu;; + fr) tesseractlang=fra;; + # Someone will have to add more tesseract language codes here. 
+ esac + + test -z "$tessractlang" && tesseractlang=`cat "$confdir/ocrpdf"` + + test -z "$tesseractlang" && tesseractlang="eng" + fi +fi + +# echo tesseractlang "$tesseractlang" >&2 + +TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile" +TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX" + +# split pdf-pages +ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1) +if [ $? -ne 0 ] ; then + senderror "pdftoppm: $ERR_MSG" +fi + +for i in $TMPFILE* ; do + if [ -s "$i" ] ; then + + tesseract $i $i -l $tesseractlang > $TESSERRORFILE 2>&1 + TESSERR=$? + # ignore tesseract start message + LINECOUNT=$(wc -l < $TESSERRORFILE) + if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then + echo "tesseract-error $TESSERR page $i in $infile" >&2 + # sort "compacts" leptonica-output + cat $TESSERRORFILE | sort -u >&2 + fi + # else + # debugging purpose + # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX) + # echo "no pdftoppm in $infile cp to $SICFILE" >&2 + # cp -a $infile $SICFILE + # fi + fi +done + +# don't output "empty" HTML-Files +CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m) +if [ "$CHARS" -gt 0 ] ; then + echo "
" 
+    cat "$TMPFILE"*.txt | \
+        awk '{
+  gsub(/&/, "\\&amp;", $0)
+  gsub(/</, "\\&lt;", $0)
+  gsub(/>/, "\\&gt;", $0)
+  print $0
+}
+'
+    echo "
" +fi \ No newline at end of file diff --git a/website/features.html b/website/features.html index 315c1a2f..f5a3e8ee 100644 --- a/website/features.html +++ b/website/features.html @@ -162,12 +162,22 @@
  • pdf with the pdftotext command, which comes with poppler, - (the package name is quite - often poppler-utils). - Note: the older pdftotext command + (the package name is quite often poppler-utils).
    + Note: the older pdftotext command which comes with xpdf is not compatible with - Recoll
    .
  • + Recoll
    + + New in 1.21: if the + tesseract OCR application, and the + pdftoppm command are available + on the system, the rclpdf + filter has the capability to run OCR. See the comments at + the top of rclpdf (usually + found + in /usr/share/recoll/filters) + for how to enable this and configuration details. +
  • msword with antiword. It is also useful to