Add capability to run tesseract from rclpdf. Disabled by default, see comments at the top of rclpdf

This commit is contained in:
Jean-Francois Dockes 2015-04-24 18:13:52 +02:00
parent 25cecec484
commit eaddefa7c5
2 changed files with 181 additions and 20 deletions

View File

@ -1,19 +1,28 @@
#!/bin/sh
#!/bin/bash
#================================================================
# Some parts are Copyright Estraier (GPL v2).
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
# Copyright (C) 2014 J.F. Dockes
# Copyright (C) 2015 J.F. Dockes
# There used to be Estraier content in there, but I quite believe that is not
# the case any more.
# This file is licensed under the GPL v2
#================================================================
#================================================================
# Convert a pdf file to HTML.
#
# We use pdftotext from the xpdf/poppler-utils package.
#
# pdftotext sometimes outputs unescaped text inside HTML text sections.
# We try to correct.
#
# If pdftotext produces no text and tesseract is available, we try to
# perform OCR. As this can be very slow and the result not always
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
#
# We guess the OCR language in order of preference:
# - From the content of a ".ocrpdflang" file if it exists in the same
# directory as the PDF
# - From a RECOLL_TESSERACT_LANG environment variable
# - From the content of $RECOLL_CONFDIR/ocrpdf
# - Default to "eng"
#
# Uncomment the following if you get better results without. The
# pdftotext manual says that the option is no longer recommended. The
# difference in output seems mostly the removal of soft-hyphens when
@ -21,8 +30,6 @@
# optionraw=-raw
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rclpdf"
filetype=pdf
@ -95,12 +102,36 @@ umask 77
checkcmds pdftotext iconv awk
# Run pdftotext and fix the result (add a charset tag and fix the html escaping
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
# is an awk program
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
iconv -f UTF-8 -t UTF-8 -c -s |
awk 'BEGIN'\
# OCR is only considered when both the tesseract and pdftoppm checks
# succeed, and when the ocrpdf file exists in the configuration directory.
if iscmd tesseract && iscmd pdftoppm; then
    ocrpossible=1
else
    ocrpossible=0
fi
confdir=${RECOLL_CONFDIR:-~/.recoll}
if test ! -f "$confdir/ocrpdf"; then
    ocrpossible=0
fi
# Scratch directory for the OCR path. It stays empty until the directory
# is set up, which makes cleanup() a no-op when no OCR is attempted.
tmpdir=
# Remove the scratch directory, if one was set up.
# Globals: tmpdir (read)
# Returns: 0 when tmpdir is empty, else the status of the final rmdir.
cleanup()
{
    test -n "$tmpdir" || return 0
    # Note that we're using a constant part (rclpdftmp), that hopefully
    # guarantees that we can't do big mistakes with the -rf here.
    # The expansions are quoted so that a path containing whitespace or
    # glob characters is removed as one path instead of being split into
    # several unrelated ones.
    rm -rf -- "$tmpdir/rclpdftmp"
    rmdir -- "$tmpdir"
}
trap cleanup EXIT HUP QUIT INT TERM
runpdftotext()
{
# Run pdftotext and fix the result (add a charset tag and fix the
# html escaping).
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
iconv -f UTF-8 -t UTF-8 -c -s |
awk 'BEGIN'\
' {
doescape = 0
cont = ""
@ -179,5 +210,125 @@ awk 'BEGIN'\
}
print $0
}
'
'
}
# No OCR support: stream the pdftotext conversion straight to stdout and
# finish with the status that runpdftotext returned.
if [ "$ocrpossible" -eq 0 ]; then
    runpdftotext
    exit
fi
# tesseract is installed, prepare for running it.
# We need to check the pdftotext output, but we don't want to run
# it twice. Use a temporary file.
# Scratch location: RECOLL_TMPDIR if set, else TMPDIR, else /tmp.
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
tmpdir=$ttdir/rclpdf_tmp$$
# The paths are quoted: the temporary directory comes from the environment
# and may contain whitespace or glob characters.
mkdir "$tmpdir" || senderror mkdir "$tmpdir" failed
mkdir "$tmpdir/rclpdftmp" || senderror mkdir "$tmpdir/rclpdftmp" failed
# Run pdftotext into the temp file
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
runpdftotext > "$pdftxtfile"
# If text is big, or small but not only tags and empty lines, output
# it. Given the contents check which we perform, a file in which the
# only text content is metadata (pdf description field), will be run
# through OCR, which is not necessarily what we would want. It would
# be possible to detect the situation if this proved an issue.
# The size is taken from wc instead of parsing ls output; tr removes the
# padding that some wc implementations put around the number.
txtsize=$(wc -c < "$pdftxtfile" | tr -d '[:space:]')
txtempty=0
# Use grep to check if there is regular text in there. Only do it on
# small outputs
if [ "$txtsize" -lt 5000 ]; then
    # grep -v -q succeeds as soon as one line is neither blank nor made
    # only of a tag; when there is no such line the output counts as empty.
    if ! grep -Evq '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' "$pdftxtfile"; then
        txtempty=1
    fi
fi
if [ "$txtempty" -eq 0 ]; then
    # pdftotext produced actual output, use it. No OCR
    cat "$pdftxtfile"
    exit 0
fi
# PDF has no text content and tesseract is available. Give it a try
#
# guesstesseractlang: choose the language code handed to tesseract.
# The language should depend on the input file, but we have no general
# way to determine it, so we use the following sources and hope for the
# best. The first non-empty one wins:
#  - the content of the .ocrpdflang file in the directory holding $infile
#  - the RECOLL_TESSERACT_LANG environment variable
#  - the content of $confdir/ocrpdf
#  - a guess from the language part of $LANG (en, de and fr only)
#  - "eng"
# The configuration file is looked at before the locale guess, so that an
# explicit setting is never overridden by a guess, and so that an empty
# ocrpdf file (used only as the enabling marker) does not erase the guess.
# Globals read: infile, confdir, RECOLL_TESSERACT_LANG, LANG
# Globals set:  tesseractlang, pdflangfile, localelang (guess branch only)
# Returns: 0
guesstesseractlang()
{
    # Start from a known state instead of whatever the environment holds
    tesseractlang=
    pdflangfile=$(dirname "$infile")/.ocrpdflang
    if test -f "$pdflangfile"; then
        tesseractlang=$(cat "$pdflangfile")
    fi
    if test -z "$tesseractlang"; then
        tesseractlang=${RECOLL_TESSERACT_LANG}
    fi
    if test -z "$tesseractlang" && test -f "$confdir/ocrpdf"; then
        tesseractlang=$(cat "$confdir/ocrpdf")
    fi
    if test -z "$tesseractlang"; then
        # Half assed trial to guess from LANG then default to english
        localelang=${LANG%%_*}
        # echo localelang "$localelang" >&2
        case "$localelang" in
            en) tesseractlang=eng;;
            de) tesseractlang=deu;;
            fr) tesseractlang=fra;;
            # Someone will have to add more tesseract language codes here.
        esac
    fi
    if test -z "$tesseractlang"; then
        tesseractlang="eng"
    fi
    return 0
}
guesstesseractlang
# echo tesseractlang "$tesseractlang" >&2
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
# split pdf-pages: pdftoppm writes its output files using $TMPFILE as
# the name prefix. Its messages are captured for the error report.
if ! ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1); then
    senderror "pdftoppm: $ERR_MSG"
fi
# Run tesseract on every non-empty file carrying the prefix. The file name
# is used as both input and output base. All paths are quoted so that a
# temporary directory containing whitespace does not get split.
for i in "$TMPFILE"* ; do
    if [ -s "$i" ] ; then
        # $tesseractlang is left unquoted on purpose: it may come from a
        # hand-edited file, and stray blanks around the code are dropped
        # by word splitting.
        tesseract "$i" "$i" -l $tesseractlang > "$TESSERRORFILE" 2>&1
        TESSERR=$?
        # ignore tesseract start message
        LINECOUNT=$(wc -l < "$TESSERRORFILE" | tr -d '[:space:]')
        if [ "$TESSERR" -ne 0 ] || [ "$LINECOUNT" -gt 1 ] ; then
            echo "tesseract-error $TESSERR page $i in $infile" >&2
            # sort "compacts" leptonica-output
            sort -u "$TESSERRORFILE" >&2
        fi
        # else
        # debugging purpose
        # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
        # echo "no pdftoppm in $infile cp to $SICFILE" >&2
        # cp -a $infile $SICFILE
        # fi
    fi
done
# Wrap the collected OCR text in a minimal HTML document. Nothing at all
# is printed when the *.txt files hold no characters, so that we don't
# output "empty" HTML files.
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
if [ "$CHARS" -gt 0 ] ; then
    printf '%s\n' '<HTML><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head><body><pre>'
    # Escape the characters which are special in HTML text
    cat "$TMPFILE"*.txt |
        awk '{
            gsub(/&/, "\\&amp;")
            gsub(/</, "\\&lt;")
            gsub(/>/, "\\&gt;")
            print
        }'
    printf '%s\n' '</pre></body></html>'
fi

View File

@ -162,12 +162,22 @@
<li><span class="application">pdf</span> with the <span class=
"command">pdftotext</span> command, which comes with
<a href="http://poppler.freedesktop.org/">poppler</a>,
(the package name is quite
often <tt>poppler-utils</tt>).
<em>Note: the older <span class="command">pdftotext</span> command
(the package name is quite often <tt>poppler-utils</tt>). <br/>
Note: the older <span class="command">pdftotext</span> command
which comes with <span class="application">xpdf</span> is
not compatible with <span class="application">
Recoll</span></em>.</li>
Recoll</span>.<br/>
<em>New in 1.21</em>: if the <span class="application">
tesseract</span> OCR application, and the
<span class="command">pdftoppm</span> command are available
on the system, the <span class="command">rclpdf</span>
filter has the capability to run OCR. See the comments at
the top of <span class="command">rclpdf</span> (usually
found
in <span class="filename">/usr/share/recoll/filters</span>)
for how to enable this and configuration details.
</li>
<li><span class="application">msword</span> with <a href=
"http://www.winfield.demon.nl/">antiword</a>. It is also useful to