Add capability to run tesseract from rclpdf. Disabled by default, see comments at the top of rclpdf
This commit is contained in:
parent
25cecec484
commit
eaddefa7c5
@ -1,19 +1,28 @@
|
||||
#!/bin/sh
|
||||
#!/bin/bash
|
||||
#================================================================
|
||||
# Some parts are Copyright Estraier (GPL v2).
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
# Copyright (C) 2014 J.F. Dockes
|
||||
# Copyright (C) 2015 J.F. Dockes
|
||||
# There used to be Estraier content in there, but I quite believe that is not
|
||||
# the case any more.
|
||||
# This file is licensed under the GPL v2
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Convert a pdf file to HTML.
|
||||
#
|
||||
# We use pdftotext from the xpdf/poppler-utils package.
|
||||
#
|
||||
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
||||
# We try to correct.
|
||||
|
||||
#
|
||||
# If pdftotext produces no text and tesseract is available, we try to
|
||||
# perform OCR. As this can be very slow and the result not always
|
||||
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
|
||||
#
|
||||
# We guess the OCR language in order of preference:
|
||||
# - From the content of a ".ocrpdflang" file if it exists in the same
|
||||
# directory as the PDF
|
||||
# - From the RECOLL_TESSERACT_LANG environment variable
|
||||
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
||||
# - Default to "eng"
|
||||
#
|
||||
# Uncomment the following if you get better results without. The
|
||||
# pdftotext manual says that the option is no longer recommended. The
|
||||
# difference in output seems mostly the removal of soft-hyphens when
|
||||
@ -21,8 +30,6 @@
|
||||
# optionraw=-raw
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclpdf"
|
||||
filetype=pdf
|
||||
|
||||
@ -95,12 +102,36 @@ umask 77
|
||||
|
||||
checkcmds pdftotext iconv awk
|
||||
|
||||
# Run pdftotext and fix the result (add a charset tag and fix the html escaping
|
||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
||||
# is an awk program
|
||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
||||
awk 'BEGIN'\
|
||||
# Decide whether an OCR pass may be attempted at all. This needs both
# the tesseract and pdftoppm commands, and the user must have opted in
# by creating the "ocrpdf" file in the configuration directory.
confdir=${RECOLL_CONFDIR:-~/.recoll}
if iscmd tesseract && iscmd pdftoppm && test -f "$confdir/ocrpdf"; then
    ocrpossible=1
else
    ocrpossible=0
fi
|
||||
|
||||
tmpdir=
|
||||
|
||||
# Remove the temporary working area used for the OCR pass.
# Globals: tmpdir (read) - private temporary directory, empty when none
#          has been created, in which case nothing is done.
cleanup()
{
    # Note that we're using a constant part (rclpdftmp), that hopefully
    # guarantees that we can't do big mistakes with the -rf here.
    if test -n "$tmpdir"; then
        # The paths are quoted so that a temporary location containing
        # blanks or glob characters is removed as a single path.
        rm -rf -- "$tmpdir/rclpdftmp"
        rmdir -- "$tmpdir"
    fi
}
|
||||
|
||||
trap cleanup EXIT HUP QUIT INT TERM
|
||||
|
||||
runpdftotext()
|
||||
{
|
||||
# Run pdftotext and fix the result (add a charset tag and fix the
|
||||
# html escaping.
|
||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
||||
awk 'BEGIN'\
|
||||
' {
|
||||
doescape = 0
|
||||
cont = ""
|
||||
@ -179,5 +210,125 @@ awk 'BEGIN'\
|
||||
}
|
||||
print $0
|
||||
}
|
||||
'
|
||||
'
|
||||
}
|
||||
|
||||
# If we're not equipped for ocr, just run pdftotext to stdout
|
||||
# OCR is not an option: the pdftotext conversion goes straight to
# stdout and its status becomes the exit status of the filter.
case $ocrpossible in
0)
    runpdftotext
    exit $?
    ;;
esac
|
||||
|
||||
|
||||
# tesseract is installed, prepare for running it.
|
||||
# We need to check the pdftotext output, but we don't want to run
|
||||
# it twice. Use a temporary file.
|
||||
# Pick the parent for our temporary directory: RECOLL_TMPDIR if set,
# else TMPDIR, else /tmp.
if test -n "$RECOLL_TMPDIR"; then
    ttdir=$RECOLL_TMPDIR
elif test -n "$TMPDIR" ; then
    ttdir=$TMPDIR
else
    ttdir=/tmp
fi

# Create the directory under a scratch name first. "tmpdir" is what
# cleanup() removes, so it is only set once mkdir has succeeded: this
# avoids running rm -rf inside a pre-existing directory which this run
# did not create.
# NOTE(review): this assumes that senderror terminates the script;
# its definition is not visible here.
newtmpdir="$ttdir/rclpdf_tmp$$"
mkdir "$newtmpdir" || senderror mkdir "$newtmpdir" failed
tmpdir=$newtmpdir
mkdir "$tmpdir/rclpdftmp" || senderror mkdir "$tmpdir/rclpdftmp" failed
|
||||
|
||||
# Run pdftotext into the temp file
|
||||
# Keep the pdftotext conversion in a file so that it can be inspected
# and then output without running pdftotext a second time.
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
runpdftotext > "$pdftxtfile"

# If text is big, or small but not only tags and empty lines, output
# it. Given the contents check which we perform, a file in which the
# only text content is metadata (pdf description field), will be run
# through OCR, which is not necessarily what we would want. It would
# be possible to detect the situation if this proved an issue.
# The size is the byte count from wc, which does not depend on the
# column layout of "ls -l".
txtsize=$(wc -c < "$pdftxtfile")
txtempty=0
# Use grep to check if there is regular text in there. Only do it on
# small outputs
if test "$txtsize" -lt 5000 ; then
    # -v selects the lines which are neither blank nor a line made only
    # of tags; -q just reports whether at least one such line exists.
    if ! grep -E -q -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' "$pdftxtfile"; then
        txtempty=1
    fi
fi

if test $txtempty -eq 0; then
    # pdftotext produced actual output, use it. No OCR
    cat "$pdftxtfile"
    exit 0
fi
|
||||
|
||||
# PDF has no text content and tesseract is available. Give it a try
|
||||
# Choose the tesseract language for the current document and store it
# in the global "tesseractlang".
# Globals read: infile, confdir, RECOLL_TESSERACT_LANG, LANG
# Globals written: tesseractlang (also pdflangfile, localelang)
# The first source which yields a non-empty value wins:
#  1. the content of ".ocrpdflang" in the directory holding $infile
#  2. a value already present in "tesseractlang"
#  3. the RECOLL_TESSERACT_LANG environment variable
#  4. a guess from the language part of $LANG (en, de and fr only)
#  5. the content of $confdir/ocrpdf
#  6. "eng"
guesstesseractlang()
{
    pdflangfile=$(dirname "$infile")/.ocrpdflang
    if test -f "$pdflangfile"; then
        tesseractlang=$(cat "$pdflangfile")
    fi

    # Try to guess tesseract language. This should depend on the input
    # file, but we have no general way to determine it. So use the
    # environment and hope for the best.
    if test -z "$tesseractlang"; then
        tesseractlang=${RECOLL_TESSERACT_LANG}
        if test -z "$tesseractlang"; then
            # Half assed trial to guess from LANG then default to english
            localelang=${LANG%%_*}
            # echo localelang "$localelang" >&2
            case "$localelang" in
                en) tesseractlang=eng;;
                de) tesseractlang=deu;;
                fr) tesseractlang=fra;;
                # Someone will have to add more tesseract language codes here.
            esac

            # Only fall back to the configuration file when the locale
            # gave nothing. The variable name tested here used to be
            # misspelled, which made the file content always win.
            test -z "$tesseractlang" && tesseractlang=$(cat "$confdir/ocrpdf")

            test -z "$tesseractlang" && tesseractlang="eng"
        fi
    fi
}

# PDF has no text content and tesseract is available: select the
# language for the OCR pass.
guesstesseractlang
|
||||
|
||||
# echo tesseractlang "$tesseractlang" >&2
|
||||
|
||||
# Scratch file receiving the tesseract messages, and common prefix of
# the page images produced by pdftoppm (the XXXXXX part is literal).
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"

# split pdf-pages
ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
if [ $? -ne 0 ] ; then
    senderror "pdftoppm: $ERR_MSG"
fi

# Run tesseract on each non-empty page image. The image name is also
# used as the output base name for tesseract.
for i in "$TMPFILE"* ; do
    if [ -s "$i" ] ; then

        # $tesseractlang is left unquoted on purpose: word splitting
        # drops stray blanks coming from a language file.
        tesseract "$i" "$i" -l $tesseractlang > "$TESSERRORFILE" 2>&1
        TESSERR=$?
        # ignore tesseract start message
        LINECOUNT=$(wc -l < "$TESSERRORFILE")
        if [ "$TESSERR" -ne 0 ] || [ "$LINECOUNT" -gt 1 ] ; then
            echo "tesseract-error $TESSERR page $i in $infile" >&2
            # sort "compacts" leptonica-output
            sort -u "$TESSERRORFILE" >&2
        fi
    # else
    #     debugging purpose
    #     SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
    #     echo "no pdftoppm in $infile cp to $SICFILE" >&2
    #     cp -a $infile $SICFILE
    # fi
    fi
done
|
||||
|
||||
# don't output "empty" HTML-Files
|
||||
# Copy stdin to stdout, replacing the characters which are special in
# HTML text (&, < and >) with the corresponding entities.
escapehtmltext()
{
    # "&" is processed first so that the entities produced for "<" and
    # ">" are not escaped a second time. In an awk replacement string
    # "\\&" stands for a literal ampersand.
    awk '{
        gsub(/&/, "\\&amp;", $0)
        gsub(/</, "\\&lt;", $0)
        gsub(/>/, "\\&gt;", $0)
        print $0
    }
    '
}

# Count the characters found in the per-page OCR text files, and only
# produce an HTML document when there are some.
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
if [ "$CHARS" -gt 0 ] ; then
    echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>"
    cat "$TMPFILE"*.txt | escapehtmltext
    echo "</pre></body></html>"
fi
|
||||
@ -162,12 +162,22 @@
|
||||
<li><span class="application">pdf</span> with the <span class=
|
||||
"command">pdftotext</span> command, which comes with
|
||||
<a href="http://poppler.freedesktop.org/">poppler</a>,
|
||||
(the package name is quite
|
||||
often <tt>poppler-utils</tt>).
|
||||
<em>Note: the older <span class="command">pdftotext</span> command
|
||||
(the package name is quite often <tt>poppler-utils</tt>). <br/>
|
||||
Note: the older <span class="command">pdftotext</span> command
|
||||
which comes with <span class="application">xpdf</span> is
|
||||
not compatible with <span class="application">
|
||||
Recoll</span></em>.</li>
|
||||
Recoll</span><br/>
|
||||
|
||||
<em>New in 1.21</em>: if the <span class="application">
|
||||
tesseract</span> OCR application, and the
|
||||
<span class="command">pdftoppm</span> command are available
|
||||
on the system, the <span class="command">rclpdf</span>
|
||||
filter has the capability to run OCR. See the comments at
|
||||
the top of <span class="command">rclpdf</span> (usually
|
||||
found
|
||||
in <span class="filename">/usr/share/recoll/filters</span>)
|
||||
for how to enable this and configuration details.
|
||||
</li>
|
||||
|
||||
<li><span class="application">msword</span> with <a href=
|
||||
"http://www.winfield.demon.nl/">antiword</a>. It is also useful to
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user