Add capability to run tesseract from rclpdf. Disabled by default, see comments at the top of rclpdf
This commit is contained in:
parent
25cecec484
commit
eaddefa7c5
@ -1,19 +1,28 @@
|
|||||||
#!/bin/sh
|
#!/bin/bash
|
||||||
#================================================================
|
#================================================================
|
||||||
# Some parts are Copyright Estraier (GPL v2).
|
# Copyright (C) 2015 J.F. Dockes
|
||||||
# Estraier: a personal full-text search system
|
# There used to be Estraier content in there, but I quite believe that is not
|
||||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
# the case any more.
|
||||||
# Copyright (C) 2014 J.F. Dockes
|
|
||||||
# This file is licensed under the GPL v2
|
# This file is licensed under the GPL v2
|
||||||
#================================================================
|
#================================================================
|
||||||
#================================================================
|
|
||||||
# Convert a pdf file to HTML.
|
# Convert a pdf file to HTML.
|
||||||
#
|
#
|
||||||
# We use pdftotext from the xpdf/poppler-utils package.
|
# We use pdftotext from the xpdf/poppler-utils package.
|
||||||
#
|
#
|
||||||
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
||||||
# We try to correct.
|
# We try to correct.
|
||||||
|
#
|
||||||
|
# If pdftotext produces no text and tesseract is available, we try to
|
||||||
|
# perform OCR. As this can be very slow and the result not always
|
||||||
|
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
|
||||||
|
#
|
||||||
|
# We guess the OCR language in order of preference:
|
||||||
|
# - From the content of a ".ocrpdflang" file if it exists in the same
|
||||||
|
# directory as the PDF
|
||||||
|
# - From the RECOLL_TESSERACT_LANG environment variable
|
||||||
|
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
||||||
|
# - Default to "eng"
|
||||||
|
#
|
||||||
# Uncomment the following if you get better results without. The
|
# Uncomment the following if you get better results without. The
|
||||||
# pdftotext manual says that the option is no longer recommended The
|
# pdftotext manual says that the option is no longer recommended The
|
||||||
# difference in output seems mostly the removal of soft-hyphens when
|
# difference in output seems mostly the removal of soft-hyphens when
|
||||||
@ -21,8 +30,6 @@
|
|||||||
# optionraw=-raw
|
# optionraw=-raw
|
||||||
|
|
||||||
# set variables
|
# set variables
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclpdf"
|
progname="rclpdf"
|
||||||
filetype=pdf
|
filetype=pdf
|
||||||
|
|
||||||
@ -95,12 +102,36 @@ umask 77
|
|||||||
|
|
||||||
checkcmds pdftotext iconv awk
|
checkcmds pdftotext iconv awk
|
||||||
|
|
||||||
# Run pdftotext and fix the result (add a charset tag and fix the html escaping
|
ocrpossible=0
|
||||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
if iscmd tesseract; then
|
||||||
# is an awk program
|
if iscmd pdftoppm; then
|
||||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
ocrpossible=1
|
||||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
fi
|
||||||
awk 'BEGIN'\
|
fi
|
||||||
|
confdir=${RECOLL_CONFDIR:-~/.recoll}
|
||||||
|
test ! -f "$confdir/ocrpdf" && ocrpossible=0
|
||||||
|
|
||||||
|
tmpdir=
|
||||||
|
|
||||||
|
cleanup()
|
||||||
|
{
|
||||||
|
# Note that we're using a constant part (rclpdftmp), that hopefully
|
||||||
|
# guarantees that we can't do big mistakes with the -rf here.
|
||||||
|
if test ! -z "$tmpdir"; then
|
||||||
|
# Quoted so that a temporary directory path containing whitespace is
# passed to rm/rmdir as a single argument.
rm -rf "$tmpdir/rclpdftmp"
|
||||||
|
rmdir "$tmpdir"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
trap cleanup EXIT HUP QUIT INT TERM
|
||||||
|
|
||||||
|
runpdftotext()
|
||||||
|
{
|
||||||
|
# Run pdftotext and fix the result (add a charset tag and fix the
|
||||||
|
# html escaping).
|
||||||
|
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||||
|
iconv -f UTF-8 -t UTF-8 -c -s |
|
||||||
|
awk 'BEGIN'\
|
||||||
' {
|
' {
|
||||||
doescape = 0
|
doescape = 0
|
||||||
cont = ""
|
cont = ""
|
||||||
@ -179,5 +210,125 @@ awk 'BEGIN'\
|
|||||||
}
|
}
|
||||||
print $0
|
print $0
|
||||||
}
|
}
|
||||||
'
|
'
|
||||||
|
}
|
||||||
|
|
||||||
|
# If we're not equipped for ocr, just run pdftotext to stdout
|
||||||
|
if test $ocrpossible -eq 0; then
|
||||||
|
runpdftotext
|
||||||
|
exit $?
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# tesseract is installed, prepare for running it.
|
||||||
|
# We need to check the pdftotext output, but we don't want to run
|
||||||
|
# it twice. Use a temporary file.
|
||||||
|
if test z"$RECOLL_TMPDIR" != z; then
|
||||||
|
ttdir=$RECOLL_TMPDIR
|
||||||
|
elif test z"$TMPDIR" != z ; then
|
||||||
|
ttdir=$TMPDIR
|
||||||
|
else
|
||||||
|
ttdir=/tmp
|
||||||
|
fi
|
||||||
|
tmpdir=$ttdir/rclpdf_tmp$$
|
||||||
|
mkdir $tmpdir || senderror mkdir $tmpdir failed
|
||||||
|
mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed
|
||||||
|
|
||||||
|
# Run pdftotext into the temp file
|
||||||
|
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
|
||||||
|
runpdftotext > $pdftxtfile
|
||||||
|
|
||||||
|
# If text is big, or small but not only tags and empty lines, output
|
||||||
|
# it. Given the contents check which we perform, a file in which the
|
||||||
|
# only text content is metadata (pdf description field), will be run
|
||||||
|
# through OCR, which is not necessarily what we would want. It would
|
||||||
|
# be possible to detect the situation if this proved an issue.
|
||||||
|
txtsize=`ls -l $pdftxtfile | awk '{print $5}'`
|
||||||
|
txtempty=0
|
||||||
|
# Use grep to check if there is regular text in there. Only do it on
|
||||||
|
# small outputs
|
||||||
|
if test $txtsize -lt 5000 ; then
|
||||||
|
realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile`
|
||||||
|
test -z "$realtext" && txtempty=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if test $txtempty -eq 0; then
|
||||||
|
# pdftotext produced actual output, use it. No OCR
|
||||||
|
cat $pdftxtfile
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# PDF has no text content and tesseract is available. Give it a try
|
||||||
|
pdflangfile=`dirname "$infile"`/.ocrpdflang
|
||||||
|
if test -f "$pdflangfile"; then
|
||||||
|
tesseractlang=`cat "$pdflangfile"`
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Try to guess tesseract language. This should depend on the input
|
||||||
|
# file, but we have no general way to determine it. So use the
|
||||||
|
# environment and hope for the best.
|
||||||
|
if test -z "$tesseractlang"; then
|
||||||
|
tesseractlang=${RECOLL_TESSERACT_LANG}
|
||||||
|
if test -z "$tesseractlang"; then
|
||||||
|
# Half assed trial to guess from LANG then default to english
|
||||||
|
localelang=`echo $LANG | awk -F_ '{print $1}'`
|
||||||
|
# echo localelang "$localelang" >&2
|
||||||
|
case "$localelang" in
|
||||||
|
en) tesseractlang=eng;;
|
||||||
|
de) tesseractlang=deu;;
|
||||||
|
fr) tesseractlang=fra;;
|
||||||
|
# Someone will have to add more tesseract language codes here.
|
||||||
|
esac
|
||||||
|
|
||||||
|
# Only read the language from the ocrpdf file if nothing was guessed above.
test -z "$tesseractlang" && tesseractlang=`cat "$confdir/ocrpdf"`
|
||||||
|
|
||||||
|
test -z "$tesseractlang" && tesseractlang="eng"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# echo tesseractlang "$tesseractlang" >&2
|
||||||
|
|
||||||
|
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
|
||||||
|
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
|
||||||
|
|
||||||
|
# split pdf-pages
|
||||||
|
ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
|
||||||
|
if [ $? -ne 0 ] ; then
|
||||||
|
senderror "pdftoppm: $ERR_MSG"
|
||||||
|
fi
|
||||||
|
|
||||||
|
for i in $TMPFILE* ; do
|
||||||
|
if [ -s "$i" ] ; then
|
||||||
|
|
||||||
|
tesseract $i $i -l $tesseractlang > $TESSERRORFILE 2>&1
|
||||||
|
TESSERR=$?
|
||||||
|
# ignore tesseract start message
|
||||||
|
LINECOUNT=$(wc -l < $TESSERRORFILE)
|
||||||
|
if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then
|
||||||
|
echo "tesseract-error $TESSERR page $i in $infile" >&2
|
||||||
|
# sort "compacts" leptonica-output
|
||||||
|
cat $TESSERRORFILE | sort -u >&2
|
||||||
|
fi
|
||||||
|
# else
|
||||||
|
# debugging purpose
|
||||||
|
# SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
|
||||||
|
# echo "no pdftoppm in $infile cp to $SICFILE" >&2
|
||||||
|
# cp -a $infile $SICFILE
|
||||||
|
# fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# don't output "empty" HTML-Files
|
||||||
|
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
|
||||||
|
if [ "$CHARS" -gt 0 ] ; then
|
||||||
|
echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>"
|
||||||
|
cat "$TMPFILE"*.txt | \
|
||||||
|
awk '{
|
||||||
|
# Escape HTML metacharacters; ampersand first so entities are not re-escaped
gsub(/&/, "\\&amp;", $0)
|
||||||
|
gsub(/</, "\\&lt;", $0)
|
||||||
|
gsub(/>/, "\\&gt;", $0)
|
||||||
|
print $0
|
||||||
|
}
|
||||||
|
'
|
||||||
|
echo "</pre></body></html>"
|
||||||
|
fi
|
||||||
@ -162,12 +162,22 @@
|
|||||||
<li><span class="application">pdf</span> with the <span class=
|
<li><span class="application">pdf</span> with the <span class=
|
||||||
"command">pdftotext</span> command, which comes with
|
"command">pdftotext</span> command, which comes with
|
||||||
<a href="http://poppler.freedesktop.org/">poppler</a>,
|
<a href="http://poppler.freedesktop.org/">poppler</a>,
|
||||||
(the package name is quite
|
(the package name is quite often <tt>poppler-utils</tt>). <br/>
|
||||||
often <tt>poppler-utils</tt>).
|
Note: the older <span class="command">pdftotext</span> command
|
||||||
<em>Note: the older <span class="command">pdftotext</span> command
|
|
||||||
which comes with <span class="application">xpdf</span> is
|
which comes with <span class="application">xpdf</span> is
|
||||||
not compatible with <span class="application">
|
not compatible with <span class="application">
|
||||||
Recoll</span></em>.</li>
|
Recoll</span>.<br/>
|
||||||
|
|
||||||
|
<em>New in 1.21</em>: if the <span class="application">
|
||||||
|
tesseract</span> OCR application, and the
|
||||||
|
<span class="command">pdftoppm</span> command are available
|
||||||
|
on the system, the <span class="command">rclpdf</span>
|
||||||
|
filter has the capability to run OCR. See the comments at
|
||||||
|
the top of <span class="command">rclpdf</span> (usually
|
||||||
|
found
|
||||||
|
in <span class="filename">/usr/share/recoll/filters</span>)
|
||||||
|
for how to enable this and configuration details.
|
||||||
|
</li>
|
||||||
|
|
||||||
<li><span class="application">msword</span> with <a href=
|
<li><span class="application">msword</span> with <a href=
|
||||||
"http://www.winfield.demon.nl/">antiword</a>. It is also useful to
|
"http://www.winfield.demon.nl/">antiword</a>. It is also useful to
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user