powerpoint: decide to use unoconv based on the number of lines in catppt output

This commit is contained in:
Jean-Francois Dockes 2013-11-12 10:40:07 +01:00
parent a9358d2f03
commit 134153e412

View File

@ -30,8 +30,6 @@ LC_ALL=C ; export LC_ALL
progname="rclppt" progname="rclppt"
filetype=powerpoint filetype=powerpoint
RCLPPT_CATPPT=${RCLPPT_CATPPT:=yes}
#RECFILTCOMMONCODE #RECFILTCOMMONCODE
############################################################################## ##############################################################################
# !! Leave the previous line unmodified!! Code imported from the # !! Leave the previous line unmodified!! Code imported from the
@ -98,54 +96,68 @@ umask 77
# !! Leave the following line unmodified ! # !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE #ENDRECFILTCOMMONCODE
if test X$RCLPPT_CATPPT = Xyes ; then havecappt=no
checkcmds catppt iscmd cappt && havecappt=yes
haveunoconv=no
iscmd unoconv && haveunoconv=yes
iscmd pdftotext || haveunoconv=no
# output the result if test X$havecatppt = Xno -a X$haveunoconv = Xno ; then
echo '<html><head>' # checkcmds will exit with the appropriate salutations
#echo '<title>' "$title" '</title>' checkcmds catppt unoconv pdftotext
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">' fi
echo '</head><body>'
echo '<pre>' # This needs a temp dir because we first output pdf (outputting html
# would produce one file per page), and pdftotext can't read from
catppt -d utf-8 "$infile" | \ # stdin
sed -e 's/</&lt;/g' -e 's/&/&amp;/g' if test z"$RECOLL_TMPDIR" != z; then
ttdir=$RECOLL_TMPDIR
echo '</pre>' elif test z"$TMPDIR" != z ; then
echo '</body></html>' ttdir=$TMPDIR
else
# exit normally ttdir=/tmp
exit 0 fi
else tmpdir=$ttdir/rclppt_tmp$$
mkdir $tmpdir || exit 1
# Using unoconv mkdir $tmpdir/rclppttmp || exit 1
checkcmds unoconv pdftotext unopdf=$tmpdir/rclppttmp/output.pdf
cattxt=$tmpdir/rclppttmp/output.txt
# This needs a temp dir because we first output pdf (outputting html cleanup()
# would produce one file per page), and pdftotext can't read from {
# stdin # Note that we're using a constant part (rclppttmp), that hopefully
if test z"$RECOLL_TMPDIR" != z; then # guarantees that we can't do big mistakes here.
ttdir=$RECOLL_TMPDIR rm -rf $tmpdir/rclppttmp
elif test z"$TMPDIR" != z ; then rmdir $tmpdir
ttdir=$TMPDIR }
else trap cleanup EXIT HUP QUIT INT TERM
ttdir=/tmp
fi # Try catppt. If the output looks too small and unoconv is available, use this
# instead. unoconv is very slow but it handles newer files that catppt will
tmpdir=$ttdir/rclppt_tmp$$ # not convert.
mkdir $tmpdir || exit 1 #
mkdir $tmpdir/rclppttmp || exit 1 # I'm not sure of the right test for detecting catppt failure. On the
unopdf=$tmpdir/rclppttmp/output.pdf # sample I have, it outputs Azure\n1_Azure\n\n. I don't know if Azure
cleanup() # is a good marker of failure. Anyway, it seems unlikely that a real
{ # ppt would have fewer than 5 lines
# Note that we're using a constant part (rclppttmp), that hopefully
# guarantees that we can't do big mistakes here. catppt -d utf-8 "$infile" > $cattxt
rm -rf $tmpdir/rclppttmp lines=`wc -l < $cattxt`
rmdir $tmpdir
} if test $lines -lt 5 -a X$haveunoconv = Xyes; then
unoconv -f pdf -o $unopdf "$infile"
trap cleanup EXIT HUP QUIT INT TERM `dirname $0`/rclpdf $unopdf
unoconv -f pdf -o $unopdf "$infile" else
`dirname $0`/rclpdf $unopdf # output the catppt result
echo '<html><head>'
#echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>'
echo '<pre>'
catppt -d utf-8 "$infile" | \
sed -e 's/</&lt;/g' -e 's/&/&amp;/g' < $cattxt
echo '</pre>'
echo '</body></html>'
fi fi