diff --git a/src/filters/rcldoc b/src/filters/rcldoc index 9dd96e32..2b9c4690 100755 --- a/src/filters/rcldoc +++ b/src/filters/rcldoc @@ -102,15 +102,19 @@ checkcmds awk antiword iconv $decoder "$infile" | awk 'BEGIN'\ ' { - print "
" cont = "" gotdata = 0 } { - if (!($0 ~ /^[ ]*$/)) - gotdata = 1 + if (!($0 ~ /^[ ]*$/)) { + # Emit the header once, on the first non-blank line, before flagging data + if (gotdata == 0) { + print "
" + } + gotdata = 1 + } $0 = cont $0 cont = "" @@ -139,12 +143,32 @@ END { print "
" }' -# wvWare is much slower and we don't use it by default, but it handles -# some files that antiword won't, so use it as a last resort. Yes the -# html document will have a second header section, but this doesn't -# seem to be an issue with our brave html input handler... + +# Antiword rarely fails, we try to catch the most common reasons: if test $? -eq 1 ; then - if iscmd wvWare ; then - wvWare --nographics --charset=utf-8 "$infile" + # Check actual document type + mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'` + + if test X"$mtype" = Xtext/rtf; then + # RTF document disguised as msword, either because it has a .doc + # extension or because it's an attachment with a wrong mime type. + exec "$(dirname "$0")/rclrtf" "$infile" fi + + if test X"$mtype" = Xtext/plain; then + # Someone gave a .doc ext to their texts. Happens... + exec "$(dirname "$0")/rcltext" "$infile" + fi + + if test X"$mtype" = Xapplication/msword; then + # Actually application/msword: try wvWare, which is much + # slower and we don't use it by default, but it handles some + # files that antiword won't, so use it as a last resort. + if iscmd wvWare ; then + exec wvWare --nographics --charset=utf-8 "$infile" + fi + fi + + # else let the error be... 
+ exit 1 fi diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 1c304ccc..a01bf763 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -29,10 +29,14 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t # each filter, see the exemples below (ie: msword) [index] -# MSWORD: install wvware and restore the rcldoc version to fix the "text -# stream of this file is too small to handle" error if it's a problem for you -# application/msword = exec rcldoc -application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8 +# MSWORD: the rcldoc script handles a number of marginal cases that raw +# antiword won't: +# - with wvWare: "text stream of this file is too small to handle" +# - with unrtf: rtf files disguised as doc files. +# The default is once again to use rcldoc. Use raw antiword if speed is more +# important for you than catching all data. +application/msword = exec rcldoc +#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8 # You can also use wvware directly but it's much slower. # application/msword = exec wvWare --charset=utf-8 --nographics diff --git a/tests/msword/msword.sh b/tests/msword/msword.sh index 37554871..f4b957e9 100755 --- a/tests/msword/msword.sh +++ b/tests/msword/msword.sh @@ -5,8 +5,10 @@ topdir=`dirname $0`/.. 
initvariables $0 -recollq '"Evenements et programme 2006"' 2> $mystderr | - egrep -v '^Recoll query: ' > $mystdout +( +recollq '"Evenements et programme 2006"' +recollq 'pcx11 manuel de programmation iamactuallyanrtf' +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff --git a/tests/msword/msword.txt b/tests/msword/msword.txt index 83d382d1..03b44a61 100644 --- a/tests/msword/msword.txt +++ b/tests/msword/msword.txt @@ -1,3 +1,5 @@ 2 results application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc] [programme.doc] 58880 bytes application/msword [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes +1 results +application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/IAmActuallyAnRTF.DOC] [DOC PROGRAMMEUR PCX9] 85381 bytes diff --git a/tests/runtests.sh b/tests/runtests.sh index d9540c3b..f1dbc636 100644 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -13,8 +13,9 @@ makeindex() { echo "Indexing" recollindex -z } - +if test x$noindex = x ; then makeindex +fi # Yes, we could/should use the $toptmp from shared.sh here, but what if # this is unset ?