Improve rcldoc filter and switch back to using it for indexing instead of direct antiword exec. This is slightly slower, but it does catch a number of .doc files which would not be indexed otherwise.

This commit is contained in:
Jean-Francois Dockes 2011-05-10 09:03:13 +02:00
parent ce607032fa
commit dd8f42253c
5 changed files with 49 additions and 17 deletions

View File

@ -102,15 +102,18 @@ checkcmds awk antiword iconv
$decoder "$infile" |
awk 'BEGIN'\
' {
print "<html><head><title></title>"
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
print "</head>\n<body>\n<p>"
cont = ""
gotdata = 0
}
{
if (!($0 ~ /^[ ]*$/))
if (!($0 ~ /^[ ]*$/)) {
gotdata = 1
if (gotdata == 0) {
print "<html><head><title></title>"
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
print "</head>\n<body>\n<p>"
}
}
$0 = cont $0
cont = ""
@ -139,12 +142,32 @@ END {
print "</p></body></html>"
}'
# wvWare is much slower and we don't use it by default, but it handles
# some files that antiword won't, so use it as a last resort. Yes the
# html document will have a second header section, but this doesn't
# seem to be an issue with our brave html input handler...
# Antiword rarely fails, we try to catch the most common reasons:
if test $? -eq 1 ; then
if iscmd wvWare ; then
wvWare --nographics --charset=utf-8 "$infile"
# Check actual document type
mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
if test X"$mtype" = Xtext/rtf; then
# RTF document disguised as msword, either because it has a .doc
# extension or because it's an attachment with a wrong MIME type.
exec `dirname $0`/rclrtf $infile
fi
if test X"$mtype" = Xtext/plain; then
# Someone gave a .doc ext to their texts. Happens...
exec `dirname $0`/rcltext $infile
fi
if test X"$mtype" = Xapplication/msword; then
# Actually application/msword: try wvWare, which is much
# slower and we don't use it by default, but it handles some
# files that antiword won't, so use it as a last resort.
if iscmd wvWare ; then
exec wvWare --nographics --charset=utf-8 "$infile"
fi
fi
# else let the error be...
exit 1
fi

View File

@ -29,10 +29,14 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
# each filter, see the examples below (e.g. msword)
[index]
# MSWORD: install wvware and restore the rcldoc version to fix the "text
# stream of this file is too small to handle" error if it's a problem for you
# application/msword = exec rcldoc
application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
# MSWORD: the rcldoc script handles a number of marginal cases that raw
# antiword won't:
# - with wvWare: "text stream of this file is too small to handle"
# - with unrtf: rtf files disguised as doc files.
# The default is now again to use rcldoc. Use raw antiword if speed is more
# important for you than catching all data.
application/msword = exec rcldoc
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
# You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics

View File

@ -5,8 +5,10 @@ topdir=`dirname $0`/..
initvariables $0
recollq '"Evenements et programme 2006"' 2> $mystderr |
egrep -v '^Recoll query: ' > $mystdout
(
recollq '"Evenements et programme 2006"'
recollq 'pcx11 manuel de programmation iamactuallyanrtf'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

View File

@ -1,3 +1,5 @@
2 results
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc] [programme.doc] 58880 bytes
application/msword [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes
1 results
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/IAmActuallyAnRTF.DOC] [DOC PROGRAMMEUR PCX9] 85381 bytes

View File

@ -13,8 +13,9 @@ makeindex() {
echo "Indexing"
recollindex -z
}
if test x$noindex = x ; then
makeindex
fi
# Yes, we could/should use the $toptmp from shared.sh here, but what if
# this is unset ?