Improve the rcldoc filter and switch back to using it for indexing instead of executing antiword directly. This is slightly slower, but it catches a number of .doc files which would not be indexed otherwise.
This commit is contained in:
parent
ce607032fa
commit
dd8f42253c
@ -102,15 +102,18 @@ checkcmds awk antiword iconv
|
||||
$decoder "$infile" |
|
||||
awk 'BEGIN'\
|
||||
' {
|
||||
print "<html><head><title></title>"
|
||||
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
|
||||
print "</head>\n<body>\n<p>"
|
||||
cont = ""
|
||||
gotdata = 0
|
||||
}
|
||||
{
|
||||
if (!($0 ~ /^[ ]*$/))
|
||||
if (!($0 ~ /^[ ]*$/)) {
|
||||
gotdata = 1
|
||||
if (gotdata == 0) {
|
||||
print "<html><head><title></title>"
|
||||
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
|
||||
print "</head>\n<body>\n<p>"
|
||||
}
|
||||
}
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
|
||||
@ -139,12 +142,32 @@ END {
|
||||
print "</p></body></html>"
|
||||
}'
|
||||
|
||||
# wvWare is much slower and we don't use it by default, but it handles
|
||||
# some files that antiword won't, so use it as a last resort. Yes the
|
||||
# html document will have a second header section, but this doesn't
|
||||
# seem to be an issue with our brave html input handler...
|
||||
|
||||
# Antiword rarely fails, we try to catch the most common reasons:
|
||||
if test $? -eq 1 ; then
|
||||
if iscmd wvWare ; then
|
||||
wvWare --nographics --charset=utf-8 "$infile"
|
||||
# Check actual document type
|
||||
mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
|
||||
|
||||
if test X"$mtype" = Xtext/rtf; then
|
||||
# RTF document disguised as msword, either because it has a .doc
|
||||
# extension or because it's an attachment with a wrong mime.
|
||||
exec `dirname $0`/rclrtf $infile
|
||||
fi
|
||||
|
||||
if test X"$mtype" = Xtext/plain; then
|
||||
# Someone gave a .doc ext to their texts. Happens...
|
||||
exec `dirname $0`/rcltext $infile
|
||||
fi
|
||||
|
||||
if test X"$mtype" = Xapplication/msword; then
|
||||
# Actually application/msword: try wvWare, which is much
|
||||
# slower and we don't use it by default, but it handles some
|
||||
# files that antiword won't, so use it as a last resort.
|
||||
if iscmd wvWare ; then
|
||||
exec wvWare --nographics --charset=utf-8 "$infile"
|
||||
fi
|
||||
fi
|
||||
|
||||
# else let the error be...
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -29,10 +29,14 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
|
||||
# each filter, see the examples below (e.g. msword)
|
||||
[index]
|
||||
|
||||
# MSWORD: install wvware and restore the rcldoc version to fix the "text
|
||||
# stream of this file is too small to handle" error if it's a problem for you
|
||||
# application/msword = exec rcldoc
|
||||
application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
|
||||
# MSWORD: the rcldoc script handles a number of marginal cases that raw
|
||||
# antiword won't:
|
||||
# - with wvWare: "text stream of this file is too small to handle"
|
||||
# - with unrtf: RTF files disguised as doc files.
|
||||
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
||||
# important to you than catching all data.
|
||||
application/msword = exec rcldoc
|
||||
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
|
||||
# You can also use wvware directly but it's much slower.
|
||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||
|
||||
|
||||
@ -5,8 +5,10 @@ topdir=`dirname $0`/..
|
||||
|
||||
initvariables $0
|
||||
|
||||
recollq '"Evenements et programme 2006"' 2> $mystderr |
|
||||
egrep -v '^Recoll query: ' > $mystdout
|
||||
(
|
||||
recollq '"Evenements et programme 2006"'
|
||||
recollq 'pcx11 manuel de programmation iamactuallyanrtf'
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
2 results
|
||||
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc] [programme.doc] 58880 bytes
|
||||
application/msword [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes
|
||||
1 results
|
||||
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/IAmActuallyAnRTF.DOC] [DOC PROGRAMMEUR PCX9] 85381 bytes
|
||||
|
||||
@ -13,8 +13,9 @@ makeindex() {
|
||||
echo "Indexing"
|
||||
recollindex -z
|
||||
}
|
||||
|
||||
if test x$noindex = x ; then
|
||||
makeindex
|
||||
fi
|
||||
|
||||
# Yes, we could/should use the $toptmp from shared.sh here, but what if
|
||||
# this is unset?
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user