Improve the rcldoc filter and switch back to using it for indexing instead of a direct antiword exec. This is slightly slower, but it does catch a number of .doc files which would not be indexed otherwise.
This commit is contained in:
parent
ce607032fa
commit
dd8f42253c
@ -102,15 +102,18 @@ checkcmds awk antiword iconv
|
|||||||
$decoder "$infile" |
|
$decoder "$infile" |
|
||||||
awk 'BEGIN'\
|
awk 'BEGIN'\
|
||||||
' {
|
' {
|
||||||
print "<html><head><title></title>"
|
|
||||||
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
|
|
||||||
print "</head>\n<body>\n<p>"
|
|
||||||
cont = ""
|
cont = ""
|
||||||
gotdata = 0
|
gotdata = 0
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
if (!($0 ~ /^[ ]*$/))
|
if (!($0 ~ /^[ ]*$/)) {
|
||||||
gotdata = 1
|
gotdata = 1
|
||||||
|
if (gotdata == 0) {
|
||||||
|
print "<html><head><title></title>"
|
||||||
|
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
|
||||||
|
print "</head>\n<body>\n<p>"
|
||||||
|
}
|
||||||
|
}
|
||||||
$0 = cont $0
|
$0 = cont $0
|
||||||
cont = ""
|
cont = ""
|
||||||
|
|
||||||
@ -139,12 +142,32 @@ END {
|
|||||||
print "</p></body></html>"
|
print "</p></body></html>"
|
||||||
}'
|
}'
|
||||||
|
|
||||||
# wvWare is much slower and we don't use it by default, but it handles
|
|
||||||
# some files that antiword won't, so use it as a last resort. Yes the
|
# Antiword rarely fails; we try to catch the most common reasons:
|
||||||
# html document will have a second header section, but this doesn't
|
|
||||||
# seem to be an issue with our brave html input handler...
|
|
||||||
if test $? -eq 1 ; then
|
if test $? -eq 1 ; then
|
||||||
if iscmd wvWare ; then
|
# Check actual document type
|
||||||
wvWare --nographics --charset=utf-8 "$infile"
|
mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
|
||||||
|
|
||||||
|
if test X"$mtype" = Xtext/rtf; then
|
||||||
|
# RTF document disguised as msword, either because it has a .doc
|
||||||
|
# extension or because it's an attachment with a wrong mime.
|
||||||
|
exec `dirname $0`/rclrtf $infile
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if test X"$mtype" = Xtext/plain; then
|
||||||
|
# Someone gave a .doc ext to their texts. Happens...
|
||||||
|
exec `dirname $0`/rcltext $infile
|
||||||
|
fi
|
||||||
|
|
||||||
|
if test X"$mtype" = Xapplication/msword; then
|
||||||
|
# Actually application/msword: try wvWare, which is much
|
||||||
|
# slower and we don't use it by default, but it handles some
|
||||||
|
# files that antiword won't, so use it as a last resort.
|
||||||
|
if iscmd wvWare ; then
|
||||||
|
exec wvWare --nographics --charset=utf-8 "$infile"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# else let the error be...
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -29,10 +29,14 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
|
|||||||
# each filter, see the examples below (e.g. msword)
|
# each filter, see the examples below (e.g. msword)
|
||||||
[index]
|
[index]
|
||||||
|
|
||||||
# MSWORD: install wvware and restore the rcldoc version to fix the "text
|
# MSWORD: the rcldoc script handles a number of marginal cases that raw
|
||||||
# stream of this file is too small to handle" error if it's a problem for you
|
# antiword won't:
|
||||||
# application/msword = exec rcldoc
|
# - with wvWare: "text stream of this file is too small to handle"
|
||||||
application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
|
# - with unrtf: rtf files disguising as doc files.
|
||||||
|
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
||||||
|
# important for you than catching all data.
|
||||||
|
application/msword = exec rcldoc
|
||||||
|
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
|
||||||
# You can also use wvware directly but it's much slower.
|
# You can also use wvware directly but it's much slower.
|
||||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||||
|
|
||||||
|
|||||||
@ -5,8 +5,10 @@ topdir=`dirname $0`/..
|
|||||||
|
|
||||||
initvariables $0
|
initvariables $0
|
||||||
|
|
||||||
recollq '"Evenements et programme 2006"' 2> $mystderr |
|
(
|
||||||
egrep -v '^Recoll query: ' > $mystdout
|
recollq '"Evenements et programme 2006"'
|
||||||
|
recollq 'pcx11 manuel de programmation iamactuallyanrtf'
|
||||||
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
2 results
|
2 results
|
||||||
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc] [programme.doc] 58880 bytes
|
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc] [programme.doc] 58880 bytes
|
||||||
application/msword [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes
|
application/msword [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes
|
||||||
|
1 results
|
||||||
|
application/msword [file:///home/dockes/projets/fulltext/testrecoll/msword/IAmActuallyAnRTF.DOC] [DOC PROGRAMMEUR PCX9] 85381 bytes
|
||||||
|
|||||||
@ -13,8 +13,9 @@ makeindex() {
|
|||||||
echo "Indexing"
|
echo "Indexing"
|
||||||
recollindex -z
|
recollindex -z
|
||||||
}
|
}
|
||||||
|
if test x$noindex = x ; then
|
||||||
makeindex
|
makeindex
|
||||||
|
fi
|
||||||
|
|
||||||
# Yes, we could/should use the $toptmp from shared.sh here, but what if
|
# Yes, we could/should use the $toptmp from shared.sh here, but what if
|
||||||
# this is unset ?
|
# this is unset ?
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user