Improve the rcldoc filter and switch back to using it for indexing instead of executing antiword directly. This is slightly slower, but it catches a number of .doc files which would otherwise not be indexed.

2011-05-10 09:03:13 +02:00 · 2011-05-10 09:03:13 +02:00 · dd8f42253c
commit dd8f42253c
parent ce607032fa
5 changed files with 49 additions and 17 deletions
--- a/src/filters/rcldoc
+++ b/src/filters/rcldoc
@ -102,15 +102,18 @@ checkcmds awk antiword iconv
 $decoder "$infile" |
 awk 'BEGIN'\
 ' {
  print "<html><head><title></title>"
  print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
  print "</head>\n<body>\n<p>"
  cont = ""
  gotdata = 0
 }
 {
-  if (!($0 ~ /^[ 	]*$/))
+  if (!($0 ~ /^[ 	]*$/)) {
    gotdata = 1
    if (gotdata == 0) {
      print "<html><head><title></title>"
      print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
      print "</head>\n<body>\n<p>"
    }
  }
  $0 = cont $0
  cont = ""
@ -139,12 +142,32 @@ END {
    print "</p></body></html>"
 }' 
-# wvWare is much slower and we don't use it by default, but it handles
+
-# some files that antiword won't, so use it as a last resort. Yes the
+# Antiword rarely fails, we try to catch the most common reasons:
 # html document will have a second header section, but this doesn't
 # seem to be an issue with our brave html input handler...
 if test $? -eq 1 ; then
-    if iscmd wvWare ; then
+    # Check actual document type 
-	wvWare --nographics --charset=utf-8 "$infile"
+    mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
    if test X"$mtype" = Xtext/rtf; then
      # RTF document disguised as msword, either because it has a .doc
      # extension or because it is an attachment with a wrong mime type.
      # Hand it over to the rclrtf filter located next to this script.
      # The script directory and the input path are both quoted so that
      # names containing whitespace or glob characters reach rclrtf as
      # one unmodified argument instead of being word-split.
      filterdir=`dirname "$0"`
      exec "$filterdir/rclrtf" "$infile"
    fi
    if test X"$mtype" = Xtext/plain; then
      # Someone gave a .doc extension to a plain text file. Happens...
      # Hand it over to the rcltext filter located next to this script.
      # The script directory and the input path are both quoted so that
      # names containing whitespace or glob characters reach rcltext as
      # one unmodified argument instead of being word-split.
      filterdir=`dirname "$0"`
      exec "$filterdir/rcltext" "$infile"
    fi
    # The file really is application/msword: retry with wvWare when that
    # command is available. wvWare is much slower, which is why it is not
    # used by default, but it handles some files that antiword won't, so
    # it serves as the last resort.
    if test X"$mtype" = Xapplication/msword && iscmd wvWare ; then
      exec wvWare --nographics --charset=utf-8 "$infile"
    fi
    # else let the error be...
    exit 1
 fi
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -29,10 +29,14 @@ application/x-bzip2 =  uncompress rcluncomp bunzip2 %f %t
 # each filter, see the examples below (e.g. msword)
 [index]
-# MSWORD: install wvware and restore the rcldoc version to fix the "text
+# MSWORD: the rcldoc script handles a number of marginal cases that raw
-# stream of this file is too small to handle" error if it's a problem for you
+# antiword won't:
-# application/msword = exec rcldoc
+#  - with wvWare: "text stream of this file is too small to handle" 
-application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
+#  - with unrtf: rtf files disguised as doc files.
 # The default is once again to use rcldoc. Use raw antiword if speed is more
 # important to you than catching all data.
 application/msword = exec rcldoc
 #application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
 # You can also use wvware directly but it's much slower.
 # application/msword = exec wvWare --charset=utf-8 --nographics
--- a/tests/msword/msword.sh
+++ b/tests/msword/msword.sh
@ -5,8 +5,10 @@ topdir=`dirname $0`/..
 initvariables $0
-recollq '"Evenements et programme 2006"' 2> $mystderr | 
+(
-	egrep -v '^Recoll query: ' > $mystdout
+recollq '"Evenements et programme 2006"' 
 recollq 'pcx11 manuel de programmation iamactuallyanrtf'
 )  2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/msword/msword.txt
+++ b/tests/msword/msword.txt
@ -1,3 +1,5 @@
 2 results
 application/msword	[file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc]	[programme.doc]	58880	bytes	
 application/msword	[file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip]	[misc.zip]	168155	bytes	
 1 results
 application/msword	[file:///home/dockes/projets/fulltext/testrecoll/msword/IAmActuallyAnRTF.DOC]	[DOC PROGRAMMEUR PCX9]	85381	bytes	
--- a/tests/runtests.sh
+++ b/tests/runtests.sh
@ -13,8 +13,9 @@ makeindex() {
  echo "Indexing" 
  recollindex -z
 }
-
+if test x$noindex = x ; then
 makeindex
 fi
 # Yes, we could/should use the $toptmp from shared.sh here, but what if
 # this is unset ?