Improve the rcldoc filter and switch back to using it for indexing instead of executing antiword directly. This is slightly slower, but it catches a number of .doc files which would not be indexed otherwise.

2011-05-10 09:03:13 +02:00 · 2011-05-10 09:03:13 +02:00 · dd8f42253c
commit dd8f42253c
parent ce607032fa
5 changed files with 49 additions and 17 deletions
--- a/src/filters/rcldoc
+++ b/src/filters/rcldoc
@ -102,15 +102,18 @@ checkcmds awk antiword iconv
 $decoder "$infile" |
 awk 'BEGIN'\
 ' {
-  print "<html><head><title></title>"
-  print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
-  print "</head>\n<body>\n<p>"
  cont = ""
  gotdata = 0
 }
 {
-  if (!($0 ~ /^[ 	]*$/))
-  if (!($0 ~ /^[ 	]*$/))
+  if (!($0 ~ /^[ 	]*$/)) {
+    if (gotdata == 0) {  # first non-blank line: emit the HTML header exactly once
+      print "<html><head><title></title>"
+      print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
+      print "</head>\n<body>\n<p>"
+    }
    gotdata = 1
+  }
  $0 = cont $0
  cont = ""

@ -139,12 +142,32 @@ END {
    print "</p></body></html>"
 }' 

-# wvWare is much slower and we don't use it by default, but it handles
-# some files that antiword won't, so use it as a last resort. Yes the
-# html document will have a second header section, but this doesn't
-# seem to be an issue with our brave html input handler...
+
+# Antiword rarely fails; when it does, we try to catch the most common reasons:
 if test $? -eq 1 ; then
-    if iscmd wvWare ; then
-	wvWare --nographics --charset=utf-8 "$infile"
+    # Check the actual document type as reported by file(1)
+    mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
+
+    if test X"$mtype" = Xtext/rtf; then
+      # RTF document masquerading as msword, either because it has a .doc
+      # extension or because it's an attachment with a wrong mime type.
+      exec "`dirname "$0"`/rclrtf" "$infile"
     fi
+
+    if test X"$mtype" = Xtext/plain; then
+      # Someone gave a .doc ext to their texts. Happens...
+      exec "`dirname "$0"`/rcltext" "$infile"
+    fi
+
+    if test X"$mtype" = Xapplication/msword; then
+      # Actually application/msword: try wvWare, which is much
+      # slower and we don't use it by default, but it handles some
+      # files that antiword won't, so use it as a last resort.
+      if iscmd wvWare ; then
+        exec wvWare --nographics --charset=utf-8 "$infile"
+      fi
+    fi
+
+    # else let the error be...
+    exit 1
 fi
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -29,10 +29,14 @@ application/x-bzip2 =  uncompress rcluncomp bunzip2 %f %t
 # each filter, see the exemples below (ie: msword)
 [index]

-# MSWORD: install wvware and restore the rcldoc version to fix the "text
-# stream of this file is too small to handle" error if it's a problem for you
-# application/msword = exec rcldoc
-application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
+# MSWORD: the rcldoc script handles a number of marginal cases that raw
+# antiword won't:
+#  - with wvWare: "text stream of this file is too small to handle"
+#  - with unrtf: rtf files masquerading as doc files.
+# The default is now again to use rcldoc. Use raw antiword if speed is more
+# important for you than catching all data:
+application/msword = exec rcldoc
+#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
 # You can also use wvware directly but it's much slower.
 # application/msword = exec wvWare --charset=utf-8 --nographics

--- a/tests/msword/msword.sh
+++ b/tests/msword/msword.sh
@ -5,8 +5,10 @@ topdir=`dirname $0`/..

 initvariables $0

-recollq '"Evenements et programme 2006"' 2> $mystderr | 
-	egrep -v '^Recoll query: ' > $mystdout
+(
+recollq '"Evenements et programme 2006"' 
+recollq 'pcx11 manuel de programmation iamactuallyanrtf'
+)  2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

--- a/tests/msword/msword.txt
+++ b/tests/msword/msword.txt
@ -1,3 +1,5 @@
 2 results
 application/msword	[file:///home/dockes/projets/fulltext/testrecoll/msword/programme.doc]	[programme.doc]	58880	bytes	
 application/msword	[file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip]	[misc.zip]	168155	bytes	
+1 results
+application/msword	[file:///home/dockes/projets/fulltext/testrecoll/msword/IAmActuallyAnRTF.DOC]	[DOC PROGRAMMEUR PCX9]	85381	bytes	
--- a/tests/runtests.sh
+++ b/tests/runtests.sh
@ -13,8 +13,9 @@ makeindex() {
  echo "Indexing" 
  recollindex -z
 }
-
+if test x"$noindex" = x ; then  # rebuild the index unless noindex is set
 makeindex
+fi

 # Yes, we could/should use the $toptmp from shared.sh here, but what if
 # this is unset ?