diff --git a/src/filters/rclabw b/src/filters/rclabw index 88135515..02cda362 100755 --- a/src/filters/rclabw +++ b/src/filters/rclabw @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclabw,v 1.2 2007-06-15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclabw,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -83,13 +83,6 @@ umask 77 checkcmds iconv sed -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - encoding=`sed -e '/$//' \ -e '/^' #echo '' "$title" '' echo '' -echo '' -echo '
'
 
 nocaptionexp='s/===.*://'
-kpcaptionexp='s/===[^(]*//'
 
 if test X$RECOLL_FILTER_FORPREVIEW = Xyes ; then
-    captionexp=$kpcaptionexp
-else
-    captionexp=$nocaptionexp
-fi
-
+echo '
'
 id3info "$infile" | \
    sed -e '/Tag information for/d' \
-       -e "$captionexp" \
-       -e 's//>/g' 
+echo '
' -echo '
' -echo '' +else +# Indexing. There are many other fields which we could process this way. +# Would also need that some fields are set up in the "fields" config +# file to be really useful +id3info "$infile" | awk -F: \ +'{ + value="" + for (i = 2; i <=NF; i++) { + value = value " " $i + } + sub("&", "&", value) + sub("\"", """, value) + sub("<", "<", value) + sub(">", ">", value) +} +/TIT2/{ + printf "\n", value + body=body value "\n" + next +} +/TPE1/{ + printf "\n", value + body=body value "\n" + next +} +/TALB/{ + printf "\n", value + body=body value "\n" + next +} +{ + body=body value "\n" +} +END{print "
";print body;print "
"} +' +fi -# exit normally exit 0 diff --git a/src/filters/rclkwd b/src/filters/rclkwd index fadf0739..96529a10 100755 --- a/src/filters/rclkwd +++ b/src/filters/rclkwd @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclkwd,v 1.1 2007-06-08 14:01:30 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclkwd,v 1.2 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -87,13 +87,6 @@ umask 77 checkcmds awk unzip gunzip tar -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - # We need a temporary directory if test z"$RECOLL_TMPDIR" != z; then ttdir=$RECOLL_TMPDIR diff --git a/src/filters/rclogg b/src/filters/rclogg index 15ce7585..c8b94451 100755 --- a/src/filters/rclogg +++ b/src/filters/rclogg @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclogg,v 1.1 2007-10-02 14:00:47 dockes Exp $ (C) 2007 J.F.Dockes +# @(#$Id: rclogg,v 1.2 2008-10-08 08:27:34 dockes Exp $ (C) 2007 J.F.Dockes #================================================================ # rclogg # Handle ogg audio files for recoll. @@ -74,33 +74,37 @@ fi # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -checkcmds ogginfo +checkcmds ogginfo sed awk # output the result echo '' -#echo '' "$title" '' echo '' -echo '' -echo '
'
 
-nocaptionexp='/=/s/.*=//p'
-kpcaptionexp='/=/p'
+if test X$RECOLL_FILTER_FORPREVIEW = Xyes 
+then
+
+echo '
'
+ogginfo "$infile" | grep '=' | sed -e 's/&/&/g' -e 's/"/"/g' \
+    -e 's//>/g'
+echo '
' -if test X$RECOLL_FILTER_FORPREVIEW = Xyes ; then - captionexp=$kpcaptionexp else - captionexp=$nocaptionexp + ogginfo "$infile" | grep '=' | awk -F= \ + '{sub("^[ ]+", "", $1) + sub("&", "&", $2) + sub("\"", """, $2) + sub("<", "<", $2) + sub(">", ">", $2) + printf "\n", $1, $2 + body=body $2 "\n" + } + END{print "
";print body;print "
"}' fi # Note: ogginfo output is unparseable: the COMMENT field can be # multiline, but there is no way to detect the end of the 'User # comments' area -ogginfo "$infile" | \ - sed -n \ - -e "$captionexp" -echo '
' -echo '' # exit normally exit 0 diff --git a/src/filters/rclopxml b/src/filters/rclopxml index 9065e306..719ad5f9 100755 --- a/src/filters/rclopxml +++ b/src/filters/rclopxml @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclopxml,v 1.2 2008-09-01 17:31:47 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes #================================================================ # rcldocx # Extract text from an openxml msword file (will be extended for spreadsheets) @@ -80,13 +80,6 @@ umask 77 checkcmds xsltproc unzip -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - # We need a temporary directory if test z"$RECOLL_TMPDIR" != z; then ttdir=$RECOLL_TMPDIR diff --git a/src/filters/rclppt b/src/filters/rclppt index d6fabf84..798ec612 100755 --- a/src/filters/rclppt +++ b/src/filters/rclppt @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclppt,v 1.3 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or @@ -100,10 +100,6 @@ umask 77 #ENDRECFILTCOMMONCODE checkcmds catppt -if test X$cmdsok = X0 ; then - printf "Catppt not found" - exit 1 -fi # output the result echo '' diff --git a/src/filters/rclsoff b/src/filters/rclsoff index f704de74..aae69329 100755 --- a/src/filters/rclsoff +++ b/src/filters/rclsoff @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclsoff,v 1.11 2007-06-15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -89,13 +89,6 @@ umask 77 checkcmds awk iconv unzip -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - # We need a temporary directory if test z"$RECOLL_TMPDIR" != z; then ttdir=$RECOLL_TMPDIR diff --git a/src/filters/rclsvg b/src/filters/rclsvg index 12167586..eca854d6 100755 --- a/src/filters/rclsvg +++ b/src/filters/rclsvg @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclsvg,v 1.2 2008-02-03 16:05:57 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclsvg,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -83,13 +83,6 @@ umask 77 checkcmds iconv sed -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - encoding=`sed -ne '/' diff --git a/src/sampleconf/fields b/src/sampleconf/fields index ff5974c2..e54f60ab 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -1,4 +1,4 @@ -# @(#$Id: fields,v 1.4 2008-10-07 06:44:23 dockes Exp $ (C) 2007 J.F.Dockes +# @(#$Id: fields,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2007 J.F.Dockes # Field names configuration. This defines how one may search ie for # author:Hemingway # Important: @@ -53,7 +53,7 @@ stored = author [aliases] abstract = summary dc:summary description xesam:description -author = creator dc:creator xesam:author xesam:creator +author = creator dc:creator xesam:author xesam:creator from caption = title title dc:title subject # catg = dc:type contentCategory dbytes = size xesam:size diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 7bf40bc4..fc6bd5e8 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -1,4 +1,4 @@ -# @(#$Id: mimeconf,v 1.44 2008-09-28 14:20:50 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeconf,v 1.45 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes # Recoll : associations of mime types to processing filters. # There are different sections for decompression, 'interning' for indexing @@ -26,12 +26,15 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t # The external "exec" filters are typically scripts. They output the # document in simple html format, have a look at the scripts. [index] -application/msword = exec rcldoc +# Note: rcldoc did some work to splice hyphenated words at eol. Seems +# actually not needed because antiword apparently does it too +# application/msword = exec rcldoc +application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8 application/ogg = exec rclogg application/pdf = exec rclpdf application/postscript = exec rclps -application/vnd.ms-excel = exec rclxls -application/vnd.ms-powerpoint = exec rclppt +application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain +application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ exec rclopxml application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ @@ -54,7 +57,7 @@ application/vnd.sun.xml.math = exec rclsoff application/vnd.sun.xml.writer = exec rclsoff application/vnd.sun.xml.writer.global = exec rclsoff application/vnd.sun.xml.writer.template = exec rclsoff -application/vnd.wordperfect = exec rclwpd +application/vnd.wordperfect = exec wpd2html;mimetype=text/html application/x-abiword = exec rclabw application/x-dvi = exec rcldvi application/x-flac = exec rclflac @@ -72,7 +75,7 @@ image/svg+xml = exec rclsvg message/rfc822 = internal text/html = internal text/plain = internal -text/rtf = exec rclrtf +text/rtf = exec unrtf --nopict --html;charset=iso-8859-1;mimetype=text/html text/x-c = exec rcltext text/x-gaim-log = exec rclgaim text/x-html-sidux-man = exec rclsiduxman