diff --git a/src/filters/rclkwd b/src/filters/rclkwd
new file mode 100755
index 00000000..fadf0739
--- /dev/null
+++ b/src/filters/rclkwd
@@ -0,0 +1,207 @@
+#!/bin/sh
+# @(#$Id: rclkwd,v 1.1 2007-06-08 14:01:30 dockes Exp $ (C) 2004 J.F.Dockes
+# Parts taken from Estraier:
+#================================================================
+# Estraier: a personal full-text search system
+# Copyright (C) 2003-2004 Mikio Hirabayashi
+#================================================================
+#================================================================
+# rclkword
+# Extract text from a kword file
+#
+#================================================================
+
+# set variables
+LANG=C ; export LANG
+LC_ALL=C ; export LC_ALL
+progname="rclkwd"
+filetype=kword
+
+
+
+#RECFILTCOMMONCODE
+##############################################################################
+# !! Leave the previous line unmodified!! Code imported from the
+# recfiltcommon file
+
+# Utility code common to all shell filters. This could be sourced at run
+# time, but it's slightly more efficient to include the code in the
+# filters at build time (with a sed script).
+
+# Describe error in a way that can be interpreted by our caller
+senderror()
+{
+    echo RECFILTERROR $*
+    # Also alert on stderr just in case
+    echo ":2:$progname::: $*" 1>&2
+    exit 1
+}
+
+iscmd()
+{
+    cmd=$1
+    case $cmd in
+    */*)
+        if test -x $cmd ; then return 0; else return 1; fi ;;
+    *)
+      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
+      for d in $*;do test -x $d/$cmd && return 0;done
+      return 1 ;;
+    esac
+}
+
+checkcmds()
+{
+    for cmd in $*;do
+      if iscmd $cmd
+      then
+        a=1
+      else
+        senderror HELPERNOTFOUND $cmd
+      fi
+    done
+}
+
+# show help message
+if test $# -ne 1 -o "$1" = "--help"
+then
+  echo "Convert a $filetype file to HTML text for Recoll indexing."
+  echo "Usage: $progname [infile]"
+  exit 1
+fi
+
+infile="$1"
+
+# check the input file existence (may be '-' for stdin)
+if test "X$infile" != X- -a ! -f "$infile"
+then
+  senderror INPUTNOSUCHFILE "$infile"
+fi
+
+# protect access to our temp files and directories
+umask 77
+
+##############################################################################
+# !! Leave the following line unmodified !
+#ENDRECFILTCOMMONCODE
+
+checkcmds awk unzip gunzip tar
+
+# check the input file existence: the common code above lets '-' (stdin)
+# through, but file/gunzip/unzip below need a real file. Report the problem
+# through senderror so that stdout only ever carries HTML or RECFILTERROR.
+if test ! -f "$infile"
+then
+  senderror INPUTNOSUCHFILE "$infile"
+fi
+
+# We need a temporary directory
+if test z"$RECOLL_TMPDIR" != z; then
+   ttdir=$RECOLL_TMPDIR
+elif test z"$TMPDIR" != z ; then
+   ttdir=$TMPDIR
+else
+   ttdir=/tmp
+fi
+tmpdir=$ttdir/rclkwd_tmp$$
+mkdir "$tmpdir" || exit 1
+mkdir "$tmpdir/rclkwdtmp" || exit 1
+
+cleanup()
+{
+    # Note that we're using a constant part (rclkwdtmp), that hopefully
+    # guarantees that we can't do big mistakes here.
+    rm -rf "$tmpdir/rclkwdtmp"
+    rmdir "$tmpdir"
+}
+
+trap cleanup EXIT HUP QUIT INT TERM
+
+# Old kwd files are gzip/tar archives. Newer ones are zip archives.
+if file "$infile" | grep -qi gzip ; then
+    # Unpack the input file into the temporary directory
+    gunzip < "$infile" | (cd "$tmpdir/rclkwdtmp" && tar xf -)
+else
+    # Unzip the input file into the temporary directory
+    unzip -q -d "$tmpdir/rclkwdtmp" "$infile"
+fi
+cd "$tmpdir/rclkwdtmp" || exit 1
+
+metafile=documentinfo.xml
+contentfile=maindoc.xml
+
+if test -f $metafile ; then
+
+    # Note: there can be newlines inside the description field, we don't want
+    # them...
+    abssedprog='/<abstract>/,/<\/abstract>/{
+s!.*<abstract>!!
+s!</abstract>.*!!
+p
+}
+'
+    # NOTE(review): the pattern of the final '1s///' below is empty in the
+    # copy this script was reviewed from and could not be recovered; confirm
+    # it against the upstream rclkwd before relying on the abstract value.
+    abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \
+        sed -e '1s///'`
+    subject=`sed -e "s/\"/'/" -e 's/.*<subject>\([^<]*\).*/\1/p;d' \
+        < $metafile`
+    title=`sed -e "s/\"/'/" -e 's/.*<title>\([^<]*\).*/\1/p;d' \
+        < $metafile | tr '\n' ' '`
+    keywords=`sed -e "s/\"/'/" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \
+        < $metafile`
+fi
+
+# Note: next expr inserts a newline at each end of paragraph (for preview)
+content="`sed -e 's!</TEXT>!\\
+!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`"
+
+#echo abstract "$abstract"
+#echo subject "$subject"
+#echo title "$title"
+#echo keywords "$keywords"
+#echo content "$content"
+
+# output the result
+echo '<html><head>'
+echo '<title>' "$title" '</title>'
+echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
+echo '<meta name="abstract" content="' "$abstract $subject" '">'
+echo '<meta name="keywords" content="' "$keywords" '">'
+echo '</head><body><p>'
+
+# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
+# is an awk program
+echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
+awk 'BEGIN'\
+' {
+  cont = ""
+}
+{
+    $0 = cont $0
+    cont = ""
+
+    if ($0 ~ /[­-]$/) {
+      # Note : soft-hyphen is iso8859 0xad
+      # Break at last whitespace
+      match($0, "[ \t][^ \t]+$")
+      line = substr($0, 0, RSTART)
+      cont = substr($0, RSTART, RLENGTH-1)
+      $0 = line
+    }
+
+    if($0 == "\f") {
+       print "</p>\n<hr>\n<p>"
+       next
+    }
+
+    print $0 "<br>"
+}
+END {
+    printf("</p></body></html>\n");
+}' | iconv -f UTF-8 -t UTF-8 -c -s
+
+cd /
+# exit normally
+exit 0
diff --git a/src/sampleconf/mimeview b/src/sampleconf/mimeview
index 71eab445..593c2c7d 100644
--- a/src/sampleconf/mimeview
+++ b/src/sampleconf/mimeview
@@ -1,4 +1,4 @@
-# @(#$Id: mimeview,v 1.4 2007-02-14 10:10:43 dockes Exp $ (C) 2004 J.F.Dockes
+# @(#$Id: mimeview,v 1.5 2007-06-08 13:59:55 dockes Exp $ (C) 2004 J.F.Dockes
 
 ## ##########################################
 # External viewers, launched by the recoll GUI when you click on a result
@@ -8,6 +8,8 @@
 # Pseudo entry used if the 'use desktop' preference is set in the GUI
 application/x-all = xdg-open %f
 
+application/x-kword = kword %f
+
 application/msword = openoffice %f
 application/pdf = xpdf %f
 application/postscript = gv %f