diff --git a/src/filters/rclabw b/src/filters/rclabw new file mode 100755 index 00000000..ed2767ef --- /dev/null +++ b/src/filters/rclabw @@ -0,0 +1,173 @@ +#!/bin/sh +# @(#$Id: rclabw,v 1.1 2007-06-15 09:25:23 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# Extract text from an abiword file +#================================================================ + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclabw" +filetype=abiword + +#RECFILTCOMMONCODE +############################################################################## +# !! Leave the previous line unmodified!! Code imported from the +# recfiltcommon file + +# Utility code common to all shell filters. This could be sourced at run +# time, but it's slightly more efficient to include the code in the +# filters at build time (with a sed script). + +# Describe error in a way that can be interpreted by our caller +senderror() +{ + echo RECFILTERROR $* + # Also alert on stderr just in case + echo ":2:$progname::: $*" 1>&2 + exit 1 +} + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + senderror HELPERNOTFOUND $cmd + fi + done +} + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + echo "Convert a $filetype file to HTML text for Recoll indexing." + echo "Usage: $progname [infile]" + exit 1 +fi + +infile="$1" + +# check the input file existence (may be '-' for stdin) +if test "X$infile" != X- -a ! -f "$infile" +then + senderror INPUTNOSUCHFILE "$infile" +fi + +# protect access to our temp files and directories +umask 77 + +############################################################################## +# !! Leave the following line unmodified ! +#ENDRECFILTCOMMONCODE + +checkcmds iconv sed + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +encoding=`sed -e '/$//' \ + -e '/^\([^<]*\)<\/m>/ { +s//\1/ +p +q +} +//,/<\/m>/ { +s!.*!! +s!.*!! +H +} +${ +g +s/\n/ /g +p +} +' + +description=`sed -n -e "$descsedprog" < "$infile"` +#echo description: "$description" + +# Set program for the single line meta elements. Takes element name as +# parameter +setmetasedprog() { +metasedprog='//{ +s/.*\([^<]*\).*/\1/ +'"s/\"/'/g"' +p +}' +} + +setmetasedprog dc.subject +subject=`sed -n -e "$metasedprog" "$infile"` +#echo subject: "$subject" + +setmetasedprog dc.title +title=`sed -n -e "$metasedprog" "$infile"` +#echo titre: "$title" + +setmetasedprog abiword.keywords +keywords=`sed -n -e "$metasedprog" "$infile"` +#echo keywords: "$keywords" + +setmetasedprog dc.creator +creator=`sed -n -e "$metasedprog" "$infile"` +#echo creator: "$creator" + +# Note: next expr supposes that paragraphs are always all by themselves on +# a single line in the xml (no multiple

per line, no embedded newlines +# in text). +contentsedprog=' +/

]*>/ /g +p +} +' +content=`sed -n -e "$contentsedprog" "$infile"` +#echo content: "$content" + +# output the result +(echo '' "$title" '' +echo '' +echo '' +echo '' +echo '' +echo '

'
+echo "$content" 
+echo '
') \ +| iconv -f $encoding -t UTF-8 -c -s + + +# exit normally +exit 0