From 4d780d0c5877026e61cec972a6c97683efb9d17c Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 22 Jan 2007 16:32:55 +0000 Subject: [PATCH] added scribus support --- src/filters/rclscribus | 151 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100755 src/filters/rclscribus diff --git a/src/filters/rclscribus b/src/filters/rclscribus new file mode 100755 index 00000000..26d4107d --- /dev/null +++ b/src/filters/rclscribus @@ -0,0 +1,151 @@ +#!/bin/sh +# @(#$Id: rclscribus,v 1.1 2007-01-22 16:32:55 dockes Exp $ (C) 2004 J.F.Dockes +# There may still be code from Estraier in here: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rclscribus +# Convert a scribus file to recoll HTML. This only handles the newer .sla +# files until I can have a look at an older .scd. +# +# We just hack into the scribus XML, taking advantage that the tag of +# interest is apparently always output on a single line. +# The text seems to be found in attribute CH of tag ITEXT, it is utf-8 + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclscribus" + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Extract scribus text as basic HTML.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +# check the input file existence +if test ! 
-f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds grep awk sed + +# A small sed program to join lines where they are broken inside an +# attribute value. The idea is that all scribus tags are apparently on one +# line except when there are embedded newlines in an attribute like +# 'comments'. The first version of the sed script joins lines which do not +# end with > with the next. It doesn't guard against an embedded '>'. The +# second joins lines not beginning with '<' with the previous. It is much +# slower for some reason. +sedjoinprog=':a +/[^>] *$/N; s/\n/ /; ta' +#sedjoinprog1=':a +#$!N;/^ *[^<]/s/\n/ /;ta +#P;D' + +# Extract description, title, author and keywords +description=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +title=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +author=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +keywords=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +#echo description: [$description];echo title: [$title]; +#echo author: [$author];echo keywords: [$keywords] + +cat < +$title + + + + + +

+EOF + + +sed -e ':a' -e '/[^>] *$/N; s/\n/ /; ta' < $infile | \ +awk ' +/" + } +} +END { + print "

" +} +' | \ +sed -e 's//
/g' -e 's//
/g'