From 17393bad473c27476a8aa91171dd5652b4f48ba3 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 22 Mar 2010 18:24:55 +0100 Subject: [PATCH] converted to xslt --- src/filters/rclabw | 156 +++++++++++++++++++++------------------- src/filters/rclkwd | 145 +++++++++++++++++++++---------------- src/filters/rclscribus | 5 +- src/filters/rclsoff | 160 +++++++++++++++++++++-------------------- src/filters/rclsvg | 92 +++++++++++------------- 5 files changed, 292 insertions(+), 266 deletions(-) diff --git a/src/filters/rclabw b/src/filters/rclabw index 48941484..ffb88658 100755 --- a/src/filters/rclabw +++ b/src/filters/rclabw @@ -82,88 +82,98 @@ umask 77 # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -checkcmds iconv sed +checkcmds xsltproc -encoding=`sed -e '/$//' \ - -e '/^ + -# Note: there can be newlines inside the description field, we don't want -# them... Have 2 use 2 different selectors for the single-line and -# multiple-line cases because of the generic tag end ( for all meta -# tags) -descsedprog=' -/\([^<]*\)<\/m>/ { -s//\1/ -p -q -} -//,/<\/m>/ { -s!.*!! -s!.*!! -H -} -${ -g -s/\n/ /g -p -} -' + -description=`sed -n -e "$descsedprog" < "$infile"` -#echo description: "$description" + + + + + + -# Set program for the single line meta elements. Takes element name as -# parameter -setmetasedprog() { -metasedprog='//{ -s/.*\([^<]*\).*/\1/ -'"s/\"/'/g"' -p -}' -} + + + + -setmetasedprog dc.subject -subject=`sed -n -e "$metasedprog" "$infile"` -#echo subject: "$subject" + + + +

+ +
+
-setmetasedprog dc.title -title=`sed -n -e "$metasedprog" "$infile"` -#echo titre: "$title" + + +
-setmetasedprog abiword.keywords -keywords=`sed -n -e "$metasedprog" "$infile"` -#echo keywords: "$keywords" + +

+ +
-setmetasedprog dc.creator -creator=`sed -n -e "$metasedprog" "$infile"` -#echo creator: "$creator" - -# Note: next expr supposes that paragraphs are always all by themselves on -# a single line in the xml (no multiple

per line, no embedded newlines -# in text). -contentsedprog=' -/]/{ -s/<[^>]*>/ /g -p -} -' -content=`sed -n -e "$contentsedprog" "$infile"` -#echo content: "$content" - -# output the result -(echo '' "$title" '' -echo '' -echo '' -echo '' -echo '' -echo '

'
-echo "$content" 
-echo '
') \ -| iconv -f $encoding -t UTF-8 -c -s + + + + + + author + + + + + + + + + keywords + + + + + + + + + keywords + + + + + + + + + abstract + + + + + + + + <xsl:value-of select="."/> + + + + + + + +
+EOF # exit normally exit 0 diff --git a/src/filters/rclkwd b/src/filters/rclkwd index a416af72..671c9db9 100755 --- a/src/filters/rclkwd +++ b/src/filters/rclkwd @@ -86,7 +86,7 @@ umask 77 # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -checkcmds awk unzip gunzip tar +checkcmds unzip gunzip tar xsltproc # We need a temporary directory if test z"$RECOLL_TMPDIR" != z; then @@ -115,7 +115,6 @@ if file $infile | grep -qi gzip ; then # Unzip the input file and change to the unzipped directory gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -) else - echo new kwd # Unzip the input file and change to the unzipped directory unzip -q -d $tmpdir/rclkwdtmp "$infile" fi @@ -124,74 +123,98 @@ cd $tmpdir/rclkwdtmp metafile=documentinfo.xml contentfile=maindoc.xml -if test -f $metafile ; then +echo ' +' - # Note: there can be newlines inside the description field, we don't want - # them... - abssedprog='//,/<\/abstract>/{ -s!.*!! -s!.*!! -p -} -' - abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \ - sed -e '1s///'` - subject=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ - < $metafile` - title=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ - < $metafile | tr '\n' ' '` - keywords=`sed -e "s/\"/'/" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \ - < $metafile` +if test -f $metafile ; then + xsltproc --novalid - $metafile <<EOF +<?xml version="1.0"?> +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:kw="http://www.koffice.org/DTD/document-info" + exclude-result-prefixes="kw" + > + +<xsl:output method="html" encoding="UTF-8"/> + +<xsl:template match="/"> + <xsl:apply-templates select="kw:document-info|document-info"/> +</xsl:template> +<xsl:template match="/kw:document-info|/document-info"> + <xsl:apply-templates select="kw:author|author"/> + <xsl:apply-templates select="kw:about/kw:abstract|abstract"/> + <xsl:apply-templates select="kw:about/kw:title|title"/> + <xsl:apply-templates select="kw:about/kw:keyword|keyword"/> + <xsl:apply-templates select="kw:about/kw:subject|subject"/> +</xsl:template> + +<xsl:template match="kw:author|author"> + <meta> + <xsl:attribute name="name">author</xsl:attribute> + <xsl:attribute name="content"> + <xsl:value-of select="kw:full-name|full-name"/> + </xsl:attribute> + </meta><xsl:text> + </xsl:text> +</xsl:template> + +<xsl:template match="kw:abstract|abstract"> + <meta> + <xsl:attribute name="name">abtract</xsl:attribute> + <xsl:attribute name="content"> + <xsl:value-of select="."/> + </xsl:attribute> + </meta><xsl:text> + </xsl:text> +</xsl:template> + +<xsl:template match="kw:keyword|kw:subject|keyword|subject"> + <meta> + <xsl:attribute name="name">keywords</xsl:attribute> + <xsl:attribute name="content"> + <xsl:value-of select="."/> + </xsl:attribute> + </meta><xsl:text> + </xsl:text> +</xsl:template> + +<xsl:template match="kw:title|title"> + <title><xsl:value-of select="."/> + + + + + +EOF fi -# Note: next expr inserts a newline at each end of paragraph (for preview) -content="`sed -e 's!!\\ -!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`" +echo '' +xsltproc --novalid - $contentfile < + -#echo abstract "$abstract" -#echo subject "$subject" -#echo title "$title" -#echo keywords "$keywords" -#echo content "$content" + -# output the result -echo '' -echo '' "$title" '' -echo '' -echo '' -echo '' -echo '

' + + + -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ -awk 'BEGIN'\ -' { - cont = "" -} -{ - $0 = cont $0 - cont = "" + + +

+ + + - if ($0 ~ /[­-]$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH-1) - $0 = line - } +
- if($0 == "\f") { - print "

\n
\n

" - next - } +EOF - print $0 "
" -} -END { - printf("

\n"); -}' | iconv -f UTF-8 -t UTF-8 -c -s +echo '' cd / # exit normally diff --git a/src/filters/rclscribus b/src/filters/rclscribus index 9be6c19a..871f0003 100755 --- a/src/filters/rclscribus +++ b/src/filters/rclscribus @@ -13,6 +13,9 @@ # We just hack into the scribus XML, taking advantage that the tag of # interest is apparently always output on a single line. # The text seems to be found in attribute CH of tag ITEXT, it is utf-8 +# +# Tried to convert this to xsltproc but it seems that quite a few +# Scribus document are not actually proper xml # set variables LANG=C ; export LANG @@ -22,8 +25,6 @@ filetype=Scribus - - #RECFILTCOMMONCODE ############################################################################## # !! Leave the previous line unmodified!! Code imported from the diff --git a/src/filters/rclsoff b/src/filters/rclsoff index 626c44d9..1bb150af 100755 --- a/src/filters/rclsoff +++ b/src/filters/rclsoff @@ -18,10 +18,6 @@ progname="rclsoff" filetype=openoffice - - - - #RECFILTCOMMONCODE ############################################################################## # !! Leave the previous line unmodified!! Code imported from the @@ -88,7 +84,7 @@ umask 77 # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -checkcmds awk iconv unzip +checkcmds xsltproc # We need a temporary directory if test z"$RECOLL_TMPDIR" != z; then @@ -116,92 +112,98 @@ trap cleanup EXIT HUP QUIT INT TERM unzip -q -d $tmpdir/rclsofftmp "$infile" cd $tmpdir/rclsofftmp -# Note: there can be newlines inside the description field, we don't want -# them... -descsedprog='//,/<\/dc:description>/{ -s!.*!! -s!.*!! -H -${ -g -s/\n/ /g -p -} -} -' -description=`sed -n -e "$descsedprog" meta.xml` -#echo description "$description" +echo ' +' -# Takes tag name as parameter and creates sed program to extract single -# line meta tags values. -setmetasedprog() { -metasedprog="s/\"/'/g"' -/.*<'"$1"'>\([^<]*\).*/s//\1/p -' -} +xsltproc - meta.xml < + -setmetasedprog dc:subject -subject=`sed -n -e "$metasedprog" meta.xml` -#echo subject: $subject + -setmetasedprog dc:title -title=`sed -n -e "$metasedprog" meta.xml` -#echo title: $title + + + + + + + -setmetasedprog meta:keyword -keywords=`sed -n -e "$metasedprog" meta.xml` -#echo keywords: $keywords + + <xsl:value-of select="."/> + + -setmetasedprog dc:creator -creator=`sed -n -e "$metasedprog" meta.xml` -#echo creator: $creator + + + abstract + + + + + + -# Note: next expr inserts a newline at each end of paragraph (for preview) -content="`sed -e 's!!\\ -!g' -e 's/<[^>]*>/ /g' < content.xml`" + + + keywords + + + + + + -#echo content "$content" + + + author + + + + + + -# output the result -echo '' -echo '' "$title" '' -echo '' -echo '' -echo '' -echo '' -echo '

' + + + keywords + + + + + + -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ -awk 'BEGIN'\ -' { - cont = "" -} -{ - $0 = cont $0 - cont = "" + +EOF - if ($0 ~ /[­-]$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH-1) - $0 = line - } +echo '' - if($0 == "\f") { - print "

\n
\n

" - next - } +xsltproc - content.xml < + - print $0 "
" -} -END { - printf("

\n"); -}' | iconv -f UTF-8 -t UTF-8 -c -s + + +

+ +
+ +
+EOF +echo '' cd / -# exit normally exit 0 diff --git a/src/filters/rclsvg b/src/filters/rclsvg index b2acbfba..82278f51 100755 --- a/src/filters/rclsvg +++ b/src/filters/rclsvg @@ -1,10 +1,5 @@ #!/bin/sh -# @(#$Id: rclsvg,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ + #================================================================ # Extract text from a Scalable Vector Graphics file #================================================================ @@ -82,56 +77,51 @@ umask 77 # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -checkcmds iconv sed +checkcmds xsltproc -encoding=`sed -ne '/ + -if test X$encoding = X ; then encoding=UTF-8;fi + -# We use several sed instances to make our life easier. Not good for -# performance, and a sed guru might be able to do better. -# -# The first sed makes sure each tag starts on a new line -# The second one selects the tags we're interested in. -# The last strips the tags, leaving only text. -# -# The whole thing wholly ignore issues like '<' inside quoted strings. -# -# We could/should add code to explicitly separate title and other -# metadata elements. + + + + + + + + + + + -# Insert new line before each tag -sptagonline='s/ + + keywords + + + + + + -# Select tags -spselecttags='//p -/<desc/,/<\/desc>/p -/<metadata/,/<\/metadata>/p -/<text/,/<\/text>/p' +<xsl:template match="svg:title"> + <title><xsl:value-of select="."/> + + + + +

+ +
-# Strip tags -spstriptags='#n -//!{ - N - b c - } - />/s/<.*>//g -} -/^[ ]*$/!p' +
+EOF -content=`sed -e "$sptagonline" < $infile | sed -ne "$spselecttags" | \ - sed -ne "$spstriptags"` - -(echo '' -echo '' -echo '
'
-echo "$content" 
-echo '
') \ -| iconv -f $encoding -t UTF-8 -c -s - - -# exit normally exit 0