From 17393bad473c27476a8aa91171dd5652b4f48ba3 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Mon, 22 Mar 2010 18:24:55 +0100
Subject: [PATCH] converted to xslt

---
 src/filters/rclabw     | 156 +++++++++++++++++++++-------------------
 src/filters/rclkwd     | 145 +++++++++++++++++++++----------------
 src/filters/rclscribus |   5 +-
 src/filters/rclsoff    | 160 +++++++++++++++++++++--------------------
 src/filters/rclsvg     |  92 +++++++++++-------------
 5 files changed, 292 insertions(+), 266 deletions(-)
diff --git a/src/filters/rclabw b/src/filters/rclabw
index 48941484..ffb88658 100755
--- a/src/filters/rclabw
+++ b/src/filters/rclabw
@@ -82,88 +82,98 @@ umask 77
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE
 
-checkcmds iconv sed
+checkcmds xsltproc
 
-encoding=`sed -e  '/<?xml version=/s/"?>$//' \
-	      -e '/^<?xml version=/s/.*encoding="//p;D;q' \
-	      -e D \
-< $infile`
-if test X$encoding = X ; then encoding=UTF-8;fi
+xsltproc - "$infile" <<'EOF'
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:ab="http://www.abisource.com/awml.dtd" 
+  exclude-result-prefixes="ab"
+  >
 
-# Note: there can be newlines inside the description field, we don't want
-# them... Have 2 use 2 different selectors for the single-line and
-# multiple-line cases because of the generic tag end (</m> for all meta
-# tags)
-descsedprog='
-/<m key="dc.description">\([^<]*\)<\/m>/ {
-s//\1/
-p
-q
-}
-/<m key="dc.description">/,/<\/m>/ {
-s!.*<m key="dc.description">!!
-s!</m>.*!!
-H
-}
-${
-g
-s/\n/ /g
-p
-}
-'
+<xsl:output method="html" encoding="UTF-8"/>
 
-description=`sed -n -e "$descsedprog" < "$infile"`
-#echo description: "$description"
+<xsl:template match="/">
+<html>
+  <head>
+    <xsl:apply-templates select="ab:abiword/ab:metadata"/>
+  </head>
+  <body>
 
-# Set program for the single line meta elements. Takes element name as
-# parameter 
-setmetasedprog() {
-metasedprog='/<m key="'$1'">/{
-s/.*<m key="'$1'">\([^<]*\).*/\1/
-'"s/\"/'/g"'
-p
-}'
-}
+    <!-- This is for the older abiword format with no namespaces -->
+    <xsl:for-each select="abiword/section">
+      <xsl:apply-templates select="p"/>
+    </xsl:for-each>
 
-setmetasedprog dc.subject
-subject=`sed -n -e "$metasedprog" "$infile"`
-#echo subject: "$subject"
+    <!-- Newer namespaced format -->
+    <xsl:for-each select="ab:abiword/ab:section">
+      <xsl:for-each select="ab:p">
+        <p><xsl:value-of select="."/></p><xsl:text>
+        </xsl:text>
+      </xsl:for-each>
+    </xsl:for-each>
 
-setmetasedprog dc.title
-title=`sed -n -e "$metasedprog" "$infile"`
-#echo titre: "$title"
+  </body>
+</html>
+</xsl:template>
 
-setmetasedprog abiword.keywords
-keywords=`sed -n -e "$metasedprog" "$infile"`
-#echo keywords: "$keywords"
+<xsl:template match="p">
+  <p><xsl:value-of select="."/></p><xsl:text>
+      </xsl:text>
+</xsl:template>
 
-setmetasedprog dc.creator
-creator=`sed -n -e "$metasedprog" "$infile"`
-#echo creator: "$creator"
-
-# Note: next expr supposes that paragraphs are always all by themselves on
-# a single line in the xml (no multiple <p> per line, no embedded newlines
-# in text).
-contentsedprog='
-/<p[ >]/{
-s/<[^>]*>/ /g
-p
-}
-'
-content=`sed -n -e "$contentsedprog" "$infile"`
-#echo content: "$content"
-
-# output the result
-(echo '<html><head><title>' "$title" '</title>'
-echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
-echo '<meta name="description" content="' "$description $subject" '">'
-echo '<meta name="keywords" content="' "$keywords" '">'
-echo '<meta name="author" content="' "$creator" '">'
-echo '</head><body><pre>'
-echo "$content" 
-echo '</pre></body></html>') \
-| iconv -f $encoding -t UTF-8 -c -s 
+<xsl:template match="ab:metadata">
+    <xsl:for-each select="ab:m">
+      <xsl:choose>
+        <xsl:when test="@key = 'dc.creator'">
+	  <meta>
+	    <xsl:attribute name="name">author</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'abiword.keywords'">
+	  <meta>
+	    <xsl:attribute name="name">keywords</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'dc.subject'">
+	  <meta>
+	    <xsl:attribute name="name">keywords</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'dc.description'">
+	  <meta>
+	    <xsl:attribute name="name">abstract</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'dc.title'">
+	  <title><xsl:value-of select="."/></title><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:otherwise>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:for-each>
+</xsl:template>
 
+</xsl:stylesheet>
+EOF
 
 # exit normally
 exit 0
diff --git a/src/filters/rclkwd b/src/filters/rclkwd
index a416af72..671c9db9 100755
--- a/src/filters/rclkwd
+++ b/src/filters/rclkwd
@@ -86,7 +86,7 @@ umask 77
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE
 
-checkcmds awk unzip gunzip tar
+checkcmds unzip gunzip tar xsltproc
 
 # We need a temporary directory
 if test z"$RECOLL_TMPDIR" != z; then
@@ -115,7 +115,6 @@ if file $infile | grep -qi gzip ; then
    # Unzip the input file and change to the unzipped directory
    gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -)
 else
-    echo new kwd
    # Unzip the input file and change to the unzipped directory
    unzip -q -d $tmpdir/rclkwdtmp "$infile"
 fi
@@ -124,74 +123,98 @@ cd $tmpdir/rclkwdtmp
 metafile=documentinfo.xml
 contentfile=maindoc.xml
 
-if test -f $metafile ; then
+echo '<html><head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
 
-  # Note: there can be newlines inside the description field, we don't want
-  # them...
-  abssedprog='/<abstract>/,/<\/abstract>/{
-s!.*<abstract>!!
-s!</abstract>.*!!
-p
-}
-'
-  abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \
-	sed -e '1s/<!\[CDATA\[//' -e 's/\]\]>//'`
-  subject=`sed -e "s/\"/'/" -e 's/.*<subject>\([^<]*\).*/\1/p;d' \
-	     < $metafile`
-  title=`sed -e "s/\"/'/" -e 's/.*<title>\([^<]*\).*/\1/p;d' \
-	     < $metafile | tr '\n' ' '`
-  keywords=`sed -e "s/\"/'/" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \
-	      < $metafile`
+if test -f $metafile ; then
+  xsltproc --novalid - $metafile <<EOF
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:kw="http://www.koffice.org/DTD/document-info"
+  exclude-result-prefixes="kw"
+  >
+
+<xsl:output method="html" encoding="UTF-8"/>
+
+<xsl:template match="/">
+  <xsl:apply-templates select="kw:document-info|document-info"/>
+</xsl:template>
+<xsl:template match="/kw:document-info|/document-info">
+  <xsl:apply-templates select="kw:author|author"/>
+  <xsl:apply-templates select="kw:about/kw:abstract|about/abstract"/>
+  <xsl:apply-templates select="kw:about/kw:title|about/title"/>
+  <xsl:apply-templates select="kw:about/kw:keyword|about/keyword"/>
+  <xsl:apply-templates select="kw:about/kw:subject|about/subject"/>
+</xsl:template>
+
+<xsl:template match="kw:author|author">
+  <meta>
+    <xsl:attribute name="name">author</xsl:attribute>
+    <xsl:attribute name="content">
+    <xsl:value-of select="kw:full-name|full-name"/>
+    </xsl:attribute>
+         </meta><xsl:text>
+    </xsl:text>
+</xsl:template>
+
+<xsl:template match="kw:abstract|abstract">
+  <meta>
+    <xsl:attribute name="name">abstract</xsl:attribute>
+    <xsl:attribute name="content">
+    <xsl:value-of select="."/>
+    </xsl:attribute>
+         </meta><xsl:text>
+    </xsl:text>
+</xsl:template>
+
+<xsl:template match="kw:keyword|kw:subject|keyword|subject">
+  <meta>
+    <xsl:attribute name="name">keywords</xsl:attribute>
+    <xsl:attribute name="content">
+    <xsl:value-of select="."/>
+    </xsl:attribute>
+         </meta><xsl:text>
+    </xsl:text>
+</xsl:template>
+
+<xsl:template match="kw:title|title">
+  <title><xsl:value-of select="."/></title><xsl:text>
+    </xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
+
+EOF
 fi
 
-# Note: next expr inserts a newline at each end of paragraph (for preview)
-content="`sed -e 's!</TEXT>!\\
-!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ 	]*$/d'`"
+echo '</head><body>'
+xsltproc --novalid - $contentfile <<EOF
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:kw="http://www.koffice.org/DTD/kword"
+  exclude-result-prefixes="kw"
+  >
 
-#echo abstract "$abstract"
-#echo subject "$subject"
-#echo title "$title"
-#echo keywords "$keywords"
-#echo content "$content"
+<xsl:output method="html" encoding="UTF-8"/>
 
-# output the result
-echo '<html><head>'
-echo '<title>' "$title" '</title>'
-echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
-echo '<meta name="abstract" content="' "$abstract $subject" '">'
-echo '<meta name="keywords" content="' "$keywords" '">'
-echo '</head><body><p>'
+<xsl:template match="/">
+  <xsl:apply-templates select="//kw:TEXT|//TEXT"/>
+</xsl:template>
 
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
-awk 'BEGIN'\
-' {
-  cont = ""
-}
-{
-    $0 = cont $0
-    cont = ""
+<xsl:template match="kw:TEXT|TEXT">
+  <xsl:if test="normalize-space(.) != ''">
+    <p><xsl:value-of select="."/></p><xsl:text>
+    </xsl:text>
+  </xsl:if>
+</xsl:template>
 
-    if ($0 ~ /[�-]$/) {
-      # Note : soft-hyphen is iso8859 0xad
-      # Break at last whitespace
-      match($0, "[ \t][^ \t]+$")
-      line = substr($0, 0, RSTART)
-      cont = substr($0, RSTART, RLENGTH-1)
-      $0 = line
-    }
+</xsl:stylesheet>
 
-    if($0 == "\f") {
-        print "</p>\n<hr>\n<p>"
-        next
-    } 
+EOF
 
-    print $0 "<br>"
-}
-END {
-    printf("</p></body></html>\n");
-}' | iconv -f UTF-8 -t UTF-8 -c -s 
+echo '</body></html>'
 
 cd /
 # exit normally
diff --git a/src/filters/rclscribus b/src/filters/rclscribus
index 9be6c19a..871f0003 100755
--- a/src/filters/rclscribus
+++ b/src/filters/rclscribus
@@ -13,6 +13,9 @@
 # We just hack into the scribus XML, taking advantage that the tag of
 # interest is apparently always output on a single line.
 # The text seems to be found in attribute CH of tag ITEXT, it is utf-8
+#
+# Tried to convert this to xsltproc but it seems that quite a few
+# Scribus documents are not actually proper XML
 
 # set variables
 LANG=C ; export LANG
@@ -22,8 +25,6 @@ filetype=Scribus
 
 
 
-
-
 #RECFILTCOMMONCODE
 ##############################################################################
 # !! Leave the previous line unmodified!! Code imported from the
diff --git a/src/filters/rclsoff b/src/filters/rclsoff
index 626c44d9..1bb150af 100755
--- a/src/filters/rclsoff
+++ b/src/filters/rclsoff
@@ -18,10 +18,6 @@ progname="rclsoff"
 filetype=openoffice
 
 
-
-
-
-
 #RECFILTCOMMONCODE
 ##############################################################################
 # !! Leave the previous line unmodified!! Code imported from the
@@ -88,7 +84,7 @@ umask 77
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE
 
-checkcmds awk iconv unzip
+checkcmds xsltproc unzip
 
 # We need a temporary directory
 if test z"$RECOLL_TMPDIR" != z; then
@@ -116,92 +112,98 @@ trap cleanup EXIT HUP QUIT INT TERM
 unzip -q -d $tmpdir/rclsofftmp "$infile"
 cd $tmpdir/rclsofftmp
 
-# Note: there can be newlines inside the description field, we don't want
-# them...
-descsedprog='/<dc:description>/,/<\/dc:description>/{
-s!.*<dc:description>!!
-s!</dc:description>.*!!
-H
-${
-g
-s/\n/ /g
-p
-}
-}
-'
-description=`sed -n -e "$descsedprog" meta.xml`
-#echo description "$description"
+echo '<html><head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
 
-# Takes tag name as parameter and creates sed program to extract single
-# line meta tags values.
-setmetasedprog() {
-metasedprog="s/\"/'/g"'
-/.*<'"$1"'>\([^<]*\).*/s//\1/p
-'
-}
+xsltproc - meta.xml <<EOF
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" 
+  xmlns:xlink="http://www.w3.org/1999/xlink" 
+  xmlns:dc="http://purl.org/dc/elements/1.1/" 
+  xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" 
+  xmlns:ooo="http://openoffice.org/2004/office"
+  exclude-result-prefixes="office xlink meta ooo dc"
+  >
 
-setmetasedprog dc:subject
-subject=`sed -n -e "$metasedprog" meta.xml`
-#echo subject: $subject
+<xsl:output method="html" encoding="UTF-8"/>
 
-setmetasedprog dc:title
-title=`sed -n -e "$metasedprog" meta.xml`
-#echo title: $title
+<xsl:template match="/office:document-meta">
+  <xsl:apply-templates select="office:meta/dc:description"/>
+  <xsl:apply-templates select="office:meta/dc:subject"/>
+  <xsl:apply-templates select="office:meta/dc:title"/>
+  <xsl:apply-templates select="office:meta/meta:keyword"/>
+  <xsl:apply-templates select="office:meta/dc:creator"/>
+</xsl:template>
 
-setmetasedprog meta:keyword
-keywords=`sed -n -e "$metasedprog" meta.xml`
-#echo keywords: $keywords
+<xsl:template match="dc:title">
+<title> <xsl:value-of select="."/> </title><xsl:text>
+</xsl:text>
+</xsl:template>
 
-setmetasedprog dc:creator
-creator=`sed -n -e "$metasedprog" meta.xml`
-#echo creator: $creator
+<xsl:template match="dc:description">
+  <meta>
+  <xsl:attribute name="name">abstract</xsl:attribute>
+  <xsl:attribute name="content">
+     <xsl:value-of select="."/>
+  </xsl:attribute>
+  </meta><xsl:text>
+</xsl:text>
+</xsl:template>
 
-# Note: next expr inserts a newline at each end of paragraph (for preview)
-content="`sed -e 's!</text:p>!\\
-!g' -e 's/<[^>]*>/ /g' < content.xml`"
+<xsl:template match="dc:subject">
+  <meta>
+  <xsl:attribute name="name">keywords</xsl:attribute>
+  <xsl:attribute name="content">
+     <xsl:value-of select="."/>
+  </xsl:attribute>
+  </meta><xsl:text>
+</xsl:text>
+</xsl:template>
 
-#echo content "$content"
+<xsl:template match="dc:creator">
+  <meta>
+  <xsl:attribute name="name">author</xsl:attribute>
+  <xsl:attribute name="content">
+     <xsl:value-of select="."/>
+  </xsl:attribute>
+  </meta><xsl:text>
+</xsl:text>
+</xsl:template>
 
-# output the result
-echo '<html><head>'
-echo '<title>' "$title" '</title>'
-echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
-echo '<meta name="description" content="' "$description $subject" '">'
-echo '<meta name="keywords" content="' "$keywords" '">'
-echo '<meta name="author" content="' "$creator" '">'
-echo '</head><body><p>'
+<xsl:template match="meta:keyword">
+  <meta>
+  <xsl:attribute name="name">keywords</xsl:attribute>
+  <xsl:attribute name="content">
+     <xsl:value-of select="."/>
+  </xsl:attribute>
+  </meta><xsl:text>
+</xsl:text>
+</xsl:template>
 
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
-awk 'BEGIN'\
-' {
-  cont = ""
-}
-{
-    $0 = cont $0
-    cont = ""
+</xsl:stylesheet>
+EOF
 
-    if ($0 ~ /[�-]$/) {
-      # Note : soft-hyphen is iso8859 0xad
-      # Break at last whitespace
-      match($0, "[ \t][^ \t]+$")
-      line = substr($0, 0, RSTART)
-      cont = substr($0, RSTART, RLENGTH-1)
-      $0 = line
-    }
+echo '</head><body>'
 
-    if($0 == "\f") {
-        print "</p>\n<hr>\n<p>"
-        next
-    } 
+xsltproc - content.xml <<EOF
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
+  exclude-result-prefixes="text"
+>
 
-    print $0 "<br>"
-}
-END {
-    printf("</p></body></html>\n");
-}' | iconv -f UTF-8 -t UTF-8 -c -s 
+<xsl:output method="html" encoding="UTF-8"/>
 
+<xsl:template match="text:p">
+  <p><xsl:value-of select="."/></p><xsl:text>
+  </xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
+EOF
+echo '</body></html>'
 cd /
-# exit normally
 exit 0
diff --git a/src/filters/rclsvg b/src/filters/rclsvg
index b2acbfba..82278f51 100755
--- a/src/filters/rclsvg
+++ b/src/filters/rclsvg
@@ -1,10 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclsvg,v 1.3 2008-10-08 08:27:34 dockes Exp $  (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
+
 #================================================================
 # Extract text from a Scalable Vector Graphics file
 #================================================================
@@ -82,56 +77,51 @@ umask 77
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE
 
-checkcmds iconv sed
+checkcmds xsltproc
 
-encoding=`sed -ne '/<?xml/s/.*encoding="\([^"]*\).*/\1/p' < $infile`
+xsltproc - "$infile" <<'EOF'
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:svg="http://www.w3.org/2000/svg"
+  exclude-result-prefixes="svg"
+  >
 
-if test X$encoding = X ; then encoding=UTF-8;fi
+<xsl:output method="html" encoding="UTF-8"/>
 
-# We use several sed instances to make our life easier. Not good for
-# performance, and a sed guru might be able to do better.
-#
-# The first sed makes sure each tag starts on a new line
-# The second one selects the tags we're interested in.
-# The last strips the tags, leaving only text.
-#
-# The whole thing wholly ignore issues like '<' inside quoted strings.
-#
-# We could/should add code to explicitly separate title and other
-# metadata elements.
+<xsl:template match="/">
+  <html>
+  <head>
+  <xsl:apply-templates select="svg:svg/svg:title"/>
+  <xsl:apply-templates select="svg:svg/svg:desc"/>
+  </head>
+  <body>
+  <xsl:apply-templates select="//svg:text"/>
+  </body>
+  </html>
+</xsl:template>
 
-# Insert new line before each tag
-sptagonline='s/</\
-</g'
+<xsl:template match="svg:desc"> 
+  <meta>
+  <xsl:attribute name="name">keywords</xsl:attribute>
+  <xsl:attribute name="content">
+     <xsl:value-of select="."/>
+  </xsl:attribute>
+  </meta><xsl:text>
+</xsl:text>
+</xsl:template>
 
-# Select tags
-spselecttags='/<title/,/<\/title>/p
-/<desc/,/<\/desc>/p
-/<metadata/,/<\/metadata>/p
-/<text/,/<\/text>/p'
+<xsl:template match="svg:title"> 
+  <title><xsl:value-of select="."/></title><xsl:text>
+  </xsl:text>
+</xsl:template>
+	    
+<xsl:template match="svg:text"> 
+  <p><xsl:value-of select="."/></p><xsl:text>
+  </xsl:text>
+</xsl:template>
 
-# Strip tags
-spstriptags='#n
-/</{
-    :c
-     />/!{
-	N
-	b c
-     }
-     />/s/<.*>//g
-}
-/^[ 	]*$/!p'
+</xsl:stylesheet>
+EOF
 
-content=`sed -e "$sptagonline" < $infile | sed -ne "$spselecttags" | \
-    sed -ne "$spstriptags"`
-
-(echo '<html><head>'
-echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
-echo '</head><body><pre>'
-echo "$content" 
-echo '</pre></body></html>') \
-| iconv -f $encoding -t UTF-8 -c -s 
-
-
-# exit normally
 exit 0