converted to xslt

This commit is contained in:
Jean-Francois Dockes 2010-03-22 18:24:55 +01:00
parent 6fd41e77a5
commit 17393bad47
5 changed files with 292 additions and 266 deletions

View File

@ -82,88 +82,98 @@ umask 77
# !! Leave the following line unmodified ! # !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE #ENDRECFILTCOMMONCODE
checkcmds iconv sed checkcmds xsltproc
encoding=`sed -e '/<?xml version=/s/"?>$//' \ xsltproc - $infile <<EOF
-e '/^<?xml version=/s/.*encoding="//p;D;q' \ <?xml version="1.0"?>
-e D \ <xsl:stylesheet version="1.0"
< $infile` xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
if test X$encoding = X ; then encoding=UTF-8;fi xmlns:ab="http://www.abisource.com/awml.dtd"
exclude-result-prefixes="ab"
>
# Note: there can be newlines inside the description field, we don't want <xsl:output method="html" encoding="UTF-8"/>
# them... We have to use two different selectors for the single-line and
# multiple-line cases because of the generic tag end (</m> for all meta
# tags)
descsedprog='
/<m key="dc.description">\([^<]*\)<\/m>/ {
s//\1/
p
q
}
/<m key="dc.description">/,/<\/m>/ {
s!.*<m key="dc.description">!!
s!</m>.*!!
H
}
${
g
s/\n/ /g
p
}
'
description=`sed -n -e "$descsedprog" < "$infile"` <xsl:template match="/">
#echo description: "$description" <html>
<head>
<xsl:apply-templates select="ab:abiword/ab:metadata"/>
</head>
<body>
# Set program for the single line meta elements. Takes element name as <!-- This is for the older abiword format with no namespaces -->
# parameter <xsl:for-each select="abiword/section">
setmetasedprog() { <xsl:apply-templates select="p"/>
metasedprog='/<m key="'$1'">/{ </xsl:for-each>
s/.*<m key="'$1'">\([^<]*\).*/\1/
'"s/\"/'/g"'
p
}'
}
setmetasedprog dc.subject <!-- Newer namespaced format -->
subject=`sed -n -e "$metasedprog" "$infile"` <xsl:for-each select="ab:abiword/ab:section">
#echo subject: "$subject" <xsl:for-each select="ab:p">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:for-each>
</xsl:for-each>
setmetasedprog dc.title </body>
title=`sed -n -e "$metasedprog" "$infile"` </html>
#echo titre: "$title" </xsl:template>
setmetasedprog abiword.keywords <xsl:template match="p">
keywords=`sed -n -e "$metasedprog" "$infile"` <p><xsl:value-of select="."/></p><xsl:text>
#echo keywords: "$keywords" </xsl:text>
</xsl:template>
setmetasedprog dc.creator <xsl:template match="ab:metadata">
creator=`sed -n -e "$metasedprog" "$infile"` <xsl:for-each select="ab:m">
#echo creator: "$creator" <xsl:choose>
<xsl:when test="@key = 'dc.creator'">
# Note: next expr supposes that paragraphs are always all by themselves on <meta>
# a single line in the xml (no multiple <p> per line, no embedded newlines <xsl:attribute name="name">author</xsl:attribute>
# in text). <xsl:attribute name="content">
contentsedprog=' <xsl:value-of select="."/>
/<p[ >]/{ </xsl:attribute>
s/<[^>]*>/ /g </meta><xsl:text>
p </xsl:text>
} </xsl:when>
' <xsl:when test="@key = 'abiword.keywords'">
content=`sed -n -e "$contentsedprog" "$infile"` <meta>
#echo content: "$content" <xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
# output the result <xsl:value-of select="."/>
(echo '<html><head><title>' "$title" '</title>' </xsl:attribute>
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">' </meta><xsl:text>
echo '<meta name="description" content="' "$description $subject" '">' </xsl:text>
echo '<meta name="keywords" content="' "$keywords" '">' </xsl:when>
echo '<meta name="author" content="' "$creator" '">' <xsl:when test="@key = 'dc.subject'">
echo '</head><body><pre>' <meta>
echo "$content" <xsl:attribute name="name">keywords</xsl:attribute>
echo '</pre></body></html>') \ <xsl:attribute name="content">
| iconv -f $encoding -t UTF-8 -c -s <xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:when>
<xsl:when test="@key = 'dc.description'">
<meta>
<xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:when>
<xsl:when test="@key = 'dc.title'">
<title><xsl:value-of select="."/></title><xsl:text>
</xsl:text>
</xsl:when>
<xsl:otherwise>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
EOF
# exit normally # exit normally
exit 0 exit 0

View File

@ -86,7 +86,7 @@ umask 77
# !! Leave the following line unmodified ! # !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE #ENDRECFILTCOMMONCODE
checkcmds awk unzip gunzip tar checkcmds unzip gunzip tar xsltproc
# We need a temporary directory # We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then if test z"$RECOLL_TMPDIR" != z; then
@ -115,7 +115,6 @@ if file $infile | grep -qi gzip ; then
# Unzip the input file and change to the unzipped directory # Unzip the input file and change to the unzipped directory
gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -) gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -)
else else
echo new kwd
# Unzip the input file and change to the unzipped directory # Unzip the input file and change to the unzipped directory
unzip -q -d $tmpdir/rclkwdtmp "$infile" unzip -q -d $tmpdir/rclkwdtmp "$infile"
fi fi
@ -124,74 +123,98 @@ cd $tmpdir/rclkwdtmp
metafile=documentinfo.xml metafile=documentinfo.xml
contentfile=maindoc.xml contentfile=maindoc.xml
if test -f $metafile ; then echo '<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
# Note: there can be newlines inside the description field, we don't want if test -f $metafile ; then
# them... xsltproc --novalid - $metafile <<EOF
abssedprog='/<abstract>/,/<\/abstract>/{ <?xml version="1.0"?>
s!.*<abstract>!! <xsl:stylesheet version="1.0"
s!</abstract>.*!! xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
p xmlns:kw="http://www.koffice.org/DTD/document-info"
} exclude-result-prefixes="kw"
' >
abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \
sed -e '1s/<!\[CDATA\[//' -e 's/\]\]>//'` <xsl:output method="html" encoding="UTF-8"/>
subject=`sed -e "s/\"/'/" -e 's/.*<subject>\([^<]*\).*/\1/p;d' \
< $metafile` <xsl:template match="/">
title=`sed -e "s/\"/'/" -e 's/.*<title>\([^<]*\).*/\1/p;d' \ <xsl:apply-templates select="kw:document-info|document-info"/>
< $metafile | tr '\n' ' '` </xsl:template>
keywords=`sed -e "s/\"/'/" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \ <xsl:template match="/kw:document-info|/document-info">
< $metafile` <xsl:apply-templates select="kw:author|author"/>
<xsl:apply-templates select="kw:about/kw:abstract|abstract"/>
<xsl:apply-templates select="kw:about/kw:title|title"/>
<xsl:apply-templates select="kw:about/kw:keyword|keyword"/>
<xsl:apply-templates select="kw:about/kw:subject|subject"/>
</xsl:template>
<xsl:template match="kw:author|author">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="kw:full-name|full-name"/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="kw:abstract|abstract">
<meta>
<xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="kw:keyword|kw:subject|keyword|subject">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="kw:title|title">
<title><xsl:value-of select="."/></title><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
EOF
fi fi
# Note: next expr inserts a newline at each end of paragraph (for preview) echo '</head><body>'
content="`sed -e 's!</TEXT>!\\ xsltproc --novalid - $contentfile <<EOF
!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`" <?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:kw="http://www.koffice.org/DTD/kword"
exclude-result-prefixes="kw"
>
#echo abstract "$abstract" <xsl:output method="html" encoding="UTF-8"/>
#echo subject "$subject"
#echo title "$title"
#echo keywords "$keywords"
#echo content "$content"
# output the result <xsl:template match="/">
echo '<html><head>' <xsl:apply-templates select="//kw:TEXT|//TEXT"/>
echo '<title>' "$title" '</title>' </xsl:template>
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '<meta name="abstract" content="' "$abstract $subject" '">'
echo '<meta name="keywords" content="' "$keywords" '">'
echo '</head><body><p>'
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file <xsl:template match="kw:TEXT|TEXT">
# is an awk program <xsl:if test="normalize-space(.) != ''">
echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\ <p><xsl:value-of select="."/></p><xsl:text>
awk 'BEGIN'\ </xsl:text>
' { </xsl:if>
cont = "" </xsl:template>
}
{
$0 = cont $0
cont = ""
if ($0 ~ /[­-]$/) { </xsl:stylesheet>
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH-1)
$0 = line
}
if($0 == "\f") { EOF
print "</p>\n<hr>\n<p>"
next
}
print $0 "<br>" echo '</body></html>'
}
END {
printf("</p></body></html>\n");
}' | iconv -f UTF-8 -t UTF-8 -c -s
cd / cd /
# exit normally # exit normally

View File

@ -13,6 +13,9 @@
# We just hack into the scribus XML, taking advantage that the tag of # We just hack into the scribus XML, taking advantage that the tag of
# interest is apparently always output on a single line. # interest is apparently always output on a single line.
# The text seems to be found in attribute CH of tag ITEXT, it is utf-8 # The text seems to be found in attribute CH of tag ITEXT, it is utf-8
#
# Tried to convert this to xsltproc, but it seems that quite a few
# Scribus documents are not actually well-formed XML
# set variables # set variables
LANG=C ; export LANG LANG=C ; export LANG
@ -22,8 +25,6 @@ filetype=Scribus
#RECFILTCOMMONCODE #RECFILTCOMMONCODE
############################################################################## ##############################################################################
# !! Leave the previous line unmodified!! Code imported from the # !! Leave the previous line unmodified!! Code imported from the

View File

@ -18,10 +18,6 @@ progname="rclsoff"
filetype=openoffice filetype=openoffice
#RECFILTCOMMONCODE #RECFILTCOMMONCODE
############################################################################## ##############################################################################
# !! Leave the previous line unmodified!! Code imported from the # !! Leave the previous line unmodified!! Code imported from the
@ -88,7 +84,7 @@ umask 77
# !! Leave the following line unmodified ! # !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE #ENDRECFILTCOMMONCODE
checkcmds awk iconv unzip checkcmds xsltproc
# We need a temporary directory # We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then if test z"$RECOLL_TMPDIR" != z; then
@ -116,92 +112,98 @@ trap cleanup EXIT HUP QUIT INT TERM
unzip -q -d $tmpdir/rclsofftmp "$infile" unzip -q -d $tmpdir/rclsofftmp "$infile"
cd $tmpdir/rclsofftmp cd $tmpdir/rclsofftmp
# Note: there can be newlines inside the description field, we don't want echo '<html><head>
# them... <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
descsedprog='/<dc:description>/,/<\/dc:description>/{
s!.*<dc:description>!!
s!</dc:description>.*!!
H
${
g
s/\n/ /g
p
}
}
'
description=`sed -n -e "$descsedprog" meta.xml`
#echo description "$description"
# Takes tag name as parameter and creates sed program to extract single xsltproc - meta.xml <<EOF
# line meta tags values. <?xml version="1.0"?>
setmetasedprog() { <xsl:stylesheet version="1.0"
metasedprog="s/\"/'/g"' xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
/.*<'"$1"'>\([^<]*\).*/s//\1/p xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
' xmlns:xlink="http://www.w3.org/1999/xlink"
} xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
xmlns:ooo="http://openoffice.org/2004/office"
exclude-result-prefixes="office xlink meta ooo dc"
>
setmetasedprog dc:subject <xsl:output method="html" encoding="UTF-8"/>
subject=`sed -n -e "$metasedprog" meta.xml`
#echo subject: $subject
setmetasedprog dc:title <xsl:template match="/office:document-meta">
title=`sed -n -e "$metasedprog" meta.xml` <xsl:apply-templates select="office:meta/dc:description"/>
#echo title: $title <xsl:apply-templates select="office:meta/dc:subject"/>
<xsl:apply-templates select="office:meta/dc:title"/>
<xsl:apply-templates select="office:meta/meta:keyword"/>
<xsl:apply-templates select="office:meta/dc:creator"/>
</xsl:template>
setmetasedprog meta:keyword <xsl:template match="dc:title">
keywords=`sed -n -e "$metasedprog" meta.xml` <title> <xsl:value-of select="."/> </title><xsl:text>
#echo keywords: $keywords </xsl:text>
</xsl:template>
setmetasedprog dc:creator <xsl:template match="dc:description">
creator=`sed -n -e "$metasedprog" meta.xml` <meta>
#echo creator: $creator <xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
# Note: next expr inserts a newline at each end of paragraph (for preview) <xsl:template match="dc:subject">
content="`sed -e 's!</text:p>!\\ <meta>
!g' -e 's/<[^>]*>/ /g' < content.xml`" <xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
#echo content "$content" <xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
# output the result <xsl:template match="meta:keyword">
echo '<html><head>' <meta>
echo '<title>' "$title" '</title>' <xsl:attribute name="name">keywords</xsl:attribute>
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">' <xsl:attribute name="content">
echo '<meta name="description" content="' "$description $subject" '">' <xsl:value-of select="."/>
echo '<meta name="keywords" content="' "$keywords" '">' </xsl:attribute>
echo '<meta name="author" content="' "$creator" '">' </meta><xsl:text>
echo '</head><body><p>' </xsl:text>
</xsl:template>
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file </xsl:stylesheet>
# is an awk program EOF
echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
awk 'BEGIN'\
' {
cont = ""
}
{
$0 = cont $0
cont = ""
if ($0 ~ /[­-]$/) { echo '</head><body>'
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH-1)
$0 = line
}
if($0 == "\f") { xsltproc - content.xml <<EOF
print "</p>\n<hr>\n<p>" <?xml version="1.0"?>
next <xsl:stylesheet version="1.0"
} xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
exclude-result-prefixes="text"
>
print $0 "<br>" <xsl:output method="html" encoding="UTF-8"/>
}
END {
printf("</p></body></html>\n");
}' | iconv -f UTF-8 -t UTF-8 -c -s
<xsl:template match="text:p">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
EOF
echo '</body></html>'
cd / cd /
# exit normally
exit 0 exit 0

View File

@ -1,10 +1,5 @@
#!/bin/sh #!/bin/sh
# @(#$Id: rclsvg,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================ #================================================================
# Extract text from a Scalable Vector Graphics file # Extract text from a Scalable Vector Graphics file
#================================================================ #================================================================
@ -82,56 +77,51 @@ umask 77
# !! Leave the following line unmodified ! # !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE #ENDRECFILTCOMMONCODE
checkcmds iconv sed checkcmds xsltproc
encoding=`sed -ne '/<?xml/s/.*encoding="\([^"]*\).*/\1/p' < $infile` xsltproc - $infile <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:svg="http://www.w3.org/2000/svg"
exclude-result-prefixes="svg"
>
if test X$encoding = X ; then encoding=UTF-8;fi <xsl:output method="html" encoding="UTF-8"/>
# We use several sed instances to make our life easier. Not good for <xsl:template match="/">
# performance, and a sed guru might be able to do better. <html>
# <head>
# The first sed makes sure each tag starts on a new line <xsl:apply-templates select="svg:svg/svg:title"/>
# The second one selects the tags we're interested in. <xsl:apply-templates select="svg:svg/svg:desc"/>
# The last strips the tags, leaving only text. </head>
# <body>
# The whole thing wholly ignore issues like '<' inside quoted strings. <xsl:apply-templates select="//svg:text"/>
# </body>
# We could/should add code to explicitly separate title and other </html>
# metadata elements. </xsl:template>
# Insert new line before each tag <xsl:template match="svg:desc">
sptagonline='s/</\ <meta>
</g' <xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
# Select tags <xsl:template match="svg:title">
spselecttags='/<title/,/<\/title>/p <title><xsl:value-of select="."/></title><xsl:text>
/<desc/,/<\/desc>/p </xsl:text>
/<metadata/,/<\/metadata>/p </xsl:template>
/<text/,/<\/text>/p'
<xsl:template match="svg:text">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:template>
# Strip tags </xsl:stylesheet>
spstriptags='#n EOF
/</{
:c
/>/!{
N
b c
}
/>/s/<.*>//g
}
/^[ ]*$/!p'
content=`sed -e "$sptagonline" < $infile | sed -ne "$spselecttags" | \
sed -ne "$spstriptags"`
(echo '<html><head>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body><pre>'
echo "$content"
echo '</pre></body></html>') \
| iconv -f $encoding -t UTF-8 -c -s
# exit normally
exit 0 exit 0