From 17393bad473c27476a8aa91171dd5652b4f48ba3 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Mon, 22 Mar 2010 18:24:55 +0100
Subject: [PATCH] converted to xslt
---
src/filters/rclabw | 156 +++++++++++++++++++++-------------------
src/filters/rclkwd | 145 +++++++++++++++++++++----------------
src/filters/rclscribus | 5 +-
src/filters/rclsoff | 160 +++++++++++++++++++++--------------------
src/filters/rclsvg | 92 +++++++++++-------------
5 files changed, 292 insertions(+), 266 deletions(-)
diff --git a/src/filters/rclabw b/src/filters/rclabw
index 48941484..ffb88658 100755
--- a/src/filters/rclabw
+++ b/src/filters/rclabw
@@ -82,88 +82,98 @@ umask 77
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
-checkcmds iconv sed
+checkcmds xsltproc
-encoding=`sed -e '/$//' \
- -e '/^
+
-# Note: there can be newlines inside the description field, we don't want
-# them... Have 2 use 2 different selectors for the single-line and
-# multiple-line cases because of the generic tag end ( for all meta
-# tags)
-descsedprog='
-/\([^<]*\)<\/m>/ {
-s//\1/
-p
-q
-}
-//,/<\/m>/ {
-s!.*!!
-s!.*!!
-H
-}
-${
-g
-s/\n/ /g
-p
-}
-'
+
-description=`sed -n -e "$descsedprog" < "$infile"`
-#echo description: "$description"
+
+
+
+
+
+
-# Set program for the single line meta elements. Takes element name as
-# parameter
-setmetasedprog() {
-metasedprog='//{
-s/.*\([^<]*\).*/\1/
-'"s/\"/'/g"'
-p
-}'
-}
+
+
+
+
-setmetasedprog dc.subject
-subject=`sed -n -e "$metasedprog" "$infile"`
-#echo subject: "$subject"
+
+
+
+
+
+
+
-setmetasedprog dc.title
-title=`sed -n -e "$metasedprog" "$infile"`
-#echo titre: "$title"
+
+
+
-setmetasedprog abiword.keywords
-keywords=`sed -n -e "$metasedprog" "$infile"`
-#echo keywords: "$keywords"
+
+
+
+
-setmetasedprog dc.creator
-creator=`sed -n -e "$metasedprog" "$infile"`
-#echo creator: "$creator"
-
-# Note: next expr supposes that paragraphs are always all by themselves on
-# a single line in the xml (no multiple per line, no embedded newlines
-# in text).
-contentsedprog='
-/
]/{
-s/<[^>]*>/ /g
-p
-}
-'
-content=`sed -n -e "$contentsedprog" "$infile"`
-#echo content: "$content"
-
-# output the result
-(echo '
' "$title" ''
-echo ''
-echo ''
-echo ''
-echo ''
-echo ''
-echo "$content"
-echo '
') \
-| iconv -f $encoding -t UTF-8 -c -s
+
+
+
+
+
+ author
+
+
+
+
+
+
+
+
+ keywords
+
+
+
+
+
+
+
+
+ keywords
+
+
+
+
+
+
+
+
+ abstract
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+EOF
# exit normally
exit 0
diff --git a/src/filters/rclkwd b/src/filters/rclkwd
index a416af72..671c9db9 100755
--- a/src/filters/rclkwd
+++ b/src/filters/rclkwd
@@ -86,7 +86,7 @@ umask 77
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
-checkcmds awk unzip gunzip tar
+checkcmds unzip gunzip tar xsltproc
# We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then
@@ -115,7 +115,6 @@ if file $infile | grep -qi gzip ; then
# Unzip the input file and change to the unzipped directory
gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -)
else
- echo new kwd
# Unzip the input file and change to the unzipped directory
unzip -q -d $tmpdir/rclkwdtmp "$infile"
fi
@@ -124,74 +123,98 @@ cd $tmpdir/rclkwdtmp
metafile=documentinfo.xml
contentfile=maindoc.xml
-if test -f $metafile ; then
+echo '
+'
- # Note: there can be newlines inside the description field, we don't want
- # them...
- abssedprog='//,/<\/abstract>/{
-s!.*!!
-s!.*!!
-p
-}
-'
- abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \
- sed -e '1s///'`
- subject=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \
- < $metafile`
- title=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \
- < $metafile | tr '\n' ' '`
- keywords=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \
- < $metafile`
+if test -f $metafile ; then
+ xsltproc --novalid - $metafile <
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ author
+
+
+
+
+
+
+
+
+
+ abstract
+
+
+
+
+
+
+
+
+
+ keywords
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+EOF
fi
-# Note: next expr inserts a newline at each end of paragraph (for preview)
-content="`sed -e 's!!\\
-!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`"
+echo ''
+xsltproc --novalid - $contentfile <
+
-#echo abstract "$abstract"
-#echo subject "$subject"
-#echo title "$title"
-#echo keywords "$keywords"
-#echo content "$content"
+
-# output the result
-echo ''
-echo '' "$title" ''
-echo ''
-echo ''
-echo ''
-echo ''
+
+
+
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\
-awk 'BEGIN'\
-' {
- cont = ""
-}
-{
- $0 = cont $0
- cont = ""
+
+
+
+
+
+
- if ($0 ~ /[-]$/) {
- # Note : soft-hyphen is iso8859 0xad
- # Break at last whitespace
- match($0, "[ \t][^ \t]+$")
- line = substr($0, 0, RSTART)
- cont = substr($0, RSTART, RLENGTH-1)
- $0 = line
- }
+
- if($0 == "\f") {
- print "
\n
\n"
- next
- }
+EOF
- print $0 "
"
-}
-END {
- printf("
\n");
-}' | iconv -f UTF-8 -t UTF-8 -c -s
+echo ''
cd /
# exit normally
diff --git a/src/filters/rclscribus b/src/filters/rclscribus
index 9be6c19a..871f0003 100755
--- a/src/filters/rclscribus
+++ b/src/filters/rclscribus
@@ -13,6 +13,9 @@
# We just hack into the scribus XML, taking advantage that the tag of
# interest is apparently always output on a single line.
# The text seems to be found in attribute CH of tag ITEXT, it is utf-8
+#
+# Tried to convert this to xsltproc, but it seems that quite a few
+# Scribus documents are not actually well-formed XML
# set variables
LANG=C ; export LANG
@@ -22,8 +25,6 @@ filetype=Scribus
-
-
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
diff --git a/src/filters/rclsoff b/src/filters/rclsoff
index 626c44d9..1bb150af 100755
--- a/src/filters/rclsoff
+++ b/src/filters/rclsoff
@@ -18,10 +18,6 @@ progname="rclsoff"
filetype=openoffice
-
-
-
-
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
@@ -88,7 +84,7 @@ umask 77
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
-checkcmds awk iconv unzip
+checkcmds xsltproc
# We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then
@@ -116,92 +112,98 @@ trap cleanup EXIT HUP QUIT INT TERM
unzip -q -d $tmpdir/rclsofftmp "$infile"
cd $tmpdir/rclsofftmp
-# Note: there can be newlines inside the description field, we don't want
-# them...
-descsedprog='//,/<\/dc:description>/{
-s!.*!!
-s!.*!!
-H
-${
-g
-s/\n/ /g
-p
-}
-}
-'
-description=`sed -n -e "$descsedprog" meta.xml`
-#echo description "$description"
+echo '
+'
-# Takes tag name as parameter and creates sed program to extract single
-# line meta tags values.
-setmetasedprog() {
-metasedprog="s/\"/'/g"'
-/.*<'"$1"'>\([^<]*\).*/s//\1/p
-'
-}
+xsltproc - meta.xml <
+
-setmetasedprog dc:subject
-subject=`sed -n -e "$metasedprog" meta.xml`
-#echo subject: $subject
+
-setmetasedprog dc:title
-title=`sed -n -e "$metasedprog" meta.xml`
-#echo title: $title
+
+
+
+
+
+
+
-setmetasedprog meta:keyword
-keywords=`sed -n -e "$metasedprog" meta.xml`
-#echo keywords: $keywords
+
+
+
+
-setmetasedprog dc:creator
-creator=`sed -n -e "$metasedprog" meta.xml`
-#echo creator: $creator
+
+
+ abstract
+
+
+
+
+
+
-# Note: next expr inserts a newline at each end of paragraph (for preview)
-content="`sed -e 's!!\\
-!g' -e 's/<[^>]*>/ /g' < content.xml`"
+
+
+ keywords
+
+
+
+
+
+
-#echo content "$content"
+
+
+ author
+
+
+
+
+
+
-# output the result
-echo ''
-echo '' "$title" ''
-echo ''
-echo ''
-echo ''
-echo ''
-echo ''
+
+
+ keywords
+
+
+
+
+
+
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\
-awk 'BEGIN'\
-' {
- cont = ""
-}
-{
- $0 = cont $0
- cont = ""
+
+EOF
- if ($0 ~ /[-]$/) {
- # Note : soft-hyphen is iso8859 0xad
- # Break at last whitespace
- match($0, "[ \t][^ \t]+$")
- line = substr($0, 0, RSTART)
- cont = substr($0, RSTART, RLENGTH-1)
- $0 = line
- }
+echo ''
- if($0 == "\f") {
- print "\n
\n"
- next
- }
+xsltproc - content.xml <
+
- print $0 "
"
-}
-END {
- printf("
\n");
-}' | iconv -f UTF-8 -t UTF-8 -c -s
+
+
+
+
+
+
+
+EOF
+echo ''
cd /
-# exit normally
exit 0
diff --git a/src/filters/rclsvg b/src/filters/rclsvg
index b2acbfba..82278f51 100755
--- a/src/filters/rclsvg
+++ b/src/filters/rclsvg
@@ -1,10 +1,5 @@
#!/bin/sh
-# @(#$Id: rclsvg,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
+
#================================================================
# Extract text from a Scalable Vector Graphics file
#================================================================
@@ -82,56 +77,51 @@ umask 77
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
-checkcmds iconv sed
+checkcmds xsltproc
-encoding=`sed -ne '/
+
-if test X$encoding = X ; then encoding=UTF-8;fi
+
-# We use several sed instances to make our life easier. Not good for
-# performance, and a sed guru might be able to do better.
-#
-# The first sed makes sure each tag starts on a new line
-# The second one selects the tags we're interested in.
-# The last strips the tags, leaving only text.
-#
-# The whole thing wholly ignore issues like '<' inside quoted strings.
-#
-# We could/should add code to explicitly separate title and other
-# metadata elements.
+
+
+
+
+
+
+
+
+
+
+
-# Insert new line before each tag
-sptagonline='s/\
-
+
+ keywords
+
+
+
+
+
+
-# Select tags
-spselecttags='//p
-//p
-//p
-//p'
+
+
+
+
+
+
+
+
+
-# Strip tags
-spstriptags='#n
-/{
- :c
- />/!{
- N
- b c
- }
- />/s/<.*>//g
-}
-/^[ ]*$/!p'
+
+EOF
-content=`sed -e "$sptagonline" < $infile | sed -ne "$spselecttags" | \
- sed -ne "$spstriptags"`
-
-(echo ''
-echo ''
-echo ''
-echo "$content"
-echo '
') \
-| iconv -f $encoding -t UTF-8 -c -s
-
-
-# exit normally
exit 0