From abaa134dd255d955b54dbbd3c7535958c7642e12 Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 1 Sep 2008 17:21:18 +0000 Subject: [PATCH] almost almost ok excepts outputs some formatting directives for ppt --- src/filters/rclopxml | 234 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100755 src/filters/rclopxml diff --git a/src/filters/rclopxml b/src/filters/rclopxml new file mode 100755 index 00000000..7b9c6923 --- /dev/null +++ b/src/filters/rclopxml @@ -0,0 +1,234 @@ +#!/bin/sh +# @(#$Id: rclopxml,v 1.1 2008-09-01 17:21:18 dockes Exp $ (C) 2004 J.F.Dockes +#================================================================ +# rcldocx +# Extract text from an openxml msword file (will be extended for spreadsheets) +# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml +#================================================================ + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname=rclopxml +filetype=openxml + +#RECFILTCOMMONCODE +############################################################################## +# !! Leave the previous line unmodified!! Code imported from the +# recfiltcommon file + +# Utility code common to all shell filters. This could be sourced at run +# time, but it's slightly more efficient to include the code in the +# filters at build time (with a sed script). + +# Describe error in a way that can be interpreted by our caller +senderror() +{ + echo RECFILTERROR $* + # Also alert on stderr just in case + echo ":2:$progname::: $*" 1>&2 + exit 1 +} + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + senderror HELPERNOTFOUND $cmd + fi + done +} + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + echo "Convert a $filetype file to HTML text for Recoll indexing." + echo "Usage: $progname [infile]" + exit 1 +fi + +infile="$1" + +# check the input file existence (may be '-' for stdin) +if test "X$infile" != X- -a ! -f "$infile" +then + senderror INPUTNOSUCHFILE "$infile" +fi + +# protect access to our temp files and directories +umask 77 + +############################################################################## +# !! Leave the following line unmodified ! +#ENDRECFILTCOMMONCODE + +checkcmds xsltproc unzip + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# We need a temporary directory +if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi +tmpdir=$ttdir/rclopxml_tmp$$ +mkdir $tmpdir || exit 1 +mkdir $tmpdir/rclopxmltmp || exit 1 + +cleanup() +{ + # Note that we're using a constant part (rclopxmltmp), that hopefully + # guarantees that we can't do big mistakes here. + rm -rf $tmpdir/rclopxmltmp + rmdir $tmpdir +} + +trap cleanup EXIT HUP QUIT INT TERM + +# Unzip the input file and change to the unzipped directory +unzip -q -d $tmpdir/rclopxmltmp "$infile" +cd $tmpdir/rclopxmltmp + +echo ' +' + +xsltproc - docProps/core.xml < + + + + + + + + + + + + + + + + + author + + + + + + + + + + + + date + + + + + + + + + + + + +EOF + +echo ' +' + +filename='' +if test -f word/document.xml ; then + filenames=word/document.xml + tagmatch="w:p" + xmlns_decls=' + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" + xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" + xmlns:o="urn:schemas-microsoft-com:office:office" + xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" + xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" + xmlns:v="urn:schemas-microsoft-com:vml" + xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" + xmlns:w10="urn:schemas-microsoft-com:office:word" + xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" + xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"' +elif test -f xl/sharedStrings.xml ; then + filenames=xl/sharedStrings.xml + tagmatch='x:t' + xmlns_decls=' + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" + xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main" + ' +elif test -f ppt/slides/slide1.xml ; then + filenames=`echo ppt/slides/slide*.xml` + tagmatch='a:t' + xmlns_decls=' + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" + xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" + xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" + xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" +' +else + exit 1 +fi +for filename in $filenames;do +xsltproc - $filename < + + + + + +
+ +
+
+ + +

+ +

+
+ + + +
+EOF +done + +echo ''