new gnumeric and okular notes filters

This commit is contained in:
Jean-Francois Dockes 2012-01-23 20:25:55 +01:00
parent 3c65886366
commit 17542969a5
10 changed files with 412 additions and 24 deletions

193
src/filters/rclgnm Executable file
View File

@ -0,0 +1,193 @@
#!/bin/sh
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# Extract text from a gnumeric spreadsheet
#================================================================
# set variables
# Run the filter and its helpers in the C locale
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
# Name shown in the usage message and in the stderr diagnostics. It is kept
# identical to the name the script is installed and invoked under (rclgnm).
progname="rclgnm"
# Input type name, only used in the usage message
filetype=gnumeric
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the error description
#            (e.g. HELPERNOTFOUND xsltproc)
# Outputs:   "RECFILTERROR <description>" on stdout, and a copy tagged with
#            $progname on stderr.
# Never returns: exits with status 1.
senderror()
{
    # "$*" is quoted so that a file name embedded in the message is printed
    # verbatim instead of being glob-expanded or having its blanks collapsed.
    # printf is used because some echo implementations interpret backslashes.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
# Check that a command is available.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if $1 designates an executable file which is not a directory
#            (searched for in the directories of $PATH when $1 has no '/'),
#            1 otherwise.
# Note: sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split PATH on ':' only: the function's positional parameters
        # then hold one directory each.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Iterate over "$@" so that a directory name containing blanks is
        # not split a second time now that IFS has been restored.
        for d in "$@"; do
            test -x "$d/$cmd" && test ! -d "$d/$cmd" && return 0
        done
        return 1 ;;
    esac
}
# Verify that all the helper commands we need are available.
# Arguments: the names of the commands to check
# Calls "senderror HELPERNOTFOUND <name>" for the first command that iscmd
# does not find.
checkcmds()
{
    # "$@" and "$cmd" are quoted so that each name is checked and reported
    # exactly as it was given (no globbing, no splitting).
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
# show help message
# Printed unless we got exactly one argument which is not --help. Note that
# asking for help also exits with status 1.
# The conditions are combined with the shell's || and && rather than with
# test's obsolescent -o / -a operators, which are ambiguous when an operand
# looks like a test operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# Both helpers are needed further down
checkcmds xsltproc gunzip

# Directory for our scratch file: RECOLL_TMPDIR has priority over TMPDIR,
# and /tmp is used when both are unset or empty.
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}

# Have mktemp create a uniquely named file from the template
tmpfile=`mktemp "$ttdir/rclgnm.XXXXXX"` || \
    senderror "$0: Can't create temp file, exiting..."
# Remove our temporary file (whose name is held in $tmpfile).
cleanup()
{
    # The name is quoted because it derives from RECOLL_TMPDIR / TMPDIR,
    # which may contain blanks: unquoted, rm would be handed the pieces and
    # the file would be left behind. "--" protects against a leading '-'.
    rm -f -- "$tmpfile"
}
# Run cleanup when the script exits or receives one of these signals
trap cleanup EXIT HUP QUIT INT TERM

# Uncompress the input to the temporary file. The file names are quoted so
# that names containing blanks work (an unquoted redirection target which
# expands to several words is an error in bash).
# The input check accepts '-' as meaning stdin: in this case gunzip just
# reads our standard input instead of trying to open a file named '-'.
if test "X$infile" = X-
then
    gunzip > "$tmpfile" || senderror "Cant uncompress input"
else
    gunzip < "$infile" > "$tmpfile" || senderror "Cant uncompress input"
fi
# Apply the stylesheet below to the uncompressed document. The stylesheet is
# read from stdin (the '-' argument, fed by the here-document) and the
# resulting HTML goes to stdout.
# "$tmpfile" is quoted so that a temporary directory containing blanks does
# not split the name into several arguments.
xsltproc --novalid --nonet - "$tmpfile" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
xmlns:ooo="http://openoffice.org/2004/office"
xmlns:gnm="http://www.gnumeric.org/v10.dtd"
exclude-result-prefixes="office xlink meta ooo dc"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<xsl:apply-templates select="//office:document-meta/office:meta"/>
</head>
<body>
<xsl:apply-templates select="//gnm:Cells"/>
<xsl:apply-templates select="//gnm:Objects"/>
</body>
</html>
</xsl:template>
<xsl:template match="//dc:date">
<meta>
<xsl:attribute name="name">date</xsl:attribute>
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
</meta>
</xsl:template>
<xsl:template match="//dc:description">
<meta>
<xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
</meta>
</xsl:template>
<xsl:template match="//meta:keyword">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
</meta>
</xsl:template>
<xsl:template match="//dc:subject">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
</meta>
</xsl:template>
<xsl:template match="//dc:title">
<title> <xsl:value-of select="."/> </title>
</xsl:template>
<xsl:template match="//meta:initial-creator">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
</meta>
</xsl:template>
<xsl:template match="office:meta/*"/>
<xsl:template match="gnm:Cell">
<p><xsl:value-of select="."/></p>
</xsl:template>
<xsl:template match="gnm:CellComment">
<blockquote><xsl:value-of select="@Text"/></blockquote>
</xsl:template>
</xsl:stylesheet>
EOF

130
src/filters/rclokulnote Executable file
View File

@ -0,0 +1,130 @@
#!/bin/sh
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# Extract text from an Okular notes (annotations) file
#================================================================
# set variables
# Run the filter and its helpers in the C locale
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
# Name shown in the usage message and in the stderr diagnostics. This script
# is rclokulnote: the values below had been left over from the gnumeric
# filter it was copied from.
progname="rclokulnote"
# Input type name, only used in the usage message
filetype="okular notes"
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the error description
#            (e.g. HELPERNOTFOUND xsltproc)
# Outputs:   "RECFILTERROR <description>" on stdout, and a copy tagged with
#            $progname on stderr.
# Never returns: exits with status 1.
senderror()
{
    # "$*" is quoted so that a file name embedded in the message is printed
    # verbatim instead of being glob-expanded or having its blanks collapsed.
    # printf is used because some echo implementations interpret backslashes.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
# Check that a command is available.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if $1 designates an executable file which is not a directory
#            (searched for in the directories of $PATH when $1 has no '/'),
#            1 otherwise.
# Note: sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split PATH on ':' only: the function's positional parameters
        # then hold one directory each.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Iterate over "$@" so that a directory name containing blanks is
        # not split a second time now that IFS has been restored.
        for d in "$@"; do
            test -x "$d/$cmd" && test ! -d "$d/$cmd" && return 0
        done
        return 1 ;;
    esac
}
# Verify that all the helper commands we need are available.
# Arguments: the names of the commands to check
# Calls "senderror HELPERNOTFOUND <name>" for the first command that iscmd
# does not find.
checkcmds()
{
    # "$@" and "$cmd" are quoted so that each name is checked and reported
    # exactly as it was given (no globbing, no splitting).
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
# show help message
# Printed unless we got exactly one argument which is not --help. Note that
# asking for help also exits with status 1.
# The conditions are combined with the shell's || and && rather than with
# test's obsolescent -o / -a operators, which are ambiguous when an operand
# looks like a test operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# xsltproc is the only helper needed
checkcmds xsltproc

# Apply the stylesheet below to the input file. The stylesheet is read from
# stdin (the '-' argument, fed by the here-document) and the resulting HTML
# goes to stdout.
# "$infile" is quoted so that a file name containing blanks is passed to
# xsltproc as a single argument.
xsltproc --novalid --nonet - "$infile" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="html" encoding="UTF-8"/>
<xsl:strip-space elements="*" />
<xsl:template match="/">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>
Okular notes about: <xsl:value-of select="/documentInfo/@url" />
</title>
</head>
<body>
<xsl:apply-templates />
</body>
</html>
</xsl:template>
<xsl:template match="node()">
<xsl:apply-templates select="@* | node() "/>
</xsl:template>
<xsl:template match="text()">
<p><xsl:value-of select="."/></p>
<xsl:text >
</xsl:text>
</xsl:template>
<xsl:template match="@contents|@author">
<p><xsl:value-of select="local-name()"/>=<xsl:value-of select="." /></p>
<xsl:text >
</xsl:text>
</xsl:template>
<xsl:template match="@*"/>
</xsl:stylesheet>
EOF

View File

@ -74,9 +74,11 @@ application/x-chm = execm rclchm
application/x-dvi = exec rcldvi application/x-dvi = exec rcldvi
application/x-flac = execm rclaudio application/x-flac = execm rclaudio
application/x-gnuinfo = execm rclinfo application/x-gnuinfo = execm rclinfo
application/x-gnumeric = exec rclgnm
application/x-kword = exec rclkwd application/x-kword = exec rclkwd
application/x-lyx = exec rcllyx application/x-lyx = exec rcllyx
application/x-mimehtml = internal message/rfc822 application/x-mimehtml = internal message/rfc822
application/x-okular-notes = exec rclokulnote
application/x-perl = internal text/plain application/x-perl = internal text/plain
application/x-rar = execm rclrar;charset=default application/x-rar = execm rclrar;charset=default
application/x-scribus = exec rclscribus application/x-scribus = exec rclscribus
@ -153,8 +155,10 @@ application/x-abiword = wordprocessing
application/x-dvi = document application/x-dvi = document
application/x-flac = sownd application/x-flac = sownd
application/x-fsdirectory = folder application/x-fsdirectory = folder
application/x-gnumeric = spreadsheet
application/x-kword = wordprocessing application/x-kword = wordprocessing
application/x-lyx = wordprocessing application/x-lyx = wordprocessing
application/x-okular-notes = document
application/x-scribus = document application/x-scribus = document
application/x-gnuinfo = document application/x-gnuinfo = document
application/x-tex = wordprocessing application/x-tex = wordprocessing
@ -207,6 +211,7 @@ text = \
application/x-dvi \ application/x-dvi \
application/x-kword \ application/x-kword \
application/x-lyx \ application/x-lyx \
application/x-okular-notes \
application/x-perl \ application/x-perl \
application/x-scribus \ application/x-scribus \
application/x-gnuinfo \ application/x-gnuinfo \
@ -233,7 +238,8 @@ spreadsheet = \
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet \ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet \
application/vnd.openxmlformats-officedocument.spreadsheetml.template \ application/vnd.openxmlformats-officedocument.spreadsheetml.template \
application/vnd.sun.xml.calc \ application/vnd.sun.xml.calc \
application/vnd.sun.xml.calc.template application/vnd.sun.xml.calc.template \
application/x-gnumeric
presentation = application/vnd.ms-powerpoint \ presentation = application/vnd.ms-powerpoint \
application/vnd.openxmlformats-officedocument.presentationml.template \ application/vnd.openxmlformats-officedocument.presentationml.template \

View File

@ -96,6 +96,7 @@
.scd = application/x-scribus .scd = application/x-scribus
.info = application/x-gnuinfo .info = application/x-gnuinfo
.kwd = application/x-kword .kwd = application/x-kword
.gnumeric = application/x-gnumeric
.wpd = application/vnd.wordperfect .wpd = application/vnd.wordperfect
@ -133,7 +134,7 @@
recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \ recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \
.o .lib .dll .a .sys .exe .com \ .o .lib .dll .a .sys .exe .com \
.dat .bak .rdf .log .db .msf .pid \ .dat .bak .rdf .log .db .msf .pid \
.gnm .gnumeric \ .gnm \
,v ~ # ,v ~ #
# Special handling of .txt files inside ~/.gaim and ~/.purple directories # Special handling of .txt files inside ~/.gaim and ~/.purple directories
@ -185,3 +186,9 @@ recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \
.8 = text/x-man .8 = text/x-man
.9 = text/x-man .9 = text/x-man
.n = text/x-man .n = text/x-man
# Special handling for okular notes
[~/.kde4/share/apps/okular/docdata]
.xml = application/x-okular-notes
[~/.kde/share/apps/okular/docdata]
.xml = application/x-okular-notes

View File

@ -54,6 +54,7 @@ application/vnd.wordperfect = libreoffice %f
application/x-chm = kchmviewer %f application/x-chm = kchmviewer %f
application/x-fsdirectory = dolphin %f application/x-fsdirectory = dolphin %f
application/x-gnuinfo = xterm -e "info -f %f" application/x-gnuinfo = xterm -e "info -f %f"
application/x-gnumeric = gnumeric %f
application/x-flac = rhythmbox %f application/x-flac = rhythmbox %f
audio/mpeg = rhythmbox %f audio/mpeg = rhythmbox %f

16
tests/gnumeric/gnumeric.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/sh
# Test of the gnumeric filter: run three queries (author field, cell text,
# cell comment text) and compare their output with the reference file
# ${myname}.txt.
# NOTE(review): initvariables, checkresult and the my* variables are not
# defined here; presumably they come from shared.sh - confirm.

# Expansions are quoted so that the test also works when the source tree
# path contains blanks.
topdir=`dirname "$0"`/..
. "$topdir/shared.sh"

initvariables "$0"

# The "Recoll query: " lines are filtered out of the compared output.
# grep -E replaces the obsolescent egrep command.
(
    recollq author=gnumericAuthor
    recollq gnumerictext
    recollq gnumericcommentaire
) 2> "$mystderr" | grep -E -v '^Recoll query: ' > "$mystdout"

diff -w "${myname}.txt" "$mystdout" > "$mydiffs" 2>&1

checkresult

View File

@ -0,0 +1,5 @@
0 results
1 results
application/x-gnumeric [file:///home/dockes/projets/fulltext/testrecoll/gnumeric/trygnumeric.gnumeric] [The gnumericTitle] 2111 bytes
1 results
application/x-gnumeric [file:///home/dockes/projets/fulltext/testrecoll/gnumeric/trygnumeric.gnumeric] [The gnumericTitle] 2111 bytes

View File

@ -142,6 +142,9 @@
<li><span class="literal">OpenOffice</span> files.</li> <li><span class="literal">OpenOffice</span> files.</li>
<li><span class="literal">SVG</span> files.</li> <li><span class="literal">SVG</span> files.</li>
<li><span class="literal">Gnumeric</span> files.</li>
<li><span class="literal">Okular</span> annotations files.</li>
</ul> </ul>
<h5>Other formats</h5> <h5>Other formats</h5>
@ -200,16 +203,30 @@
"http://www.gnu.org/software/ghostscript/ghostscript.html"> "http://www.gnu.org/software/ghostscript/ghostscript.html">
ghostscript</a> and <a href= ghostscript</a> and <a href=
"http://www.cs.wisc.edu/~ghost/doc/pstotext.htm">pstotext</a>. "http://www.cs.wisc.edu/~ghost/doc/pstotext.htm">pstotext</a>.
Actually the pstotext 1.9 found at the latter link has a Pstotext 1.9 has a serious issue with special characters in
problem with file names using special shell characters, and file names, and you should either use the version packaged for
you should either use the version packaged for your system your system which is probably patched, or apply the Debian
which is probably patched, or apply the Debian patch which patch which is stored <a href=
is stored <a href=
"files/pstotext-1.9_4-debian.patch">here</a> for "files/pstotext-1.9_4-debian.patch">here</a> for
convenience. See convenience. See http://packages.debian.org/squeeze/pstotext
http://packages.debian.org/squeeze/pstotext and and http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=356988
http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=356988 for for references/explanations.
references/explanations.</li> <blockquote>
To make things a bit easier, I also
store <a href="files/pstotext-1.9-patched.tar.gz">an
already patched version</a>. I added an
install target to the Makefile... This installs to
/usr/local, use <i>make install PREFIX=/usr</i> to
change. So all you need is:
<pre>
tar xvzf pstotext-1.9-patched.tar.gz
cd pstotext-1.9-patched
make
make install
</pre>
</blockquote>
</li>
<li><span class="literal">RTF</span> files with <a href= <li><span class="literal">RTF</span> files with <a href=
"http://www.gnu.org/software/unrtf/unrtf.html">unrtf</a>. Please "http://www.gnu.org/software/unrtf/unrtf.html">unrtf</a>. Please

View File

@ -64,37 +64,49 @@
<a href="mimeconf">mimeconf</a> <a href="mimeconf">mimeconf</a>
<a href="mimeview">mimeview</a> </p> <a href="mimeview">mimeview</a> </p>
<!--
<p>Notes:</p>
<blockquote> <blockquote>
<p>All filters are up to date in Recoll 1.1.04, except rclics.</p> <p>All filters are up to date in Recoll 1.16.2, except
<p>If you are using an older version, you should update to 1.13.04.</p> rclchm, and the new ones for gnumeric and Okular annotations.</p>
<p>Recoll 1.15 may benefit from some of the newer
filters linked below.</p>
<p>If you are running an older recoll version, you really
should upgrade.</p>
</blockquote> </blockquote>
-->
<h2>Okular annotations</h2>
<p><a href="rclokulnote">rclokulnote</a>. Okular lets you create
annotations for PDF documents and stores them in xml format
somewhere under ~/.kde. This filter does not do a nice job of
formatting the data, but will at least let you find it...</p>
<h2>Gnumeric</h2>
<p><a href="rclgnm">rclgnm</a>. Needs xsltproc and gunzip.</p>
<h2>Rar archive support</h2> <h2>Rar archive support</h2>
<p><a href="rclrar">rclrar</a>. This needs the Python rarfile module. <p><a href="rclrar">rclrar</a>. This is up to date in Recoll
</p> 1.16.2 but may be added to Recoll 1.15. It needs the Python
rarfile module. </p>
<h2>Mimehtml support</h2> <h2>Mimehtml support</h2>
<p>This is based on the internal mail filter, you just need to <p>This is based on the internal mail filter, you just need to
download and install the config files. Will only work with download and install the configuration files (mimemap and
1.15.</p> mimeconf). Will only work with 1.15 and later.</p>
<h2>Konqueror webarchive (.war) filter</h2> <h2>Konqueror webarchive (.war) filter</h2>
<p><a href="rclwar">rclwar</a></p> <p><a href="rclwar">rclwar</a></p>
<h2>Updated zip archive filter</h2> <h2>Updated zip archive filter</h2>
<p>The filter is corrected to handle utf-8 paths in zip archives: <p>The filter is corrected to handle utf-8 paths in zip archives:
<a href="rclzip">rclzip</a></p> <a href="rclzip">rclzip</a>. Up to date in Recoll 1.16, but
may be useful with Recoll 1.15</p>
<h2>Updated audio tag filter</h2> <h2>Updated audio tag filter</h2>
<p>The mutagen-based rclaudio filter delivered with recoll 1.14.2 <p>The mutagen-based rclaudio filter delivered with recoll 1.14.2
used a very recent mutagen interface which will only work with used a very recent mutagen interface which will only work with
mutagen versions after 1.17 (probably. at least works with 1.19, mutagen versions after 1.17 (probably. at least works with 1.19,
doesn't with 1.15). doesn't with 1.15).
You can download the <a href="rclaudio">corrected script here</a>. You can download the <a href="rclaudio">corrected script
here</a>. Not useful with Recoll 1.15 or 1.16.
</p> </p>
</div> </div>

View File

@ -63,7 +63,8 @@
the <a href="usermanual/rcl.search.tips.html">search the <a href="usermanual/rcl.search.tips.html">search
tips</a> might prove useful ! Also the tips</a> might prove useful ! Also the
<a href="http://bitbucket.org/medoc/recoll/wiki/FaqsAndHowTos"> <a href="http://bitbucket.org/medoc/recoll/wiki/FaqsAndHowTos">
Faqs and Howtos</a> on bitbucket.org.</p> Faqs and Howtos</a> on bitbucket.org, and some contributed
customisation/beautification tricks.</p>
<h2>News: </h2> <h2>News: </h2>