Use the python-based filters written for ms-win on Linux too

This commit is contained in:
Jean-Francois Dockes 2015-10-11 08:41:15 +02:00
parent dc9d9900be
commit 4c3e112c27
17 changed files with 41 additions and 1740 deletions

View File

@ -90,11 +90,8 @@ src/qtgui/Makefile
src/qtgui/qrc_recoll.cpp
src/qtgui/recoll
src/qtgui/recoll.app
src/query/alldeps
src/query/recollq
src/sampleconf/rclmon.sh
src/sampleconf/recoll.conf
src/utils/alldeps
tests/casediac/aspdict.en.rws
tests/casediac/idxstatus.txt
tests/casediac/index.pid

View File

@ -1,176 +0,0 @@
#!/bin/sh
# @(#$Id: rcldoc,v 1.8 2007-06-08 13:51:08 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# Extract text from an msword file by executing either antiword
# or wvware
#
#================================================================
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rcldoc"
filetype=ms-word
decoder="antiword -t -i 1 -m UTF-8"
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller
# Arguments: the words of the error message (e.g. HELPERNOTFOUND <cmd>).
# Outputs:   "RECFILTERROR <words>" on stdout, and a copy on stderr
#            prefixed with the filter name ($progname).
# Exits the filter with status 1.
senderror()
{
    # "$*" is quoted so that file names are reported verbatim: no glob
    # expansion and no squeezing of blanks.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
# Arguments: $1 - a command name, or a path if it contains a '/'.
# Returns:   0 if $1 designates an executable which is not a directory
#            (bare names are looked up along $PATH), 1 otherwise.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' only, then walk the elements through "$@"
        # so that directories containing blanks stay in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that all the helper commands given as arguments are available.
# Calls senderror (which exits the filter) for the first missing one.
checkcmds()
{
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            senderror HELPERNOTFOUND "$cmd"
        fi
    done
}
# show help message
if test $# -ne 1 -o "$1" = "--help"
then
echo "Convert a $filetype file to HTML text for Recoll indexing."
echo "Usage: $progname [infile]"
exit 1
fi
infile="$1"
# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- -a ! -f "$infile"
then
senderror INPUTNOSUCHFILE "$infile"
fi
# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
checkcmds awk antiword iconv
# We need to do some strange stuff to retrieve the status from antiword. Things
# would be simpler if we relied on using bash.
# Explanations:
#http://stackoverflow.com/questions/1221833/bash-pipe-output-and-capture-exit-status
# Read a number from the first line of standard input and hand it back
# as the return status. Placed at the end of the pipeline below, it
# receives the status which the decoder subshell wrote on descriptor 3.
stdintoexitstatus()
{
    read exitstatus
    return $exitstatus
}
# Run the decoder on the input file and wrap its text output into HTML
# on stdout.
#
# Descriptor plumbing: the decoder exit status is written to fd 3, which
# feeds stdintoexitstatus so that it becomes the status of the whole
# construct. The awk output goes to fd 4, which is the original stdout.
#
# The awk program emits the HTML header when the first non-blank line is
# seen, joins words hyphenated across line ends, turns form feeds into
# <hr> separators and escapes the HTML special characters.
#
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
# is an awk program
(((($decoder "$infile"; echo $? >&3) |
    awk 'BEGIN'\
' {
    cont = ""
    gotdata = 0
}
{
    if (!($0 ~ /^[ ]*$/) && gotdata == 0) {
        print "<html><head><title></title>"
        print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
        print "</head>\n<body>\n<p>"
        gotdata = 1
    }
    $0 = cont $0
    cont = ""
    if ($0 ~ /[­-]$/) {
        # Note : soft-hyphen is iso8859 0xad
        if (match($0, "[ \t][^ \t]+$")) {
            # Break at last whitespace. substr() is called with a start
            # of 1: the result for a start of 0 depends on the awk
            # implementation.
            line = substr($0, 1, RSTART - 1)
            cont = substr($0, RSTART, RLENGTH - 1)
        } else {
            # No whitespace: the whole line is the hyphenated word.
            # Carry it over (minus the hyphen) instead of losing it.
            line = ""
            cont = substr($0, 1, length($0) - 1)
        }
        $0 = line
    }
    if($0 == "\f") {
        print "</p><hr><p>"
        next
    }
    if (gotdata == 1) {
        gsub(/&/, "\\&amp;", $0)
        gsub(/</, "\\&lt;", $0)
        gsub(/>/, "\\&gt;", $0)
        print $0 "<br>"
    }
}
END {
    if (gotdata == 1)
        print "</p></body></html>"
}' >&4) 3>&1) | stdintoexitstatus) 4>&1
# Antiword rarely fails, we try to catch the most common reasons:
# when the decoder pipeline above returned 1, look at the actual type of
# the file and hand it over to a more appropriate program if there is one.
if test $? -eq 1 ; then
    # The sibling filters live in the same directory as this script. The
    # path is quoted so that an install directory with blanks works.
    filtersdir=$(dirname "$0")
    # Check actual document type
    mtype=$(file -b -i "$infile" | awk '{sub(";", "", $1);print $1}')
    if test X"$mtype" = Xtext/rtf; then
        # RTF document disguising as msword either because it has a .doc
        # extension or because it's an attachment with a wrong mime.
        exec "$filtersdir/rclrtf" "$infile"
    fi
    if test X"$mtype" = Xtext/plain; then
        # Someone gave a .doc ext to their texts. Happens...
        exec "$filtersdir/rcltext" "$infile"
    fi
    if test X"$mtype" = Xapplication/msword; then
        # Actually application/msword: try wvWare, which is much
        # slower and we don't use it by default, but it handles some
        # files that antiword won't, so use it as a last resort.
        if iscmd wvWare ; then
            exec wvWare --nographics --charset=utf-8 "$infile"
        fi
    fi
    # else let the error be...
    exit 1
fi

View File

@ -1,238 +0,0 @@
#!/bin/sh
# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
#================================================================
# Extract text from an openxml msword file (will be extended for spreadsheets)
# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
#================================================================
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname=rclopxml
filetype=openxml
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller
# Arguments: the words of the error message (e.g. HELPERNOTFOUND <cmd>).
# Outputs:   "RECFILTERROR <words>" on stdout, and a copy on stderr
#            prefixed with the filter name ($progname).
# Exits the filter with status 1.
senderror()
{
    # "$*" is quoted so that file names are reported verbatim: no glob
    # expansion and no squeezing of blanks.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
# Arguments: $1 - a command name, or a path if it contains a '/'.
# Returns:   0 if $1 designates an executable which is not a directory
#            (bare names are looked up along $PATH), 1 otherwise.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' only, then walk the elements through "$@"
        # so that directories containing blanks stay in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that all the helper commands given as arguments are available.
# Calls senderror (which exits the filter) for the first missing one.
checkcmds()
{
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            senderror HELPERNOTFOUND "$cmd"
        fi
    done
}
# show help message
if test $# -ne 1 -o "$1" = "--help"
then
echo "Convert a $filetype file to HTML text for Recoll indexing."
echo "Usage: $progname [infile]"
exit 1
fi
infile="$1"
# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- -a ! -f "$infile"
then
senderror INPUTNOSUCHFILE "$infile"
fi
# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
checkcmds xsltproc unzip
# We need a temporary directory. It is created under $RECOLL_TMPDIR if
# set, else under $TMPDIR if set, else under /tmp.
if test z"$RECOLL_TMPDIR" != z; then
    ttdir=$RECOLL_TMPDIR
elif test z"$TMPDIR" != z ; then
    ttdir=$TMPDIR
else
    ttdir=/tmp
fi
tmpdir=$ttdir/rclopxml_tmp$$
# mkdir fails if the name already exists, so we never reuse a directory
# prepared by someone else.
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclopxmltmp" || exit 1

# Remove our temporary directory and its contents.
cleanup()
{
    # Note that we're using a constant part (rclopxmltmp), that hopefully
    # guarantees that we can't do big mistakes here.
    # The paths are quoted: a temporary directory name containing blanks
    # must not be split into several (wrong) arguments for rm -rf.
    rm -rf "$tmpdir/rclopxmltmp"
    rmdir "$tmpdir"
}
trap cleanup EXIT
# On a signal, exit (which triggers the cleanup through the EXIT trap)
# instead of going on working in a directory which was just removed.
trap 'exit 1' HUP QUIT INT TERM
# Unzip the input file and change to the unzipped directory
unzip -q -d $tmpdir/rclopxmltmp "$infile"
cd $tmpdir/rclopxmltmp
echo '<html>
<head>'
xsltproc --novalid --nonet - docProps/core.xml <<EOF
<?xml version="1.0"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<!-- <xsl:output method="text"/> -->
<xsl:output omit-xml-declaration="yes"/>
<xsl:template match="cp:coreProperties">
<xsl:text>&#10;</xsl:text>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<xsl:text>&#10;</xsl:text>
<xsl:apply-templates/>
</xsl:template>
<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">
<!-- <xsl:value-of select="name()"/> pour sortir tous les meta avec
le meme nom que dans le xml (si on devenait dc-natif) -->
<xsl:text>author</xsl:text>
</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta>
<xsl:text>&#10;</xsl:text>
</xsl:template>
<xsl:template match="dcterms:modified">
<meta>
<xsl:attribute name="name">
<xsl:text>date</xsl:text>
</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta>
<xsl:text>&#10;</xsl:text>
</xsl:template>
<xsl:template match="*">
</xsl:template>
</xsl:stylesheet>
EOF
echo '</head>
<body>'
# Choose, depending on the kind of document which was unzipped (word
# processor, spreadsheet or presentation), the XML file(s) holding the
# text, the element to extract, and the namespace declarations needed by
# the stylesheet. Then run the stylesheet on each file.
filenames=''
# Must start empty: it is expanded into the stylesheet below and is only
# set for presentations.
moretemplates=''
if test -f word/document.xml ; then
    filenames=word/document.xml
    tagmatch="w:p"
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
'
elif test -f xl/sharedStrings.xml ; then
    filenames=xl/sharedStrings.xml
    tagmatch='x:t'
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
'
elif test -f ppt/slides/slide1.xml ; then
    # One file per slide. The list is split on blanks by the loop below.
    filenames=`echo ppt/slides/slide*.xml`
    tagmatch='a:t'
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
'
    # I want to suppress text output for all except a:t, don't know how to do it
    # help ! At least get rid of these:
    moretemplates='
<xsl:template match="p:attrName">
</xsl:template>
'
else
    # None of the known document parts was found: not a file we can handle
    exit 1
fi
for filename in $filenames;do
    # The here-document is not quoted: $xmlns_decls, $tagmatch and
    # $moretemplates are expanded by the shell to build the stylesheet.
    xsltproc --novalid --nonet - "$filename" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet $xmlns_decls >
<xsl:output omit-xml-declaration="yes"/>
<xsl:template match="/">
<div>
<xsl:apply-templates/>
</div>
</xsl:template>
<xsl:template match="$tagmatch">
<p>
<xsl:value-of select="."/>
</p>
</xsl:template>
$moretemplates
</xsl:stylesheet>
EOF
done
# Close the <body> element opened before the text extraction, then the
# document.
echo '</body></html>'

View File

@ -1,351 +0,0 @@
#!/bin/bash
#================================================================
# Copyright (C) 2015 J.F. Dockes
# There used to be Estraier content in there, but I quite believe that is not
# the case any more.
# This file is licensed under the GPL v2
#================================================================
# Convert a pdf file to HTML.
#
# We use pdftotext from the xpdf/poppler-utils package.
#
# pdftotext sometimes outputs unescaped text inside HTML text sections.
# We try to correct.
#
# If pdftotext produces no text and tesseract is available, we try to
# perform OCR. As this can be very slow and the result not always
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
#
# We guess the OCR language in order of preference:
# - From the content of a ".ocrpdflang" file if it exists in the same
# directory as the PDF
# - From an RECOLL_TESSERACT_LANG environment variable
# - From the content of $RECOLL_CONFDIR/ocrpdf
# - Default to "eng"
#
# Uncomment the following if you get better results without. The
# pdftotext manual says that the option is no longer recommended The
# difference in output seems mostly the removal of soft-hyphens when
# -raw is not set
# optionraw=-raw
# set variables
progname="rclpdf"
filetype=pdf
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller
# Arguments: the words of the error message (e.g. HELPERNOTFOUND <cmd>).
# Outputs:   "RECFILTERROR <words>" on stdout, and a copy on stderr
#            prefixed with the filter name ($progname).
# Exits the filter with status 1.
senderror()
{
    # "$*" is quoted so that file names are reported verbatim: no glob
    # expansion and no squeezing of blanks.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
# Arguments: $1 - a command name, or a path if it contains a '/'.
# Returns:   0 if $1 designates an executable which is not a directory
#            (bare names are looked up along $PATH), 1 otherwise.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' only, then walk the elements through "$@"
        # so that directories containing blanks stay in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that all the helper commands given as arguments are available.
# Calls senderror (which exits the filter) for the first missing one.
checkcmds()
{
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            senderror HELPERNOTFOUND "$cmd"
        fi
    done
}
# show help message
if test $# -ne 1 -o "$1" = "--help"
then
echo "Convert a $filetype file to HTML text for Recoll indexing."
echo "Usage: $progname [infile]"
exit 1
fi
infile="$1"
# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- -a ! -f "$infile"
then
senderror INPUTNOSUCHFILE "$infile"
fi
# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
checkcmds pdftotext iconv awk
ocrpossible=0
if iscmd tesseract; then
if iscmd pdftoppm; then
ocrpossible=1
fi
fi
confdir=${RECOLL_CONFDIR:-~/.recoll}
test ! -f "$confdir/ocrpdf" && ocrpossible=0
# Set further down, only when OCR is possible and a work directory is
# actually created.
tmpdir=

# Remove our temporary directory and its contents, if it was created.
cleanup()
{
    # Note that we're using a constant part (rclpdftmp), that hopefully
    # guarantees that we can't do big mistakes with the -rf here.
    # The paths are quoted: a temporary directory name containing blanks
    # must not be split into several (wrong) arguments for rm -rf.
    if test -n "$tmpdir"; then
        rm -rf "$tmpdir/rclpdftmp"
        rmdir "$tmpdir"
    fi
}
trap cleanup EXIT
# On a signal, exit (which triggers the cleanup through the EXIT trap)
# instead of going on working in a directory which was just removed.
trap 'exit 1' HUP QUIT INT TERM
# Run pdftotext on $infile and write the corrected HTML to stdout.
# Globals: infile (read), optionraw (read, optional extra pdftotext
#          option), XYZ/MAJOR/MINOR/escapeheader/escapebody (written).
# Returns: the status of the last stage of the pipeline (awk).
runpdftotext()
{
    # Test poppler version: at some point before 0.24, poppler began
    # to properly escape text inside the header (but not the body).
    XYZ=$(pdftotext -v 2>&1 | awk '/pdftotext/{print $3}')
    MAJOR=$(echo "$XYZ" | cut -d. -f 1)
    MINOR=$(echo "$XYZ" | cut -d. -f 2)
    escapeheader=1
    escapebody=1
    if test "$MAJOR" -gt 0 ; then
        escapeheader=0
    elif test "$MINOR" -ge 24; then
        escapeheader=0;
    fi
    # Run pdftotext and fix the result (add a charset tag and fix the
    # html escaping). The escaping is a half-hearted job. We do try to
    # fix some header fields, only for those which are single-line.
    #
    # In the awk program, substr() is always called with a start of 1:
    # the result for a start of 0 depends on the awk implementation, and
    # can lose the character preceding the matched text.
    pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
    iconv -f UTF-8 -t UTF-8 -c -s |
    awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\
' {
    inbodypre = 0
    cont = ""
}
function escapehtml(s)
{
    gsub(/&/, "\\&amp;", s)
    gsub(/</, "\\&lt;", s)
    gsub(/>/, "\\&gt;", s)
    gsub(/"/, "\\&quot;", s)
    return s
}
{
    $0 = cont $0
    cont = ""
    # Insert charset meta tag at end of header
    if(inbodypre == 0 && $0 ~ /<\/head>/) {
        match($0, /<\/head>/)
        part1 = substr($0, 1, RSTART-1)
        part2 = substr($0, RSTART, length($0))
        charsetmeta = "<meta http-equiv=\"Content-Type\" "\
                      "content=\"text/html; charset=UTF-8\">"
        $0 = part1 charsetmeta "\n" part2
    }
    if(inbodypre == 0 && $0 ~ /<title>.*<\/title>/){
        match($0, /<title>.*<\/title>/)
        part1 = substr($0, 1, RSTART-1)
        mid = substr($0, RSTART, RLENGTH)
        part2 = substr($0, RSTART + RLENGTH, length($0))
        gsub(/<title>/, "", mid)
        gsub(/<\/title>/, "", mid)
        if (escapeheader) {
            mid = escapehtml(mid)
        }
        mid = "<title>" mid "</title>"
        $0 = part1 mid part2
    }
    # This matches all single-line meta fields
    if(inbodypre == 0 && $0 ~ /content=".*"\/>/){
        match($0, /content=".*"\/>/)
        part1 = substr($0, 1, RSTART-1)
        mid = substr($0, RSTART, RLENGTH)
        part2 = substr($0, RSTART + RLENGTH, length($0))
        gsub(/content="/, "", mid)
        gsub(/"\/>/, "", mid)
        if (escapeheader) {
            mid = escapehtml(mid)
        }
        mid = "content=\"" mid "\"/>"
        $0 = part1 mid part2
    }
    # Recoll treats "Subject" as a "title" element (based on emails). The PDF
    # "Subject" metadata field is more like an HTML "description"
    if(inbodypre == 0 && $0 ~ /<meta ?name="Subject"/){
        gsub(/="Subject"/, "=\"Description\"", $0)
    }
    if ($0 == "<pre>"){
        # Begin of body text.
        inbodypre++
        print $0
        next
    } else if ($0 ~ /<\/pre>/){
        inbodypre--
        print $0
        next
    } else if ($0 ~ /[­-]$/) {
        # Note : soft-hyphen is iso8859 0xad
        if (match($0, "[ \t][^ \t]+$")) {
            # Break at last whitespace
            line = substr($0, 1, RSTART - 1)
            cont = substr($0, RSTART, RLENGTH - 1)
        } else {
            # No whitespace: the whole line is the hyphenated word.
            # Carry it over (minus the hyphen) instead of losing it.
            line = ""
            cont = substr($0, 1, length($0) - 1)
        }
        $0 = line
        # print "LINE [" $0 "] CONT[" cont "]"
    }
    if(inbodypre > 0 && escapebody){
        $0 = escapehtml($0)
    }
    print $0
}
'
}
# If we're not equipped for ocr, just run pdftotext to stdout
if test $ocrpossible -eq 0; then
runpdftotext
exit $?
fi
# tesseract is installed, prepare for running it.
# We need to check the pdftotext output, but we don't want to run
# it twice. Use a temporary file.
if test z"$RECOLL_TMPDIR" != z; then
ttdir=$RECOLL_TMPDIR
elif test z"$TMPDIR" != z ; then
ttdir=$TMPDIR
else
ttdir=/tmp
fi
tmpdir=$ttdir/rclpdf_tmp$$
mkdir $tmpdir || senderror mkdir $tmpdir failed
mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed
# Run pdftotext into the temp file
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
runpdftotext > $pdftxtfile
# If text is big, or small but not only tags and empty lines, output
# it. Given the contents check which we perform, a file in which the
# only text content is metadata (pdf description field), will be run
# through OCR, which is not necessarily what we would want. It would
# be possible to detect the situation if this proved an issue.
txtsize=`ls -l $pdftxtfile | awk '{print $5}'`
txtempty=0
# Use grep to check if there is regular text in there. Only do it on
# small outputs
if test $txtsize -lt 5000 ; then
realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile`
test -z "$realtext" && txtempty=1
fi
if test $txtempty -eq 0; then
# pdftotext produced actual output, use it. No OCR
cat $pdftxtfile
exit 0
fi
# PDF has no text content and tesseract is available. Give it a try

# Print the tesseract language code to use for running OCR on $infile.
# We have no general way to determine the language from the file itself,
# so we use, in order of preference:
#  - The content of a ".ocrpdflang" file in the same directory as the PDF
#  - The RECOLL_TESSERACT_LANG environment variable
#  - The content of $confdir/ocrpdf
#  - A guess from the LANG environment variable
#  - "eng"
# Globals: infile, confdir, RECOLL_TESSERACT_LANG, LANG (all read)
# Outputs: the language code on stdout
guess_tesseract_lang()
{
    local lang= langfile
    langfile=$(dirname "$infile")/.ocrpdflang
    if test -f "$langfile"; then
        lang=$(cat "$langfile")
    fi
    if test -z "$lang"; then
        lang=${RECOLL_TESSERACT_LANG}
    fi
    if test -z "$lang"; then
        lang=$(cat "$confdir/ocrpdf")
    fi
    if test -z "$lang"; then
        # Half assed trial to guess from LANG then default to english
        case "$(echo "$LANG" | awk -F_ '{print $1}')" in
        en) lang=eng;;
        de) lang=deu;;
        fr) lang=fra;;
        # Someone will have to add more tesseract language codes here.
        esac
    fi
    if test -z "$lang"; then
        lang="eng"
    fi
    printf '%s\n' "$lang"
}
tesseractlang=$(guess_tesseract_lang)
# echo tesseractlang "$tesseractlang" >&2
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
# split pdf-pages
ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
if [ $? -ne 0 ] ; then
senderror "pdftoppm: $ERR_MSG"
fi
for i in $TMPFILE* ; do
if [ -s "$i" ] ; then
tesseract $i $i -l $tesseractlang > $TESSERRORFILE 2>&1
TESSERR=$?
# ignore tesseract start message
LINECOUNT=$(wc -l < $TESSERRORFILE)
if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then
echo "tesseract-error $TESSERR page $i in $infile" >&2
# sort "compacts" leptonica-output
cat $TESSERRORFILE | sort -u >&2
fi
# else
# debugging purpose
# SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
# echo "no pdftoppm in $infile cp to $SICFILE" >&2
# cp -a $infile $SICFILE
# fi
fi
done
# don't output "empty" HTML-Files
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
if [ "$CHARS" -gt 0 ] ; then
echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>"
cat "$TMPFILE"*.txt | \
awk '{
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
print $0
}
'
echo "</pre></body></html>"
fi

View File

@ -1,110 +0,0 @@
#!/bin/sh
# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#================================================================
# Handle powerpoint files for recoll.
# Use unoconv, this is very slow, but catppt just can't handle the majority
# of semi-modern ppt files
#================================================================
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rclppt"
filetype=powerpoint
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller
# Arguments: the words of the error message (e.g. HELPERNOTFOUND <cmd>).
# Outputs:   "RECFILTERROR <words>" on stdout, and a copy on stderr
#            prefixed with the filter name ($progname).
# Exits the filter with status 1.
senderror()
{
    # "$*" is quoted so that file names are reported verbatim: no glob
    # expansion and no squeezing of blanks.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
# Arguments: $1 - a command name, or a path if it contains a '/'.
# Returns:   0 if $1 designates an executable which is not a directory
#            (bare names are looked up along $PATH), 1 otherwise.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' only, then walk the elements through "$@"
        # so that directories containing blanks stay in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that all the helper commands given as arguments are available.
# Calls senderror (which exits the filter) for the first missing one.
checkcmds()
{
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            senderror HELPERNOTFOUND "$cmd"
        fi
    done
}
# show help message
if test $# -ne 1 -o "$1" = "--help"
then
echo "Convert a $filetype file to HTML text for Recoll indexing."
echo "Usage: $progname [infile]"
exit 1
fi
infile="$1"
# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- -a ! -f "$infile"
then
senderror INPUTNOSUCHFILE "$infile"
fi
# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# The text extraction program is expected in the same directory as this
# script. Paths are quoted so that a directory with blanks works.
filtersdir=$(dirname "$0")
checkcmds "$filtersdir/ppt-dump.py"

# Escape the HTML special characters in the text read on stdin.
# In a sed replacement, an unescaped '&' stands for the matched text, so
# it has to be written '\&' to get a literal ampersand. The '&' character
# is processed first so that the entities produced for '<' and '>' are
# not escaped a second time.
rclppt_htmlescape()
{
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

cat <<EOF
<html><head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
</head><body><pre>
EOF
"$filtersdir/ppt-dump.py" --no-struct-output --dump-text "$infile" |
    rclppt_htmlescape
echo '</pre></body></html>'

View File

@ -6,15 +6,12 @@ import re
import sys
import os
# Processing the output from unrtf
class PPTProcessData:
def __init__(self, em):
self.em = em
self.out = ""
self.gotdata = 0
# Some versions of unrtf put out a garbled charset line.
# Apart from this, we pass the data untouched.
def takeLine(self, line):
if not self.gotdata:
self.out += '''<html><head>''' + \
@ -22,7 +19,7 @@ class PPTProcessData:
'''content="text/html;charset=UTF-8">''' + \
'''</head><body><pre>'''
self.gotdata = True
self.out += self.em.htmlescape(line)
self.out += self.em.htmlescape(line) + "<br>\n"
def wrapData(self):
return self.out + '''</pre></body></html>'''

View File

@ -1,102 +0,0 @@
#!/bin/sh
# @(#$Id: rclrtf,v 1.5 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
# Some inspiration from estraier
#================================================================
# convert rtf to html, by executing the unrtf program:
# http://www.gnu.org/software/unrtf/unrtf.html
#================================================================
# set variables
# The C locale is forced for all the commands run by this filter.
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
# Name of this filter, used in the usage and error messages.
progname="rclrtf"
filetype=rtf
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller
# Arguments: the words of the error message (e.g. HELPERNOTFOUND <cmd>).
# Outputs:   "RECFILTERROR <words>" on stdout, and a copy on stderr
#            prefixed with the filter name ($progname).
# Exits the filter with status 1.
senderror()
{
    # "$*" is quoted so that file names are reported verbatim: no glob
    # expansion and no squeezing of blanks.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
# Arguments: $1 - a command name, or a path if it contains a '/'.
# Returns:   0 if $1 designates an executable which is not a directory
#            (bare names are looked up along $PATH), 1 otherwise.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' only, then walk the elements through "$@"
        # so that directories containing blanks stay in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that all the helper commands given as arguments are available.
# Calls senderror (which exits the filter) for the first missing one.
checkcmds()
{
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            senderror HELPERNOTFOUND "$cmd"
        fi
    done
}
# show help message
if test $# -ne 1 -o "$1" = "--help"
then
echo "Convert a $filetype file to HTML text for Recoll indexing."
echo "Usage: $progname [infile]"
exit 1
fi
infile="$1"
# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- -a ! -f "$infile"
then
senderror INPUTNOSUCHFILE "$infile"
fi
# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
checkcmds awk unrtf
# output the result
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
# is an awk program
# The thing about the charset is that unrtf outputs a garbled one.
unrtf --nopict --html "$infile" 2> /dev/null |
awk 'BEGIN'\
' {
gothead = 0
}
/<\/head>/{
if (gothead == 0) {
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
gothead = 1
}
}
/<meta http-equiv=/{next}
{
print
}
'

View File

@ -1,225 +0,0 @@
#!/bin/sh
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# Extract text from an openoffice/soffice file
#
#================================================================
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rclsoff"
filetype=openoffice
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller
# Arguments: the words of the error message (e.g. HELPERNOTFOUND <cmd>).
# Outputs:   "RECFILTERROR <words>" on stdout, and a copy on stderr
#            prefixed with the filter name ($progname).
# Exits the filter with status 1.
senderror()
{
    # "$*" is quoted so that file names are reported verbatim: no glob
    # expansion and no squeezing of blanks.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command is available.
# Arguments: $1 - a command name, or a path if it contains a '/'.
# Returns:   0 if $1 designates an executable which is not a directory
#            (bare names are looked up along $PATH), 1 otherwise.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' only, then walk the elements through "$@"
        # so that directories containing blanks stay in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Check that all the helper commands given as arguments are available.
# Calls senderror (which exits the filter) for the first missing one.
checkcmds()
{
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            senderror HELPERNOTFOUND "$cmd"
        fi
    done
}
# show help message
if test $# -ne 1 -o "$1" = "--help"
then
echo "Convert a $filetype file to HTML text for Recoll indexing."
echo "Usage: $progname [infile]"
exit 1
fi
infile="$1"
# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- -a ! -f "$infile"
then
senderror INPUTNOSUCHFILE "$infile"
fi
# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
checkcmds xsltproc unzip
# We need a temporary directory. It is created under $RECOLL_TMPDIR if
# set, else under $TMPDIR if set, else under /tmp.
if test z"$RECOLL_TMPDIR" != z; then
    ttdir=$RECOLL_TMPDIR
elif test z"$TMPDIR" != z ; then
    ttdir=$TMPDIR
else
    ttdir=/tmp
fi
tmpdir=$ttdir/rclsoff_tmp$$
# mkdir fails if the name already exists, so we never reuse a directory
# prepared by someone else.
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclsofftmp" || exit 1

# Remove our temporary directory and its contents.
cleanup()
{
    # Note that we're using a constant part (rclsofftmp), that hopefully
    # guarantees that we can't do big mistakes here.
    # The paths are quoted: a temporary directory name containing blanks
    # must not be split into several (wrong) arguments for rm -rf.
    rm -rf "$tmpdir/rclsofftmp"
    rmdir "$tmpdir"
}
trap cleanup EXIT
# On a signal, exit (which triggers the cleanup through the EXIT trap)
# instead of going on working in a directory which was just removed.
trap 'exit 1' HUP QUIT INT TERM
# Unzip the input file and change to the unzipped directory
unzip -q -d $tmpdir/rclsofftmp "$infile"
cd $tmpdir/rclsofftmp
echo '<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
xsltproc --novalid --nonet - meta.xml <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
xmlns:ooo="http://openoffice.org/2004/office"
exclude-result-prefixes="office xlink meta ooo dc"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/office:document-meta">
<xsl:apply-templates select="office:meta/dc:description"/>
<xsl:apply-templates select="office:meta/dc:subject"/>
<xsl:apply-templates select="office:meta/dc:title"/>
<xsl:apply-templates select="office:meta/meta:keyword"/>
<xsl:apply-templates select="office:meta/dc:creator"/>
</xsl:template>
<xsl:template match="dc:title">
<title> <xsl:value-of select="."/> </title><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:description">
<meta>
<xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:subject">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="meta:keyword">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
EOF
echo '</head><body>'
xsltproc --novalid --nonet - content.xml <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
exclude-result-prefixes="text"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="text:p">
<p><xsl:apply-templates/></p><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="text:h">
<p><xsl:apply-templates/></p><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="text:s">
<xsl:text> </xsl:text>
</xsl:template>
<xsl:template match="text:line-break">
<br />
</xsl:template>
<xsl:template match="text:tab">
<xsl:text> </xsl:text>
</xsl:template>
</xsl:stylesheet>
EOF
echo '</body></html>'
cd /
exit 0

View File

@ -1,161 +0,0 @@
#!/bin/sh
#================================================================
# Extract text from a Scalable Vector Graphics file
#================================================================
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rclsvg"
filetype=svg
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller, then
# terminate the filter with status 1.
# Arguments: $* - error code followed by details (file or command name)
# Outputs:   "RECFILTERROR <args>" on stdout, and a copy tagged with
#            $progname on stderr.
senderror()
{
    # The message is quoted: an unquoted $* would collapse runs of blanks
    # and glob-expand characters such as '*' found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
# Tell whether a command can be executed.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if $1 designates an executable file which is not a directory
#            (searched for in $PATH when it contains no '/'), 1 otherwise
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: test it directly. Quoted so that paths containing
        # blanks work.
        if [ -x "$cmd" ] && [ ! -d "$cmd" ]; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            [ -x "${d:-.}/$cmd" ] && [ ! -d "${d:-.}/$cmd" ] && return 0
        done
        return 1 ;;
    esac
}
# Verify that all the helper commands we need can be executed.
# Arguments: $@ - command names or paths, one per argument
# Calls senderror with HELPERNOTFOUND and the command name for the first
# one which iscmd rejects.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
# show help message
# Exactly one argument, the input file, is expected. The conditions are
# written as separate tests: the -o and -a operators of test are obsolescent
# and ambiguous with operands such as '!' or '('.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi
infile="$1"

# check the input file existence (may be '-' for stdin)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# xsltproc is the only external helper used below
checkcmds xsltproc
# Run the inline stylesheet (read from stdin, '-') on the input file. The
# HTML output has, in its head: the title from svg:title, a keywords meta for
# svg:desc and dc:subject, an author meta for dc:creator and a description
# meta for dc:description; and in its body: one paragraph per svg:text.
xsltproc --novalid --nonet - "$infile" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns:dc="http://purl.org/dc/elements/1.1/"
exclude-result-prefixes="svg"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/">
<html>
<head>
<xsl:apply-templates select="svg:svg/svg:title"/>
<xsl:apply-templates select="svg:svg/svg:desc"/>
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:creator"/>
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:subject"/>
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:description"/>
</head>
<body>
<xsl:apply-templates select="//svg:text"/>
</body>
</html>
</xsl:template>
<xsl:template match="svg:desc">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:subject">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:description">
<meta>
<xsl:attribute name="name">description</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="svg:title">
<title><xsl:value-of select="."/></title><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="svg:text">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
EOF
# The exit status is 0 whatever the xsltproc result was
exit 0

View File

@ -116,7 +116,7 @@ class SVGExtractor:
self.em.rclog("%s: bad data: " % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):

View File

@ -1,91 +0,0 @@
#!/bin/sh
# @(#$Id: rcltext,v 1.1 2008-09-12 11:30:03 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# Wrap generic text (ie: program text) in html
# Assumes ascii or iso-8859-1
#================================================================
# Identify this filter (the names are used in the usage and error messages)
# and force the C locale for the script and the commands it runs.
progname="rcltext"
filetype=text
LANG=C
LC_ALL=C
export LANG LC_ALL
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller, then
# terminate the filter with status 1.
# Arguments: $* - error code followed by details (file or command name)
# Outputs:   "RECFILTERROR <args>" on stdout, and a copy tagged with
#            $progname on stderr.
senderror()
{
    # The message is quoted: an unquoted $* would collapse runs of blanks
    # and glob-expand characters such as '*' found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
# Tell whether a command can be executed.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if $1 designates an executable file which is not a directory
#            (searched for in $PATH when it contains no '/'), 1 otherwise
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: test it directly. Quoted so that paths containing
        # blanks work.
        if [ -x "$cmd" ] && [ ! -d "$cmd" ]; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            [ -x "${d:-.}/$cmd" ] && [ ! -d "${d:-.}/$cmd" ] && return 0
        done
        return 1 ;;
    esac
}
# Verify that all the helper commands we need can be executed.
# Arguments: $@ - command names or paths, one per argument
# Calls senderror with HELPERNOTFOUND and the command name for the first
# one which iscmd rejects.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
# show help message
# Exactly one argument, the input file, is expected. The conditions are
# written as separate tests: the -o and -a operators of test are obsolescent
# and ambiguous with operands such as '!' or '('.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi
infile="$1"

# check the input file existence (may be '-' for stdin)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# sed does the HTML escaping below
checkcmds sed

# Output the file contents inside a <pre> element, with '&' and '<' turned
# into entities. '&' is handled first, so that the entities produced for '<'
# are not escaped a second time.
{
    printf '%s\n' '<html><head><title></title></head><body><pre>'
    sed -e 's/\&/\&amp;/g' -e 's/</\&lt;/g' "$infile"
    printf '%s\n' '</pre></body></html>'
}

View File

@ -1,116 +0,0 @@
#!/bin/sh
# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#================================================================
# Handle excel files for recoll.
#================================================================
# Identify this filter (the names are used in the usage and error messages)
# and force the C locale for the script and the commands it runs.
progname="rclxls"
filetype=excel
LANG=C
LC_ALL=C
export LANG LC_ALL
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller, then
# terminate the filter with status 1.
# Arguments: $* - error code followed by details (file or command name)
# Outputs:   "RECFILTERROR <args>" on stdout, and a copy tagged with
#            $progname on stderr.
senderror()
{
    # The message is quoted: an unquoted $* would collapse runs of blanks
    # and glob-expand characters such as '*' found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
# Tell whether a command can be executed.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if $1 designates an executable file which is not a directory
#            (searched for in $PATH when it contains no '/'), 1 otherwise
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: test it directly. Quoted so that paths containing
        # blanks work.
        if [ -x "$cmd" ] && [ ! -d "$cmd" ]; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            [ -x "${d:-.}/$cmd" ] && [ ! -d "${d:-.}/$cmd" ] && return 0
        done
        return 1 ;;
    esac
}
# Verify that all the helper commands we need can be executed.
# Arguments: $@ - command names or paths, one per argument
# Calls senderror with HELPERNOTFOUND and the command name for the first
# one which iscmd rejects.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
# show help message
# Exactly one argument, the input file, is expected. The conditions are
# written as separate tests: the -o and -a operators of test are obsolescent
# and ambiguous with operands such as '!' or '('.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi
infile="$1"

# check the input file existence (may be '-' for stdin)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# The two Python helpers are looked for in the directory of this script
top=$(dirname "$0")
XLSDUMP="$top/xls-dump.py"
XMLTOCSV="$top/xlsxmltocsv.py"

# Both helpers must be executable. The second variable is XMLTOCSV: checking
# an unset name would silently skip the second helper.
checkcmds "$XLSDUMP" "$XMLTOCSV"

# output the result
echo '<html><head>'
#echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>'
echo '<pre>'
# The xls-dump.py output (canonical XML dump mode) is piped through
# xlsxmltocsv.py, then the characters which are special in HTML are escaped.
# In a sed replacement a bare '&' stands for the matched text, so a literal
# one is written '\&'. '&' is escaped before '<', else the '&' of every new
# '&lt;' would be escaped again.
"$XLSDUMP" --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
    "$XMLTOCSV" | \
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
echo '</pre>'
echo '</body></html>'

# exit normally
exit 0

View File

@ -8,7 +8,6 @@ import sys
import os
import xml.sax
# Processing the output from unrtf
class XLSProcessData:
def __init__(self, em):
self.em = em
@ -16,8 +15,6 @@ class XLSProcessData:
self.gotdata = 0
self.xmldata = ""
# Some versions of unrtf put out a garbled charset line.
# Apart from this, we pass the data untouched.
def takeLine(self, line):
if not self.gotdata:
self.out += '''<html><head>''' + \

View File

@ -1,119 +0,0 @@
#!/bin/sh
#================================================================
# Extract text from a generic XML file (Justus Piater)
#================================================================
# Identify this filter (the names are used in the usage and error messages)
# and force the C locale for the script and the commands it runs.
progname="rclxml"
filetype=xml
LANG=C
LC_ALL=C
export LANG LC_ALL
#RECFILTCOMMONCODE
##############################################################################
# !! Leave the previous line unmodified!! Code imported from the
# recfiltcommon file
# Utility code common to all shell filters. This could be sourced at run
# time, but it's slightly more efficient to include the code in the
# filters at build time (with a sed script).
# Describe error in a way that can be interpreted by our caller, then
# terminate the filter with status 1.
# Arguments: $* - error code followed by details (file or command name)
# Outputs:   "RECFILTERROR <args>" on stdout, and a copy tagged with
#            $progname on stderr.
senderror()
{
    # The message is quoted: an unquoted $* would collapse runs of blanks
    # and glob-expand characters such as '*' found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
# Tell whether a command can be executed.
# Arguments: $1 - command name, or a path if it contains a '/'
# Returns:   0 if $1 designates an executable file which is not a directory
#            (searched for in $PATH when it contains no '/'), 1 otherwise
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Explicit path: test it directly. Quoted so that paths containing
        # blanks work.
        if [ -x "$cmd" ] && [ ! -d "$cmd" ]; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into the positional parameters
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            [ -x "${d:-.}/$cmd" ] && [ ! -d "${d:-.}/$cmd" ] && return 0
        done
        return 1 ;;
    esac
}
# Verify that all the helper commands we need can be executed.
# Arguments: $@ - command names or paths, one per argument
# Calls senderror with HELPERNOTFOUND and the command name for the first
# one which iscmd rejects.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
# show help message
# Exactly one argument, the input file, is expected. The conditions are
# written as separate tests: the -o and -a operators of test are obsolescent
# and ambiguous with operands such as '!' or '('.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi
infile="$1"

# check the input file existence (may be '-' for stdin)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
##############################################################################
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# xsltproc is the only external helper used below
checkcmds xsltproc
# Run the inline stylesheet (read from stdin, '-') on the input file. The
# HTML output takes its title from the first element whose local name is
# 'title', if there is one, and its body holds one paragraph for each text
# node which is not made of white space only.
xsltproc --novalid --nonet - "$infile" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/">
<html>
<head>
<xsl:if test="//*[local-name() = 'title']">
<title>
<xsl:value-of select="//*[local-name() = 'title'][1]"/>
</title>
</xsl:if>
</head>
<body>
<xsl:apply-templates/>
</body>
</html>
</xsl:template>
<xsl:template match="text()">
<xsl:if test="string-length(normalize-space(.)) &gt; 0">
<p><xsl:value-of select="."/></p>
<xsl:text>
</xsl:text>
</xsl:if>
</xsl:template>
<xsl:template match="*">
<xsl:apply-templates/>
</xsl:template>
</xsl:stylesheet>
EOF
# The exit status is 0 whatever the xsltproc result was
exit 0

View File

@ -74,7 +74,7 @@ class XMLExtractor:
self.em.rclog("%s: bad data: " % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnow)
return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):

View File

@ -51,7 +51,7 @@ application/javascript = internal text/plain
# - with unrtf: rtf files disguised as doc files.
# The default is now again to use rcldoc. Use raw antiword if speed is more
# important to you than catching all data.
application/msword = exec rcldoc
application/msword = execm rcldoc.py
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
# You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics
@ -59,41 +59,40 @@ application/msword = exec rcldoc
# Also handle the mime type returned by "file -i" for a suffix-less word
# file. This could probably just as well be an excel file, but we have to
# choose one.
application/vnd.ms-office = exec rcldoc
application/vnd.ms-office = execm rcldoc.py
application/ogg = execm rclaudio
application/pdf = exec rclpdf
# application/pdf = execm rclmpdf.py
application/pdf = execm rclmpdf.py
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
application/vnd.ms-excel = exec rclxls
application/vnd.ms-powerpoint = exec rclppt
application/vnd.oasis.opendocument.text = exec rclsoff
application/vnd.oasis.opendocument.text-template = exec rclsoff
application/vnd.oasis.opendocument.presentation = exec rclsoff
application/vnd.oasis.opendocument.spreadsheet = exec rclsoff
application/vnd.oasis.opendocument.graphics = exec rclsoff
application/vnd.ms-excel = execm rclxls.py
application/vnd.ms-powerpoint = execm rclppt.py
application/vnd.oasis.opendocument.text = execm rclsoff.py
application/vnd.oasis.opendocument.text-template = execm rclsoff.py
application/vnd.oasis.opendocument.presentation = execm rclsoff.py
application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py
application/vnd.oasis.opendocument.graphics = execm rclsoff.py
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
exec rclopxml
execm rclopxml.py
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
exec rclopxml
execm rclopxml.py
application/vnd.openxmlformats-officedocument.presentationml.template = \
exec rclopxml
execm rclopxml.py
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
exec rclopxml
execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
exec rclopxml
execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
exec rclopxml
application/vnd.sun.xml.calc = exec rclsoff
application/vnd.sun.xml.calc.template = exec rclsoff
application/vnd.sun.xml.draw = exec rclsoff
application/vnd.sun.xml.draw.template = exec rclsoff
application/vnd.sun.xml.impress = exec rclsoff
application/vnd.sun.xml.impress.template = exec rclsoff
application/vnd.sun.xml.math = exec rclsoff
application/vnd.sun.xml.writer = exec rclsoff
application/vnd.sun.xml.writer.global = exec rclsoff
application/vnd.sun.xml.writer.template = exec rclsoff
execm rclopxml.py
application/vnd.sun.xml.calc = execm rclsoff.py
application/vnd.sun.xml.calc.template = execm rclsoff.py
application/vnd.sun.xml.draw = execm rclsoff.py
application/vnd.sun.xml.draw.template = execm rclsoff.py
application/vnd.sun.xml.impress = execm rclsoff.py
application/vnd.sun.xml.impress.template = execm rclsoff.py
application/vnd.sun.xml.math = execm rclsoff.py
application/vnd.sun.xml.writer = execm rclsoff.py
application/vnd.sun.xml.writer.global = execm rclsoff.py
application/vnd.sun.xml.writer.template = execm rclsoff.py
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = exec rclabw
application/x-awk = internal text/plain
@ -101,7 +100,7 @@ application/x-chm = execm rclchm
application/x-dia-diagram = execm rcldia;mimetype=text/plain
application/x-dvi = exec rcldvi
application/x-flac = execm rclaudio
application/x-gnote = exec rclxml
application/x-gnote = execm rclxml.py
application/x-gnuinfo = execm rclinfo
application/x-gnumeric = exec rclgnm
application/x-kword = exec rclkwd
@ -124,14 +123,14 @@ audio/mpeg = execm rclaudio
audio/mp4 = execm rclaudio
audio/aac = execm rclaudio
audio/x-karaoke = execm rclkar
image/gif = execm rclimg
image/jp2 = execm rclimg
image/jpeg = execm rclimg
image/png = execm rclimg
image/tiff = execm rclimg
image/gif = execm rclimg.py
image/jp2 = execm rclimg.py
image/jpeg = execm rclimg.py
image/png = execm rclimg.py
image/tiff = execm rclimg.py
image/vnd.djvu = exec rcldjvu
image/svg+xml = exec rclsvg
image/x-xcf = execm rclimg
image/svg+xml = execm rclsvg.py
image/x-xcf = execm rclimg.py
inode/symlink = internal
application/x-zerosize = internal
inode/x-empty = internal application/x-zerosize
@ -159,9 +158,8 @@ text/x-python = exec rclpython
text/x-shellscript = internal text/plain
text/x-srt = internal text/plain
text/x-tex = exec rcltex
application/xml = exec rclxml
text/xml = exec rclxml
application/xml = execm rclxml.py
text/xml = execm rclxml.py
# Using these instead of the two above would index all parameter and tag
# names, attribute values etc, instead of just the text content.
#application/xml = internal text/plain

View File

@ -135,7 +135,8 @@ message/rfc822 = internal
text/calendar = execm python rclics;mimetype=text/plain
text/html = internal
text/plain = internal
text/rtf = execm python rclrtf.py
text/rtf = exec unrtf --nopict --html;mimetype=text/html
#text/rtf = execm python rclrtf.py
text/x-c = internal
text/x-c++ = internal
text/x-c+ = internal