diff --git a/.hgignore b/.hgignore
index 825ae08c..b3592592 100644
--- a/.hgignore
+++ b/.hgignore
@@ -90,11 +90,8 @@ src/qtgui/Makefile
src/qtgui/qrc_recoll.cpp
src/qtgui/recoll
src/qtgui/recoll.app
-src/query/alldeps
-src/query/recollq
src/sampleconf/rclmon.sh
src/sampleconf/recoll.conf
-src/utils/alldeps
tests/casediac/aspdict.en.rws
tests/casediac/idxstatus.txt
tests/casediac/index.pid
diff --git a/src/filters/rcldoc b/src/filters/rcldoc
deleted file mode 100755
index 78d28cbd..00000000
--- a/src/filters/rcldoc
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/bin/sh
-# @(#$Id: rcldoc,v 1.8 2007-06-08 13:51:08 dockes Exp $ (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
-#================================================================
-# Extract text from an msword file by executing either antiword
-# or wvware
-#
-#================================================================
-
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rcldoc"
-filetype=ms-word
-
-decoder="antiword -t -i 1 -m UTF-8"
-
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds awk antiword iconv
-
-# We need to do some strange stuff to retrieve the status from antiword. Things
-# would be simpler if we relied on using bash.
-# Explanations:
-#http://stackoverflow.com/questions/1221833/bash-pipe-output-and-capture-exit-status
-
-stdintoexitstatus() {
- read exitstatus
- return $exitstatus
-}
-
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-(((($decoder "$infile"; echo $? >&3) |
-awk 'BEGIN'\
-' {
- cont = ""
- gotdata = 0
-}
-{
- if (!($0 ~ /^[ ]*$/) && gotdata == 0) {
- print "
"
- print ""
- print "\n\n"
- gotdata = 1
- }
- $0 = cont $0
- cont = ""
-
- if ($0 ~ /[-]$/) {
- # Note : soft-hyphen is iso8859 0xad
- # Break at last whitespace
- match($0, "[ \t][^ \t]+$")
- line = substr($0, 0, RSTART)
- cont = substr($0, RSTART, RLENGTH-1)
- $0 = line
- }
-
- if($0 == "\f") {
- print "
"
- next
- }
-
- if (gotdata == 1) {
- gsub(/&/, "\\&", $0)
- gsub(/, "\\<", $0)
- gsub(/>/, "\\>", $0)
-
- print $0 "
"
- }
-}
-END {
- if (gotdata == 1)
- print "
"
-}' >&4) 3>&1) | stdintoexitstatus) 4>&1
-
-
-# Antiword rarely fails, we try to catch the most common reasons:
-if test $? -eq 1 ; then
- # Check actual document type
- mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
-
- if test X"$mtype" = Xtext/rtf; then
- # RTF document disguising as msword either because it has a .doc
- # extension or because it's an attachment with a wrong mime.
- exec `dirname $0`/rclrtf "$infile"
- fi
-
- if test X"$mtype" = Xtext/plain; then
- # Someone gave a .doc ext to their texts. Happens...
- exec `dirname $0`/rcltext "$infile"
- fi
-
- if test X"$mtype" = Xapplication/msword; then
- # Actually application/msword: try wvWare, which is much
- # slower and we don't use it by default, but it handles some
- # files that antiword won't, so use it as a last resort.
- if iscmd wvWare ; then
- exec wvWare --nographics --charset=utf-8 "$infile"
- fi
- fi
-
- # else let the error be...
- exit 1
-fi
diff --git a/src/filters/rclopxml b/src/filters/rclopxml
deleted file mode 100755
index 13ba54a2..00000000
--- a/src/filters/rclopxml
+++ /dev/null
@@ -1,238 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
-#================================================================
-# Extract text from an openxml msword file (will be extended for spreadsheets)
-# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname=rclopxml
-filetype=openxml
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc unzip
-
-# We need a temporary directory
-if test z"$RECOLL_TMPDIR" != z; then
- ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
- ttdir=$TMPDIR
-else
- ttdir=/tmp
-fi
-tmpdir=$ttdir/rclopxml_tmp$$
-mkdir $tmpdir || exit 1
-mkdir $tmpdir/rclopxmltmp || exit 1
-
-cleanup()
-{
- # Note that we're using a constant part (rclopxmltmp), that hopefully
- # guarantees that we can't do big mistakes here.
- rm -rf $tmpdir/rclopxmltmp
- rmdir $tmpdir
-}
-
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Unzip the input file and change to the unzipped directory
-unzip -q -d $tmpdir/rclopxmltmp "$infile"
-cd $tmpdir/rclopxmltmp
-
-echo '
-'
-
-xsltproc --novalid --nonet - docProps/core.xml <
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- author
-
-
-
-
-
-
-
-
-
-
-
- date
-
-
-
-
-
-
-
-
-
-
-
-
-EOF
-
-echo '
-'
-
-filename=''
-if test -f word/document.xml ; then
- filenames=word/document.xml
- tagmatch="w:p"
- xmlns_decls='
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
- xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
- xmlns:o="urn:schemas-microsoft-com:office:office"
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
- xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
- xmlns:v="urn:schemas-microsoft-com:vml"
- xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
- xmlns:w10="urn:schemas-microsoft-com:office:word"
- xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
- xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
- '
-
-elif test -f xl/sharedStrings.xml ; then
- filenames=xl/sharedStrings.xml
- tagmatch='x:t'
- xmlns_decls='
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
- xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
- '
-
-elif test -f ppt/slides/slide1.xml ; then
- filenames=`echo ppt/slides/slide*.xml`
- tagmatch='a:t'
- xmlns_decls='
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
- xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
- '
-# I want to suppress text output for all except a:t, don't know how to do it
-# help ! At least get rid of these:
- moretemplates='
-
-
-'
-else
- # ??
- exit 1
-fi
-
-
-for filename in $filenames;do
-xsltproc --novalid --nonet - $filename <
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- $moretemplates
-
-
-EOF
-done
-
-echo ''
diff --git a/src/filters/rclpdf b/src/filters/rclpdf
deleted file mode 100755
index 3e11ea6d..00000000
--- a/src/filters/rclpdf
+++ /dev/null
@@ -1,351 +0,0 @@
-#!/bin/bash
-#================================================================
-# Copyright (C) 2015 J.F. Dockes
-# There used to be Estraier content in there, but I quite believe that is not
-# the case any more.
-# This file is licensed under the GPL v2
-#================================================================
-# Convert a pdf file to HTML.
-#
-# We use pdftotext from the xpdf/poppler-utils package.
-#
-# pdftotext sometimes outputs unescaped text inside HTML text sections.
-# We try to correct.
-#
-# If pdftotext produces no text and tesseract is available, we try to
-# perform OCR. As this can be very slow and the result not always
-# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
-#
-# We guess the OCR language in order of preference:
-# - From the content of a ".ocrpdflang" file if it exists in the same
-# directory as the PDF
-# - From an RECOLL_TESSERACT_LANG environment variable
-# - From the content of $RECOLL_CONFDIR/ocrpdf
-# - Default to "eng"
-#
-# Uncomment the following if you get better results without. The
-# pdftotext manual says that the option is no longer recommended The
-# difference in output seems mostly the removal of soft-hyphens when
-# -raw is not set
-# optionraw=-raw
-
-# set variables
-progname="rclpdf"
-filetype=pdf
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds pdftotext iconv awk
-
-ocrpossible=0
-if iscmd tesseract; then
- if iscmd pdftoppm; then
- ocrpossible=1
- fi
-fi
-confdir=${RECOLL_CONFDIR:-~/.recoll}
-test ! -f "$confdir/ocrpdf" && ocrpossible=0
-
-tmpdir=
-
-cleanup()
-{
- # Note that we're using a constant part (rclpdftmp), that hopefully
- # guarantees that we can't do big mistakes with the -rf here.
- if test ! -z "$tmpdir"; then
- rm -rf $tmpdir/rclpdftmp
- rmdir $tmpdir
- fi
-}
-
-trap cleanup EXIT HUP QUIT INT TERM
-
-runpdftotext()
-{
- # Test poppler version: at some point before 0.24, poppler began
- # to properly escape text inside the header (but not the body).
- XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'`
- MAJOR=`echo $XYZ | cut -d. -f 1`
- MINOR=`echo $XYZ | cut -d. -f 2`
- escapeheader=1
- escapebody=1
- if test "$MAJOR" -gt 0 ; then
- escapeheader=0
- elif test "$MINOR" -ge 24; then
- escapeheader=0;
- fi
-
- # Run pdftotext and fix the result (add a charset tag and fix the
- # html escaping). The escaping is a half-hearted job. We do try to
- # fix some header fields, only for those which are single-line.
- pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
- iconv -f UTF-8 -t UTF-8 -c -s |
- awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\
-' {
- inbodypre = 0
- cont = ""
-}
-function escapehtml(s)
-{
- gsub(/&/, "\\&", s)
- gsub(/, "\\<", s)
- gsub(/>/, "\\>", s)
- gsub(/"/, "\\"", s)
- return s
-}
-{
- $0 = cont $0
- cont = ""
- # Insert charset meta tag at end of header
- if(inbodypre == 0 && $0 ~ /<\/head>/) {
- match($0, /<\/head>/)
- part1 = substr($0, 0, RSTART-1)
- part2 = substr($0, RSTART, length($0))
- charsetmeta = ""
- $0 = part1 charsetmeta "\n" part2
- }
- if(inbodypre == 0 && $0 ~ /.*<\/title>/){
- match($0, /.*<\/title>/)
- part1 = substr($0, 0, RSTART-1)
- mid = substr($0, RSTART, RLENGTH)
- part2 = substr($0, RSTART + RLENGTH, length($0))
- gsub(//, "", mid)
- gsub(/<\/title>/, "", mid)
- if (escapeheader) {
- mid = escapehtml(mid)
- }
- mid = "" mid ""
- $0 = part1 mid part2
- }
- # This matches all single-line meta fields
- if(inbodypre == 0 && $0 ~ /content=".*"\/>/){
- match($0, /content=".*"\/>/)
- part1 = substr($0, 0, RSTART-1)
- mid = substr($0, RSTART, RLENGTH)
- part2 = substr($0, RSTART + RLENGTH, length($0))
- gsub(/content="/, "", mid)
- gsub(/"\/>/, "", mid)
- if (escapeheader) {
- mid = escapehtml(mid)
- }
- mid = "content=\"" mid "\"/>"
- $0 = part1 mid part2
- }
-
- # Recoll treats "Subject" as a "title" element (based on emails). The PDF
- # "Subject" metadata field is more like an HTML "description"
- if(inbodypre == 0 && $0 ~ /"){
- # Begin of body text.
- inbodypre++
- print $0
- next
- } else if ($0 ~ /<\/pre>/){
- inbodypre--
- print $0
- next
- } else if ($0 ~ /[-]$/) {
- # Note : soft-hyphen is iso8859 0xad
- # Break at last whitespace
- match($0, "[ \t][^ \t]+$")
- line = substr($0, 0, RSTART)
- cont = substr($0, RSTART, RLENGTH-1)
- $0 = line
- # print "LINE [" $0 "] CONT[" cont "]"
- }
- if(inbodypre > 0 && escapebody){
- $0 = escapehtml($0)
- }
- print $0
-}
-'
-}
-
-# If we're not equipped for ocr, just run pdftotext to stdout
-if test $ocrpossible -eq 0; then
- runpdftotext
- exit $?
-fi
-
-
-# tesseract is installed, prepare for running it.
-# We need to check the pdftotext output, but we don't want to run
-# it twice. Use a temporary file.
-if test z"$RECOLL_TMPDIR" != z; then
- ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
- ttdir=$TMPDIR
-else
- ttdir=/tmp
-fi
-tmpdir=$ttdir/rclpdf_tmp$$
-mkdir $tmpdir || senderror mkdir $tmpdir failed
-mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed
-
-# Run pdftotext into the temp file
-pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
-runpdftotext > $pdftxtfile
-
-# If text is big, or small but not only tags and empty lines, output
-# it. Given the contents check which we perform, a file in which the
-# only text content is metadata (pdf description field), will be run
-# through OCR, which is not necessarily what we would want. It would
-# be possible to detect the situation if this proved an issue.
-txtsize=`ls -l $pdftxtfile | awk '{print $5}'`
-txtempty=0
-# Use grep to check if there is regular text in there. Only do it on
-# small outputs
-if test $txtsize -lt 5000 ; then
- realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile`
- test -z "$realtext" && txtempty=1
-fi
-
-if test $txtempty -eq 0; then
- # pdftotext produced actual output, use it. No OCR
- cat $pdftxtfile
- exit 0
-fi
-
-# PDF has no text content and tesseract is available. Give it a try
-pdflangfile=`dirname "$infile"`/.ocrpdflang
-if test -f "$pdflangfile"; then
- tesseractlang=`cat "$pdflangfile"`
-fi
-
-# Try to guess tesseract language. This should depend on the input
-# file, but we have no general way to determine it. So use the
-# environment and hope for the best.
-if test -z "$tesseractlang"; then
- tesseractlang=${RECOLL_TESSERACT_LANG}
- if test -z "$tesseractlang"; then
- # Half assed trial to guess from LANG then default to english
- localelang=`echo $LANG | awk -F_ '{print $1}'`
- # echo localelang "$localelang" >&2
- case "$localelang" in
- en) tesseractlang=eng;;
- de) tesseractlang=deu;;
- fr) tesseractlang=fra;;
- # Someone will have to add more tesseract language codes here.
- esac
-
- test -z "$tessractlang" && tesseractlang=`cat "$confdir/ocrpdf"`
-
- test -z "$tesseractlang" && tesseractlang="eng"
- fi
-fi
-
-# echo tesseractlang "$tesseractlang" >&2
-
-TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
-TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
-
-# split pdf-pages
-ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
-if [ $? -ne 0 ] ; then
- senderror "pdftoppm: $ERR_MSG"
-fi
-
-for i in $TMPFILE* ; do
- if [ -s "$i" ] ; then
-
- tesseract $i $i -l $tesseractlang > $TESSERRORFILE 2>&1
- TESSERR=$?
- # ignore tesseract start message
- LINECOUNT=$(wc -l < $TESSERRORFILE)
- if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then
- echo "tesseract-error $TESSERR page $i in $infile" >&2
- # sort "compacts" leptonica-output
- cat $TESSERRORFILE | sort -u >&2
- fi
- # else
- # debugging purpose
- # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
- # echo "no pdftoppm in $infile cp to $SICFILE" >&2
- # cp -a $infile $SICFILE
- # fi
- fi
-done
-
-# don't output "empty" HTML-Files
-CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
-if [ "$CHARS" -gt 0 ] ; then
- echo ""
- cat "$TMPFILE"*.txt | \
- awk '{
- gsub(/&/, "\\&", $0)
- gsub(/, "\\<", $0)
- gsub(/>/, "\\>", $0)
- print $0
-}
-'
- echo ""
-fi
\ No newline at end of file
diff --git a/src/filters/rclppt b/src/filters/rclppt
deleted file mode 100755
index 467acab6..00000000
--- a/src/filters/rclppt
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-#================================================================
-# Handle powerpoint files for recoll.
-# Use unoconv, this is very slow, but catppt just can't handle the majority
-# of semi-modern ppt files
-
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclppt"
-filetype=powerpoint
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-filtersdir=`dirname $0`
-checkcmds $filtersdir/ppt-dump.py
-
-mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text"
-
-cat <
-
-
-EOF
-
-$mso "$infile"| sed -e 's/</g' -e 's/&/&/g'
-
-echo '
'
diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py
index d506aecd..c2319e18 100755
--- a/src/filters/rclppt.py
+++ b/src/filters/rclppt.py
@@ -6,15 +6,12 @@ import re
import sys
import os
-# Processing the output from unrtf
class PPTProcessData:
def __init__(self, em):
self.em = em
self.out = ""
self.gotdata = 0
- # Some versions of unrtf put out a garbled charset line.
- # Apart from this, we pass the data untouched.
def takeLine(self, line):
if not self.gotdata:
self.out += '''''' + \
@@ -22,7 +19,7 @@ class PPTProcessData:
'''content="text/html;charset=UTF-8">''' + \
''''''
self.gotdata = True
- self.out += self.em.htmlescape(line)
+ self.out += self.em.htmlescape(line) + "
\n"
def wrapData(self):
return self.out + ''''''
diff --git a/src/filters/rclrtf b/src/filters/rclrtf
deleted file mode 100755
index abfe6e2f..00000000
--- a/src/filters/rclrtf
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclrtf,v 1.5 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
-# Some inspiration from estraier
-#================================================================
-# convert rtf to html, by executing the unrtf program:
-# http://www.gnu.org/software/unrtf/unrtf.html
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclrtl"
-filetype=rtf
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds awk unrtf
-
-# output the result
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-# The thing about the charset is that unrtf outputs a garbled one.
-unrtf --nopict --html "$infile" 2> /dev/null |
-awk 'BEGIN'\
-' {
- gothead = 0
-}
-/<\/head>/{
- if (gothead == 0) {
- printf("\n")
- gothead = 1
- }
-}
-/&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc unzip
-
-# We need a temporary directory
-if test z"$RECOLL_TMPDIR" != z; then
- ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
- ttdir=$TMPDIR
-else
- ttdir=/tmp
-fi
-tmpdir=$ttdir/rclsoff_tmp$$
-mkdir $tmpdir || exit 1
-mkdir $tmpdir/rclsofftmp || exit 1
-
-cleanup()
-{
- # Note that we're using a constant part (rclsofftmp), that hopefully
- # guarantees that we can't do big mistakes here.
- rm -rf $tmpdir/rclsofftmp
- rmdir $tmpdir
-}
-
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Unzip the input file and change to the unzipped directory
-unzip -q -d $tmpdir/rclsofftmp "$infile"
-cd $tmpdir/rclsofftmp
-
-echo '
-'
-
-xsltproc --novalid --nonet - meta.xml <
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- abstract
-
-
-
-
-
-
-
-
-
- keywords
-
-
-
-
-
-
-
-
-
- author
-
-
-
-
-
-
-
-
-
- keywords
-
-
-
-
-
-
-
-
-EOF
-
-echo ''
-
-xsltproc --novalid --nonet - content.xml <
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-EOF
-echo ''
-cd /
-exit 0
diff --git a/src/filters/rclsvg b/src/filters/rclsvg
deleted file mode 100755
index be5d75b5..00000000
--- a/src/filters/rclsvg
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/sh
-
-#================================================================
-# Extract text from a Scalable Vector Graphics file
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclsvg"
-filetype=svg
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc
-
-xsltproc --novalid --nonet - "$infile" <
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- keywords
-
-
-
-
-
-
-
-
-
- author
-
-
-
-
-
-
-
-
-
- keywords
-
-
-
-
-
-
-
-
-
- description
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-EOF
-
-exit 0
diff --git a/src/filters/rclsvg.py b/src/filters/rclsvg.py
index ccf2e9cf..7fde9f2e 100755
--- a/src/filters/rclsvg.py
+++ b/src/filters/rclsvg.py
@@ -116,7 +116,7 @@ class SVGExtractor:
self.em.rclog("%s: bad data: " % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
- return (True, docdata, "", rclexecm.RclExecM.eofnow)
+ return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
diff --git a/src/filters/rcltext b/src/filters/rcltext
deleted file mode 100755
index 05a27d7e..00000000
--- a/src/filters/rcltext
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/bin/sh
-# @(#$Id: rcltext,v 1.1 2008-09-12 11:30:03 dockes Exp $ (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
-#================================================================
-# Wrap generic text (ie: program text) in html
-# Assumes ascii or iso-8859-1
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rcltext"
-filetype=text
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds sed
-echo ''
-
-sed -e 's/\&/\&/g' -e 's/\</g' "$infile"
-
-echo '
'
diff --git a/src/filters/rclxls b/src/filters/rclxls
deleted file mode 100755
index eb5d4904..00000000
--- a/src/filters/rclxls
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-#================================================================
-# Handle excel files for recoll.
-#================================================================
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclxls"
-filetype=excel
-
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-top=`dirname $0`
-XLSDUMP="$top/xls-dump.py"
-XMLTOCSV="$top/xlsxmltocsv.py"
-
-checkcmds $XLSDUMP $XLSTOCSV
-
-# output the result
-echo ''
-#echo '' "$title" ''
-echo ''
-echo ''
-echo ''
-
-$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
- $XMLTOCSV | \
- sed -e 's/</g' -e 's/&/&/g'
-
-echo '
'
-echo ''
-
-# exit normally
-exit 0
diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py
index e25a6cb1..cbae1692 100755
--- a/src/filters/rclxls.py
+++ b/src/filters/rclxls.py
@@ -8,7 +8,6 @@ import sys
import os
import xml.sax
-# Processing the output from unrtf
class XLSProcessData:
def __init__(self, em):
self.em = em
@@ -16,8 +15,6 @@ class XLSProcessData:
self.gotdata = 0
self.xmldata = ""
- # Some versions of unrtf put out a garbled charset line.
- # Apart from this, we pass the data untouched.
def takeLine(self, line):
if not self.gotdata:
self.out += '''''' + \
diff --git a/src/filters/rclxml b/src/filters/rclxml
deleted file mode 100755
index 62d7846d..00000000
--- a/src/filters/rclxml
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/bin/sh
-
-#================================================================
-# Extract text from a generic XML file (Justus Piater)
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclxml"
-filetype=xml
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
- echo RECFILTERROR $*
- # Also alert on stderr just in case
- echo ":2:$progname::: $*" 1>&2
- exit 1
-}
-
-iscmd()
-{
- cmd=$1
- case $cmd in
- */*)
- if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
- *)
- oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
- for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
- return 1 ;;
- esac
-}
-
-checkcmds()
-{
- for cmd in $*;do
- if iscmd $cmd
- then
- a=1
- else
- senderror HELPERNOTFOUND $cmd
- fi
- done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help"
-then
- echo "Convert a $filetype file to HTML text for Recoll indexing."
- echo "Usage: $progname [infile]"
- exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
- senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc
-
-xsltproc --novalid --nonet - "$infile" <
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-EOF
-
-exit 0
diff --git a/src/filters/rclxml.py b/src/filters/rclxml.py
index 06709ce6..78e93f8a 100755
--- a/src/filters/rclxml.py
+++ b/src/filters/rclxml.py
@@ -74,7 +74,7 @@ class XMLExtractor:
self.em.rclog("%s: bad data: " % (fn, err))
return (False, "", "", rclexecm.RclExecM.eofnow)
- return (True, docdata, "", rclexecm.RclExecM.eofnow)
+ return (True, docdata, "", rclexecm.RclExecM.eofnext)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index 3eaed760..35c0d3dd 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -51,7 +51,7 @@ application/javascript = internal text/plain
# - with unrtf: rtf files disguising as doc files.
# The default is now again to use rcldoc. Use raw antiword if speed is more
# important for you than catching all data,
-application/msword = exec rcldoc
+application/msword = execm rcldoc.py
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
# You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics
@@ -59,41 +59,40 @@ application/msword = exec rcldoc
# Also Handle the mime type returned by "file -i" for a suffix-less word
# file. This could probably just as well be an excel file, but we have to
# chose one.
-application/vnd.ms-office = exec rcldoc
+application/vnd.ms-office = execm rcldoc.py
application/ogg = execm rclaudio
-application/pdf = exec rclpdf
-# application/pdf = execm rclmpdf.py
+application/pdf = execm rclmpdf.py
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
-application/vnd.ms-excel = exec rclxls
-application/vnd.ms-powerpoint = exec rclppt
-application/vnd.oasis.opendocument.text = exec rclsoff
-application/vnd.oasis.opendocument.text-template = exec rclsoff
-application/vnd.oasis.opendocument.presentation = exec rclsoff
-application/vnd.oasis.opendocument.spreadsheet = exec rclsoff
-application/vnd.oasis.opendocument.graphics = exec rclsoff
+application/vnd.ms-excel = execm rclxls.py
+application/vnd.ms-powerpoint = execm rclppt.py
+application/vnd.oasis.opendocument.text = execm rclsoff.py
+application/vnd.oasis.opendocument.text-template = execm rclsoff.py
+application/vnd.oasis.opendocument.presentation = execm rclsoff.py
+application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py
+application/vnd.oasis.opendocument.graphics = execm rclsoff.py
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
- exec rclopxml
+ execm rclopxml.py
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
- exec rclopxml
+ execm rclopxml.py
application/vnd.openxmlformats-officedocument.presentationml.template = \
- exec rclopxml
+ execm rclopxml.py
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
- exec rclopxml
+ execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
- exec rclopxml
+ execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
- exec rclopxml
-application/vnd.sun.xml.calc = exec rclsoff
-application/vnd.sun.xml.calc.template = exec rclsoff
-application/vnd.sun.xml.draw = exec rclsoff
-application/vnd.sun.xml.draw.template = exec rclsoff
-application/vnd.sun.xml.impress = exec rclsoff
-application/vnd.sun.xml.impress.template = exec rclsoff
-application/vnd.sun.xml.math = exec rclsoff
-application/vnd.sun.xml.writer = exec rclsoff
-application/vnd.sun.xml.writer.global = exec rclsoff
-application/vnd.sun.xml.writer.template = exec rclsoff
+ execm rclopxml.py
+application/vnd.sun.xml.calc = execm rclsoff.py
+application/vnd.sun.xml.calc.template = execm rclsoff.py
+application/vnd.sun.xml.draw = execm rclsoff.py
+application/vnd.sun.xml.draw.template = execm rclsoff.py
+application/vnd.sun.xml.impress = execm rclsoff.py
+application/vnd.sun.xml.impress.template = execm rclsoff.py
+application/vnd.sun.xml.math = execm rclsoff.py
+application/vnd.sun.xml.writer = execm rclsoff.py
+application/vnd.sun.xml.writer.global = execm rclsoff.py
+application/vnd.sun.xml.writer.template = execm rclsoff.py
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = exec rclabw
application/x-awk = internal text/plain
@@ -101,7 +100,7 @@ application/x-chm = execm rclchm
application/x-dia-diagram = execm rcldia;mimetype=text/plain
application/x-dvi = exec rcldvi
application/x-flac = execm rclaudio
-application/x-gnote = exec rclxml
+application/x-gnote = execm rclxml.py
application/x-gnuinfo = execm rclinfo
application/x-gnumeric = exec rclgnm
application/x-kword = exec rclkwd
@@ -124,14 +123,14 @@ audio/mpeg = execm rclaudio
audio/mp4 = execm rclaudio
audio/aac = execm rclaudio
audio/x-karaoke = execm rclkar
-image/gif = execm rclimg
-image/jp2 = execm rclimg
-image/jpeg = execm rclimg
-image/png = execm rclimg
-image/tiff = execm rclimg
+image/gif = execm rclimg.py
+image/jp2 = execm rclimg.py
+image/jpeg = execm rclimg.py
+image/png = execm rclimg.py
+image/tiff = execm rclimg.py
image/vnd.djvu = exec rcldjvu
-image/svg+xml = exec rclsvg
-image/x-xcf = execm rclimg
+image/svg+xml = execm rclsvg.py
+image/x-xcf = execm rclimg.py
inode/symlink = internal
application/x-zerosize = internal
inode/x-empty = internal application/x-zerosize
@@ -159,9 +158,8 @@ text/x-python = exec rclpython
text/x-shellscript = internal text/plain
text/x-srt = internal text/plain
text/x-tex = exec rcltex
-
-application/xml = exec rclxml
-text/xml = exec rclxml
+application/xml = execm rclxml.py
+text/xml = execm rclxml.py
# Using these instead of the two above would index all parameter and tag
# names, attribute values etc, instead of just the text content.
#application/xml = internal text/plain
diff --git a/src/windows/mimeconf b/src/windows/mimeconf
index 26dfdd7b..6230c007 100644
--- a/src/windows/mimeconf
+++ b/src/windows/mimeconf
@@ -135,7 +135,8 @@ message/rfc822 = internal
text/calendar = execm python rclics;mimetype=text/plain
text/html = internal
text/plain = internal
-text/rtf = execm python rclrtf.py
+text/rtf = exec unrtf --nopict --html;mimetype=text/html
+#text/rtf = execm python rclrtf.py
text/x-c = internal
text/x-c++ = internal
text/x-c+ = internal