diff --git a/.hgignore b/.hgignore index 825ae08c..b3592592 100644 --- a/.hgignore +++ b/.hgignore @@ -90,11 +90,8 @@ src/qtgui/Makefile src/qtgui/qrc_recoll.cpp src/qtgui/recoll src/qtgui/recoll.app -src/query/alldeps -src/query/recollq src/sampleconf/rclmon.sh src/sampleconf/recoll.conf -src/utils/alldeps tests/casediac/aspdict.en.rws tests/casediac/idxstatus.txt tests/casediac/index.pid diff --git a/src/filters/rcldoc b/src/filters/rcldoc deleted file mode 100755 index 78d28cbd..00000000 --- a/src/filters/rcldoc +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/sh -# @(#$Id: rcldoc,v 1.8 2007-06-08 13:51:08 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from an msword file by executing either antiword -# or wvware -# -#================================================================ - - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rcldoc" -filetype=ms-word - -decoder="antiword -t -i 1 -m UTF-8" - - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds awk antiword iconv - -# We need to do some strange stuff to retrieve the status from antiword. Things -# would be simpler if we relied on using bash. -# Explanations: -#http://stackoverflow.com/questions/1221833/bash-pipe-output-and-capture-exit-status - -stdintoexitstatus() { - read exitstatus - return $exitstatus -} - -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -(((($decoder "$infile"; echo $? >&3) | -awk 'BEGIN'\ -' { - cont = "" - gotdata = 0 -} -{ - if (!($0 ~ /^[ ]*$/) && gotdata == 0) { - print "" - print "" - print "\n\n

" - gotdata = 1 - } - $0 = cont $0 - cont = "" - - if ($0 ~ /[­-]$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH-1) - $0 = line - } - - if($0 == "\f") { - print "


" - next - } - - if (gotdata == 1) { - gsub(/&/, "\\&", $0) - gsub(//, "\\>", $0) - - print $0 "
" - } -} -END { - if (gotdata == 1) - print "

" -}' >&4) 3>&1) | stdintoexitstatus) 4>&1 - - -# Antiword rarely fails, we try to catch the most common reasons: -if test $? -eq 1 ; then - # Check actual document type - mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'` - - if test X"$mtype" = Xtext/rtf; then - # RTF document disguising as msword either because it has a .doc - # extension or because it's an attachment with a wrong mime. - exec `dirname $0`/rclrtf "$infile" - fi - - if test X"$mtype" = Xtext/plain; then - # Someone gave a .doc ext to their texts. Happens... - exec `dirname $0`/rcltext "$infile" - fi - - if test X"$mtype" = Xapplication/msword; then - # Actually application/msword: try wvWare, which is much - # slower and we don't use it by default, but it handles some - # files that antiword won't, so use it as a last resort. - if iscmd wvWare ; then - exec wvWare --nographics --charset=utf-8 "$infile" - fi - fi - - # else let the error be... - exit 1 -fi diff --git a/src/filters/rclopxml b/src/filters/rclopxml deleted file mode 100755 index 13ba54a2..00000000 --- a/src/filters/rclopxml +++ /dev/null @@ -1,238 +0,0 @@ -#!/bin/sh -# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -#================================================================ -# Extract text from an openxml msword file (will be extended for spreadsheets) -# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname=rclopxml -filetype=openxml - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc unzip - -# We need a temporary directory -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpdir=$ttdir/rclopxml_tmp$$ -mkdir $tmpdir || exit 1 -mkdir $tmpdir/rclopxmltmp || exit 1 - -cleanup() -{ - # Note that we're using a constant part (rclopxmltmp), that hopefully - # guarantees that we can't do big mistakes here. - rm -rf $tmpdir/rclopxmltmp - rmdir $tmpdir -} - -trap cleanup EXIT HUP QUIT INT TERM - -# Unzip the input file and change to the unzipped directory -unzip -q -d $tmpdir/rclopxmltmp "$infile" -cd $tmpdir/rclopxmltmp - -echo ' -' - -xsltproc --novalid --nonet - docProps/core.xml < - - - - - - - - - - - - - - - - - author - - - - - - - - - - - - date - - - - - - - - - - - - -EOF - -echo ' -' - -filename='' -if test -f word/document.xml ; then - filenames=word/document.xml - tagmatch="w:p" - xmlns_decls=' - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" - xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" - xmlns:o="urn:schemas-microsoft-com:office:office" - xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" - xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" - xmlns:v="urn:schemas-microsoft-com:vml" - xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" - xmlns:w10="urn:schemas-microsoft-com:office:word" - xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" - xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" - ' - -elif test -f xl/sharedStrings.xml ; then - filenames=xl/sharedStrings.xml - tagmatch='x:t' - xmlns_decls=' - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" - xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main" - ' - -elif test -f ppt/slides/slide1.xml ; then - filenames=`echo ppt/slides/slide*.xml` - tagmatch='a:t' - xmlns_decls=' - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" - xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" - xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" - xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" - ' -# I want to suppress text output for all except a:t, don't know how to do it -# help ! At least get rid of these: - moretemplates=' - - -' -else - # ?? - exit 1 -fi - - -for filename in $filenames;do -xsltproc --novalid --nonet - $filename < - - - - - -
- -
-
- - -

- -

-
- - $moretemplates - -
-EOF -done - -echo '' diff --git a/src/filters/rclpdf b/src/filters/rclpdf deleted file mode 100755 index 3e11ea6d..00000000 --- a/src/filters/rclpdf +++ /dev/null @@ -1,351 +0,0 @@ -#!/bin/bash -#================================================================ -# Copyright (C) 2015 J.F. Dockes -# There used to be Estraier content in there, but I quite believe that is not -# the case any more. -# This file is licensed under the GPL v2 -#================================================================ -# Convert a pdf file to HTML. -# -# We use pdftotext from the xpdf/poppler-utils package. -# -# pdftotext sometimes outputs unescaped text inside HTML text sections. -# We try to correct. -# -# If pdftotext produces no text and tesseract is available, we try to -# perform OCR. As this can be very slow and the result not always -# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists -# -# We guess the OCR language in order of preference: -# - From the content of a ".ocrpdflang" file if it exists in the same -# directory as the PDF -# - From an RECOLL_TESSERACT_LANG environment variable -# - From the content of $RECOLL_CONFDIR/ocrpdf -# - Default to "eng" -# -# Uncomment the following if you get better results without. The -# pdftotext manual says that the option is no longer recommended The -# difference in output seems mostly the removal of soft-hyphens when -# -raw is not set -# optionraw=-raw - -# set variables -progname="rclpdf" -filetype=pdf - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds pdftotext iconv awk - -ocrpossible=0 -if iscmd tesseract; then - if iscmd pdftoppm; then - ocrpossible=1 - fi -fi -confdir=${RECOLL_CONFDIR:-~/.recoll} -test ! -f "$confdir/ocrpdf" && ocrpossible=0 - -tmpdir= - -cleanup() -{ - # Note that we're using a constant part (rclpdftmp), that hopefully - # guarantees that we can't do big mistakes with the -rf here. - if test ! -z "$tmpdir"; then - rm -rf $tmpdir/rclpdftmp - rmdir $tmpdir - fi -} - -trap cleanup EXIT HUP QUIT INT TERM - -runpdftotext() -{ - # Test poppler version: at some point before 0.24, poppler began - # to properly escape text inside the header (but not the body). - XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'` - MAJOR=`echo $XYZ | cut -d. -f 1` - MINOR=`echo $XYZ | cut -d. -f 2` - escapeheader=1 - escapebody=1 - if test "$MAJOR" -gt 0 ; then - escapeheader=0 - elif test "$MINOR" -ge 24; then - escapeheader=0; - fi - - # Run pdftotext and fix the result (add a charset tag and fix the - # html escaping). The escaping is a half-hearted job. We do try to - # fix some header fields, only for those which are single-line. - pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | - iconv -f UTF-8 -t UTF-8 -c -s | - awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\ -' { - inbodypre = 0 - cont = "" -} -function escapehtml(s) -{ - gsub(/&/, "\\&", s) - gsub(//, "\\>", s) - gsub(/"/, "\\"", s) - return s -} -{ - $0 = cont $0 - cont = "" - # Insert charset meta tag at end of header - if(inbodypre == 0 && $0 ~ /<\/head>/) { - match($0, /<\/head>/) - part1 = substr($0, 0, RSTART-1) - part2 = substr($0, RSTART, length($0)) - charsetmeta = "" - $0 = part1 charsetmeta "\n" part2 - } - if(inbodypre == 0 && $0 ~ /.*<\/title>/){ - match($0, /<title>.*<\/title>/) - part1 = substr($0, 0, RSTART-1) - mid = substr($0, RSTART, RLENGTH) - part2 = substr($0, RSTART + RLENGTH, length($0)) - gsub(/<title>/, "", mid) - gsub(/<\/title>/, "", mid) - if (escapeheader) { - mid = escapehtml(mid) - } - mid = "<title>" mid "" - $0 = part1 mid part2 - } - # This matches all single-line meta fields - if(inbodypre == 0 && $0 ~ /content=".*"\/>/){ - match($0, /content=".*"\/>/) - part1 = substr($0, 0, RSTART-1) - mid = substr($0, RSTART, RLENGTH) - part2 = substr($0, RSTART + RLENGTH, length($0)) - gsub(/content="/, "", mid) - gsub(/"\/>/, "", mid) - if (escapeheader) { - mid = escapehtml(mid) - } - mid = "content=\"" mid "\"/>" - $0 = part1 mid part2 - } - - # Recoll treats "Subject" as a "title" element (based on emails). The PDF - # "Subject" metadata field is more like an HTML "description" - if(inbodypre == 0 && $0 ~ /"){ - # Begin of body text. - inbodypre++ - print $0 - next - } else if ($0 ~ /<\/pre>/){ - inbodypre-- - print $0 - next - } else if ($0 ~ /[­-]$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH-1) - $0 = line - # print "LINE [" $0 "] CONT[" cont "]" - } - if(inbodypre > 0 && escapebody){ - $0 = escapehtml($0) - } - print $0 -} -' -} - -# If we're not equipped for ocr, just run pdftotext to stdout -if test $ocrpossible -eq 0; then - runpdftotext - exit $? -fi - - -# tesseract is installed, prepare for running it. -# We need to check the pdftotext output, but we don't want to run -# it twice. Use a temporary file. -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpdir=$ttdir/rclpdf_tmp$$ -mkdir $tmpdir || senderror mkdir $tmpdir failed -mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed - -# Run pdftotext into the temp file -pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile -runpdftotext > $pdftxtfile - -# If text is big, or small but not only tags and empty lines, output -# it. Given the contents check which we perform, a file in which the -# only text content is metadata (pdf description field), will be run -# through OCR, which is not necessarily what we would want. It would -# be possible to detect the situation if this proved an issue. -txtsize=`ls -l $pdftxtfile | awk '{print $5}'` -txtempty=0 -# Use grep to check if there is regular text in there. Only do it on -# small outputs -if test $txtsize -lt 5000 ; then - realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile` - test -z "$realtext" && txtempty=1 -fi - -if test $txtempty -eq 0; then - # pdftotext produced actual output, use it. No OCR - cat $pdftxtfile - exit 0 -fi - -# PDF has no text content and tesseract is available. Give it a try -pdflangfile=`dirname "$infile"`/.ocrpdflang -if test -f "$pdflangfile"; then - tesseractlang=`cat "$pdflangfile"` -fi - -# Try to guess tesseract language. This should depend on the input -# file, but we have no general way to determine it. So use the -# environment and hope for the best. -if test -z "$tesseractlang"; then - tesseractlang=${RECOLL_TESSERACT_LANG} - if test -z "$tesseractlang"; then - # Half assed trial to guess from LANG then default to english - localelang=`echo $LANG | awk -F_ '{print $1}'` - # echo localelang "$localelang" >&2 - case "$localelang" in - en) tesseractlang=eng;; - de) tesseractlang=deu;; - fr) tesseractlang=fra;; - # Someone will have to add more tesseract language codes here. - esac - - test -z "$tessractlang" && tesseractlang=`cat "$confdir/ocrpdf"` - - test -z "$tesseractlang" && tesseractlang="eng" - fi -fi - -# echo tesseractlang "$tesseractlang" >&2 - -TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile" -TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX" - -# split pdf-pages -ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1) -if [ $? -ne 0 ] ; then - senderror "pdftoppm: $ERR_MSG" -fi - -for i in $TMPFILE* ; do - if [ -s "$i" ] ; then - - tesseract $i $i -l $tesseractlang > $TESSERRORFILE 2>&1 - TESSERR=$? - # ignore tesseract start message - LINECOUNT=$(wc -l < $TESSERRORFILE) - if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then - echo "tesseract-error $TESSERR page $i in $infile" >&2 - # sort "compacts" leptonica-output - cat $TESSERRORFILE | sort -u >&2 - fi - # else - # debugging purpose - # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX) - # echo "no pdftoppm in $infile cp to $SICFILE" >&2 - # cp -a $infile $SICFILE - # fi - fi -done - -# don't output "empty" HTML-Files -CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m) -if [ "$CHARS" -gt 0 ] ; then - echo "
" 
-    cat "$TMPFILE"*.txt | \
-        awk '{
-  gsub(/&/, "\\&", $0)
-  gsub(//, "\\>", $0)
-  print $0
-}
-'
-    echo "
" -fi \ No newline at end of file diff --git a/src/filters/rclppt b/src/filters/rclppt deleted file mode 100755 index 467acab6..00000000 --- a/src/filters/rclppt +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/sh -# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -#================================================================ -# Handle powerpoint files for recoll. -# Use unoconv, this is very slow, but catppt just can't handle the majority -# of semi-modern ppt files - -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclppt" -filetype=powerpoint - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -filtersdir=`dirname $0` -checkcmds $filtersdir/ppt-dump.py - -mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text" - -cat < - -
-EOF
-
-$mso "$infile"| sed -e 's/'
diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py
index d506aecd..c2319e18 100755
--- a/src/filters/rclppt.py
+++ b/src/filters/rclppt.py
@@ -6,15 +6,12 @@ import re
 import sys
 import os
 
-# Processing the output from unrtf
 class PPTProcessData:
     def __init__(self, em):
         self.em = em
         self.out = ""
         self.gotdata = 0
 
-    # Some versions of unrtf put out a garbled charset line.
-    # Apart from this, we pass the data untouched.
     def takeLine(self, line):
         if not self.gotdata:
             self.out += '''''' + \
@@ -22,7 +19,7 @@ class PPTProcessData:
                         '''content="text/html;charset=UTF-8">''' + \
                         '''
'''
             self.gotdata = True
-        self.out += self.em.htmlescape(line)
+        self.out += self.em.htmlescape(line) + "
\n" def wrapData(self): return self.out + '''
''' diff --git a/src/filters/rclrtf b/src/filters/rclrtf deleted file mode 100755 index abfe6e2f..00000000 --- a/src/filters/rclrtf +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/sh -# @(#$Id: rclrtf,v 1.5 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes -# Some inspiration from estraier -#================================================================ -# convert rtf to html, by executing the unrtf program: -# http://www.gnu.org/software/unrtf/unrtf.html -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclrtl" -filetype=rtf - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds awk unrtf - -# output the result -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -# The thing about the charset is that unrtf outputs a garbled one. -unrtf --nopict --html "$infile" 2> /dev/null | -awk 'BEGIN'\ -' { - gothead = 0 -} -/<\/head>/{ - if (gothead == 0) { - printf("\n") - gothead = 1 - } -} -/&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc unzip - -# We need a temporary directory -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpdir=$ttdir/rclsoff_tmp$$ -mkdir $tmpdir || exit 1 -mkdir $tmpdir/rclsofftmp || exit 1 - -cleanup() -{ - # Note that we're using a constant part (rclsofftmp), that hopefully - # guarantees that we can't do big mistakes here. - rm -rf $tmpdir/rclsofftmp - rmdir $tmpdir -} - -trap cleanup EXIT HUP QUIT INT TERM - -# Unzip the input file and change to the unzipped directory -unzip -q -d $tmpdir/rclsofftmp "$infile" -cd $tmpdir/rclsofftmp - -echo ' -' - -xsltproc --novalid --nonet - meta.xml < - - - - - - - - - - - - - - <xsl:value-of select="."/> - - - - - - abstract - - - - - - - - - - keywords - - - - - - - - - - author - - - - - - - - - - keywords - - - - - - - - -EOF - -echo '' - -xsltproc --novalid --nonet - content.xml < - - - - - -

- -
- - -

- -
- - - - - - -
-
- - - - - -
-EOF -echo '' -cd / -exit 0 diff --git a/src/filters/rclsvg b/src/filters/rclsvg deleted file mode 100755 index be5d75b5..00000000 --- a/src/filters/rclsvg +++ /dev/null @@ -1,161 +0,0 @@ -#!/bin/sh - -#================================================================ -# Extract text from a Scalable Vector Graphics file -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclsvg" -filetype=svg - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds xsltproc - -xsltproc --novalid --nonet - "$infile" < - - - - - - - - - - - - - - - - - - - - - - keywords - - - - - - - - - - author - - - - - - - - - - keywords - - - - - - - - - - description - - - - - - - - - <xsl:value-of select="."/> - - - - -

- -
- -
-EOF - -exit 0 diff --git a/src/filters/rclsvg.py b/src/filters/rclsvg.py index ccf2e9cf..7fde9f2e 100755 --- a/src/filters/rclsvg.py +++ b/src/filters/rclsvg.py @@ -116,7 +116,7 @@ class SVGExtractor: self.em.rclog("%s: bad data: " % (fn, err)) return (False, "", "", rclexecm.RclExecM.eofnow) - return (True, docdata, "", rclexecm.RclExecM.eofnow) + return (True, docdata, "", rclexecm.RclExecM.eofnext) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): diff --git a/src/filters/rcltext b/src/filters/rcltext deleted file mode 100755 index 05a27d7e..00000000 --- a/src/filters/rcltext +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/sh -# @(#$Id: rcltext,v 1.1 2008-09-12 11:30:03 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Wrap generic text (ie: program text) in html -# Assumes ascii or iso-8859-1 -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rcltext" -filetype=text - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds sed -echo '
'
-
-sed -e 's/\&/\&/g' -e 's/'
diff --git a/src/filters/rclxls b/src/filters/rclxls
deleted file mode 100755
index eb5d4904..00000000
--- a/src/filters/rclxls
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $  (C) 2004 J.F.Dockes
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
-
-#================================================================
-# Handle excel files for recoll. 
-#================================================================
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclxls"
-filetype=excel
-
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-top=`dirname $0`
-XLSDUMP="$top/xls-dump.py"
-XMLTOCSV="$top/xlsxmltocsv.py"
-
-checkcmds $XLSDUMP $XLSTOCSV
-
-# output the result
-echo ''
-#echo '' "$title" ''
-echo ''
-echo ''
-echo '
'
-
-$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
-   $XMLTOCSV | \
-   sed -e 's/'
-echo ''
-
-# exit normally
-exit 0
diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py
index e25a6cb1..cbae1692 100755
--- a/src/filters/rclxls.py
+++ b/src/filters/rclxls.py
@@ -8,7 +8,6 @@ import sys
 import os
 import xml.sax
 
-# Processing the output from unrtf
 class XLSProcessData:
     def __init__(self, em):
         self.em = em
@@ -16,8 +15,6 @@ class XLSProcessData:
         self.gotdata = 0
         self.xmldata = ""
         
-    # Some versions of unrtf put out a garbled charset line.
-    # Apart from this, we pass the data untouched.
     def takeLine(self, line):
         if not self.gotdata:
             self.out += '''''' + \
diff --git a/src/filters/rclxml b/src/filters/rclxml
deleted file mode 100755
index 62d7846d..00000000
--- a/src/filters/rclxml
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/bin/sh
-
-#================================================================
-# Extract text from a generic XML file (Justus Piater)
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclxml"
-filetype=xml
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc
-
-xsltproc --novalid --nonet - "$infile" <
-
-
-  
-
-  
-    
-      
-	
-	  
-	    <xsl:value-of select="//*[local-name() = 'title'][1]"/>
-	  
-	
-      
-      
-	
-      
-    
-  
-
-  
-    
-      

- - -
-
- - - - - -
-EOF - -exit 0 diff --git a/src/filters/rclxml.py b/src/filters/rclxml.py index 06709ce6..78e93f8a 100755 --- a/src/filters/rclxml.py +++ b/src/filters/rclxml.py @@ -74,7 +74,7 @@ class XMLExtractor: self.em.rclog("%s: bad data: " % (fn, err)) return (False, "", "", rclexecm.RclExecM.eofnow) - return (True, docdata, "", rclexecm.RclExecM.eofnow) + return (True, docdata, "", rclexecm.RclExecM.eofnext) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 3eaed760..35c0d3dd 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -51,7 +51,7 @@ application/javascript = internal text/plain # - with unrtf: rtf files disguising as doc files. # The default is now again to use rcldoc. Use raw antiword if speed is more # important for you than catching all data, -application/msword = exec rcldoc +application/msword = execm rcldoc.py #application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain # You can also use wvware directly but it's much slower. # application/msword = exec wvWare --charset=utf-8 --nographics @@ -59,41 +59,40 @@ application/msword = exec rcldoc # Also Handle the mime type returned by "file -i" for a suffix-less word # file. This could probably just as well be an excel file, but we have to # chose one. -application/vnd.ms-office = exec rcldoc +application/vnd.ms-office = execm rcldoc.py application/ogg = execm rclaudio -application/pdf = exec rclpdf -# application/pdf = execm rclmpdf.py +application/pdf = execm rclmpdf.py application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain -application/vnd.ms-excel = exec rclxls -application/vnd.ms-powerpoint = exec rclppt -application/vnd.oasis.opendocument.text = exec rclsoff -application/vnd.oasis.opendocument.text-template = exec rclsoff -application/vnd.oasis.opendocument.presentation = exec rclsoff -application/vnd.oasis.opendocument.spreadsheet = exec rclsoff -application/vnd.oasis.opendocument.graphics = exec rclsoff +application/vnd.ms-excel = execm rclxls.py +application/vnd.ms-powerpoint = execm rclppt.py +application/vnd.oasis.opendocument.text = execm rclsoff.py +application/vnd.oasis.opendocument.text-template = execm rclsoff.py +application/vnd.oasis.opendocument.presentation = execm rclsoff.py +application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py +application/vnd.oasis.opendocument.graphics = execm rclsoff.py application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ - exec rclopxml + execm rclopxml.py application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ - exec rclopxml + execm rclopxml.py application/vnd.openxmlformats-officedocument.presentationml.template = \ - exec rclopxml + execm rclopxml.py application/vnd.openxmlformats-officedocument.presentationml.presentation = \ - exec rclopxml + execm rclopxml.py application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ - exec rclopxml + execm rclopxml.py application/vnd.openxmlformats-officedocument.spreadsheetml.template =\ - exec rclopxml -application/vnd.sun.xml.calc = exec rclsoff -application/vnd.sun.xml.calc.template = exec rclsoff -application/vnd.sun.xml.draw = exec rclsoff -application/vnd.sun.xml.draw.template = exec rclsoff -application/vnd.sun.xml.impress = exec rclsoff -application/vnd.sun.xml.impress.template = exec rclsoff -application/vnd.sun.xml.math = exec rclsoff -application/vnd.sun.xml.writer = exec rclsoff -application/vnd.sun.xml.writer.global = exec rclsoff -application/vnd.sun.xml.writer.template = exec rclsoff + execm rclopxml.py +application/vnd.sun.xml.calc = execm rclsoff.py +application/vnd.sun.xml.calc.template = execm rclsoff.py +application/vnd.sun.xml.draw = execm rclsoff.py +application/vnd.sun.xml.draw.template = execm rclsoff.py +application/vnd.sun.xml.impress = execm rclsoff.py +application/vnd.sun.xml.impress.template = execm rclsoff.py +application/vnd.sun.xml.math = execm rclsoff.py +application/vnd.sun.xml.writer = execm rclsoff.py +application/vnd.sun.xml.writer.global = execm rclsoff.py +application/vnd.sun.xml.writer.template = execm rclsoff.py application/vnd.wordperfect = exec wpd2html;mimetype=text/html application/x-abiword = exec rclabw application/x-awk = internal text/plain @@ -101,7 +100,7 @@ application/x-chm = execm rclchm application/x-dia-diagram = execm rcldia;mimetype=text/plain application/x-dvi = exec rcldvi application/x-flac = execm rclaudio -application/x-gnote = exec rclxml +application/x-gnote = execm rclxml.py application/x-gnuinfo = execm rclinfo application/x-gnumeric = exec rclgnm application/x-kword = exec rclkwd @@ -124,14 +123,14 @@ audio/mpeg = execm rclaudio audio/mp4 = execm rclaudio audio/aac = execm rclaudio audio/x-karaoke = execm rclkar -image/gif = execm rclimg -image/jp2 = execm rclimg -image/jpeg = execm rclimg -image/png = execm rclimg -image/tiff = execm rclimg +image/gif = execm rclimg.py +image/jp2 = execm rclimg.py +image/jpeg = execm rclimg.py +image/png = execm rclimg.py +image/tiff = execm rclimg.py image/vnd.djvu = exec rcldjvu -image/svg+xml = exec rclsvg -image/x-xcf = execm rclimg +image/svg+xml = execm rclsvg.py +image/x-xcf = execm rclimg.py inode/symlink = internal application/x-zerosize = internal inode/x-empty = internal application/x-zerosize @@ -159,9 +158,8 @@ text/x-python = exec rclpython text/x-shellscript = internal text/plain text/x-srt = internal text/plain text/x-tex = exec rcltex - -application/xml = exec rclxml -text/xml = exec rclxml +application/xml = execm rclxml.py +text/xml = execm rclxml.py # Using these instead of the two above would index all parameter and tag # names, attribute values etc, instead of just the text content. #application/xml = internal text/plain diff --git a/src/windows/mimeconf b/src/windows/mimeconf index 26dfdd7b..6230c007 100644 --- a/src/windows/mimeconf +++ b/src/windows/mimeconf @@ -135,7 +135,8 @@ message/rfc822 = internal text/calendar = execm python rclics;mimetype=text/plain text/html = internal text/plain = internal -text/rtf = execm python rclrtf.py +text/rtf = exec unrtf --nopict --html;mimetype=text/html +#text/rtf = execm python rclrtf.py text/x-c = internal text/x-c++ = internal text/x-c+ = internal