Use the python-based filters written for ms-win on Linux too

2015-10-11 08:41:15 +02:00 · 2015-10-11 08:41:15 +02:00 · 4c3e112c27
commit 4c3e112c27
parent dc9d9900be
17 changed files with 41 additions and 1740 deletions
--- a/.hgignore
+++ b/.hgignore
@ -90,11 +90,8 @@ src/qtgui/Makefile
 src/qtgui/qrc_recoll.cpp
 src/qtgui/recoll
 src/qtgui/recoll.app
-src/query/alldeps
-src/query/recollq
 src/sampleconf/rclmon.sh
 src/sampleconf/recoll.conf
-src/utils/alldeps
 tests/casediac/aspdict.en.rws
 tests/casediac/idxstatus.txt
 tests/casediac/index.pid
--- a/src/filters/rcldoc
+++ b/src/filters/rcldoc
@ -1,176 +0,0 @@
-#!/bin/sh
-# @(#$Id: rcldoc,v 1.8 2007-06-08 13:51:08 dockes Exp $  (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
-#================================================================
-# Extract text from an msword file by executing either antiword 
-#  or wvware
-#
-#================================================================
-
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rcldoc"
-filetype=ms-word
-
-decoder="antiword -t -i 1 -m UTF-8"
-
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds awk antiword iconv
-
-# We need to do some strange stuff to retrieve the status from antiword. Things
-# would be simpler if we relied on using bash.
-# Explanations:
-#http://stackoverflow.com/questions/1221833/bash-pipe-output-and-capture-exit-status
-
-stdintoexitstatus() {
-  read exitstatus
-  return $exitstatus
-}
-
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-(((($decoder "$infile"; echo $? >&3) |
-awk 'BEGIN'\
-' {
-  cont = ""
-  gotdata = 0
-}
-{
-  if (!($0 ~ /^[ 	]*$/) && gotdata == 0) {
-    print "<html><head><title></title>"
-    print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
-    print "</head>\n<body>\n<p>"
-    gotdata = 1
-  }
-  $0 = cont $0
-  cont = ""
-
-  if ($0 ~ /[-]$/) {
-    # Note : soft-hyphen is iso8859 0xad
-    # Break at last whitespace
-    match($0, "[ \t][^ \t]+$")
-    line = substr($0, 0, RSTART)
-    cont = substr($0, RSTART, RLENGTH-1)
-    $0 = line
-  }
-
-  if($0 == "\f") {
-    print "</p><hr><p>"
-    next
-  } 
-
-  if (gotdata == 1) {
-    gsub(/&/, "\\&amp;", $0)
-    gsub(/</, "\\&lt;", $0)
-    gsub(/>/, "\\&gt;", $0)
-
-    print $0 "<br>"
-  }
-}
-END {
-    if (gotdata == 1)
-      print "</p></body></html>"
-}' >&4) 3>&1) | stdintoexitstatus) 4>&1
-
-
-# Antiword rarely fails, we try to catch the most common reasons:
-if test $? -eq 1 ; then
-    # Check actual document type 
-    mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
-
-    if test X"$mtype" = Xtext/rtf; then
-      # RTF document disguising as msword either because it has a .doc
-      # extension or because it's an attachment with a wrong mime.
-      exec `dirname $0`/rclrtf "$infile"
-    fi
-    
-    if test X"$mtype" = Xtext/plain; then
-      # Someone gave a .doc ext to their texts. Happens...
-      exec `dirname $0`/rcltext "$infile"
-    fi
-
-    if test X"$mtype" = Xapplication/msword; then
-      # Actually application/msword: try wvWare, which is much
-      # slower and we don't use it by default, but it handles some
-      # files that antiword won't, so use it as a last resort.
-      if iscmd wvWare ; then
-        exec wvWare --nographics --charset=utf-8 "$infile"
-      fi
-    fi
-
-    # else let the error be...
-    exit 1
-fi
--- a/src/filters/rclopxml
+++ b/src/filters/rclopxml
@ -1,238 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $  (C) 2004 J.F.Dockes
-#================================================================
-# Extract text from an openxml msword file (will be extended for spreadsheets)
-# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname=rclopxml
-filetype=openxml
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc unzip
-
-# We need a temporary directory
-if test z"$RECOLL_TMPDIR" != z; then
-   ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
-   ttdir=$TMPDIR
-else
-   ttdir=/tmp
-fi
-tmpdir=$ttdir/rclopxml_tmp$$
-mkdir $tmpdir || exit 1
-mkdir $tmpdir/rclopxmltmp || exit 1
-
-cleanup()
-{
-    # Note that we're using a constant part (rclopxmltmp), that hopefully
-    # guarantees that we can't do big mistakes here.
-    rm -rf $tmpdir/rclopxmltmp
-    rmdir $tmpdir
-}
-    
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Unzip the input file and change to the unzipped directory
-unzip -q -d $tmpdir/rclopxmltmp "$infile"
-cd $tmpdir/rclopxmltmp
-
-echo '<html>
-<head>'
-
-xsltproc --novalid --nonet - docProps/core.xml <<EOF
-<?xml version="1.0"?>
-<xsl:stylesheet 
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
- xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xmlns:dcterms="http://purl.org/dc/terms/"
- xmlns:dcmitype="http://purl.org/dc/dcmitype/"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-
-<!--  <xsl:output method="text"/> -->
-  <xsl:output omit-xml-declaration="yes"/>
-
-  <xsl:template match="cp:coreProperties">
-    <xsl:text>&#10;</xsl:text>
-    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
-    <xsl:text>&#10;</xsl:text>
-    <xsl:apply-templates/>
-  </xsl:template>
-
-  <xsl:template match="dc:creator">
-    <meta>
-    <xsl:attribute name="name">
-      <!-- <xsl:value-of select="name()"/> pour sortir tous les meta avec 
-       le meme nom que dans le xml (si on devenait dc-natif) -->
-      <xsl:text>author</xsl:text> 
-    </xsl:attribute>
-    <xsl:attribute name="content">
-       <xsl:value-of select="."/>
-    </xsl:attribute>
-    </meta>
-    <xsl:text>&#10;</xsl:text>
-  </xsl:template>
-
-  <xsl:template match="dcterms:modified">
-    <meta>
-    <xsl:attribute name="name">
-      <xsl:text>date</xsl:text> 
-    </xsl:attribute>
-    <xsl:attribute name="content">
-       <xsl:value-of select="."/>
-    </xsl:attribute>
-    </meta>
-    <xsl:text>&#10;</xsl:text>
-  </xsl:template>
-
-  <xsl:template match="*">
-  </xsl:template>
-
-</xsl:stylesheet>
-EOF
-
-echo '</head>
-<body>'
-
-filename=''
-if test -f word/document.xml ; then
- filenames=word/document.xml 
- tagmatch="w:p"
- xmlns_decls='
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
- xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
- xmlns:o="urn:schemas-microsoft-com:office:office"
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
- xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
- xmlns:v="urn:schemas-microsoft-com:vml"
- xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
- xmlns:w10="urn:schemas-microsoft-com:office:word"
- xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
- xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
- '
-
-elif test -f xl/sharedStrings.xml ; then
- filenames=xl/sharedStrings.xml 
- tagmatch='x:t'
- xmlns_decls='
-   xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
-   xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
-  '
-
-elif test -f ppt/slides/slide1.xml ; then
- filenames=`echo ppt/slides/slide*.xml`
- tagmatch='a:t'
- xmlns_decls='
-  xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
-  xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" 
-  xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
- '
-# I want to suppress text output for all except a:t, don't know how to do it
-# help ! At least get rid of these:
- moretemplates='
-  <xsl:template match="p:attrName">
-  </xsl:template>
-'
-else
-    # ??
-    exit 1
-fi
-
-
-for filename in $filenames;do
-xsltproc --novalid --nonet - $filename <<EOF
-<?xml version="1.0"?>
-<xsl:stylesheet $xmlns_decls >
-
- <xsl:output omit-xml-declaration="yes"/>
-
- <xsl:template match="/">
-  <div>
-  <xsl:apply-templates/> 
-  </div>
-</xsl:template>
-
- <xsl:template match="$tagmatch">
-  <p>
-  <xsl:value-of select="."/>
-  </p>
- </xsl:template>
-
- $moretemplates
-
-</xsl:stylesheet>
-EOF
-done
-
-echo '</html>'
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@ -1,351 +0,0 @@
-#!/bin/bash
-#================================================================
-# Copyright (C) 2015 J.F. Dockes
-# There used to be Estraier content in there, but I quite believe that is not
-# the case any more.
-# This file is licensed under the GPL v2
-#================================================================
-# Convert a pdf file to  HTML.
-#
-# We use pdftotext from the xpdf/poppler-utils package. 
-#
-# pdftotext sometimes outputs unescaped text inside HTML text sections.
-# We try to correct.
-#
-# If pdftotext produces no text and tesseract is available, we try to
-# perform OCR. As this can be very slow and the result not always
-# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
-#
-# We guess the OCR language in order of preference:
-#  - From the content of a ".ocrpdflang" file if it exists in the same
-#    directory as the PDF
-#  - From an RECOLL_TESSERACT_LANG environment variable
-#  - From the content of $RECOLL_CONFDIR/ocrpdf
-#  - Default to "eng"
-#
-# Uncomment the following if you get better results without. The
-# pdftotext manual says that the option is no longer recommended The
-# difference in output seems mostly the removal of soft-hyphens when
-# -raw is not set
-# optionraw=-raw
-
-# set variables
-progname="rclpdf"
-filetype=pdf
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds pdftotext iconv awk
-
-ocrpossible=0
-if iscmd tesseract; then
-    if iscmd pdftoppm; then
-        ocrpossible=1
-    fi
-fi
-confdir=${RECOLL_CONFDIR:-~/.recoll}
-test ! -f "$confdir/ocrpdf" && ocrpossible=0
-
-tmpdir=
-
-cleanup()
-{
-    # Note that we're using a constant part (rclpdftmp), that hopefully
-    # guarantees that we can't do big mistakes with the -rf here.
-    if test ! -z "$tmpdir"; then
-        rm -rf $tmpdir/rclpdftmp
-        rmdir $tmpdir
-    fi
-}
-    
-trap cleanup EXIT HUP QUIT INT TERM
-
-runpdftotext()
-{
-    # Test poppler version: at some point before 0.24, poppler began
-    # to properly escape text inside the header (but not the body).
-    XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'`
-    MAJOR=`echo $XYZ | cut -d. -f 1`
-    MINOR=`echo $XYZ | cut -d. -f 2`
-    escapeheader=1
-    escapebody=1
-    if test "$MAJOR" -gt 0 ; then
-        escapeheader=0
-    elif test "$MINOR" -ge 24; then
-        escapeheader=0;
-    fi
-
-    # Run pdftotext and fix the result (add a charset tag and fix the
-    # html escaping). The escaping is a half-hearted job. We do try to
-    # fix some header fields, only for those which are single-line.
-    pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
-    iconv -f UTF-8 -t UTF-8 -c -s |
-    awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\
-' {
-  inbodypre = 0
-  cont = ""
-}
-function escapehtml(s)
-{
-  gsub(/&/, "\\&amp;", s)
-  gsub(/</, "\\&lt;", s)
-  gsub(/>/, "\\&gt;", s)
-  gsub(/"/, "\\&quot;", s)
-  return s
-}    
-{
-  $0 = cont $0
-  cont = ""
-  # Insert charset meta tag at end of header
-  if(inbodypre == 0 && $0 ~ /<\/head>/) {
-    match($0, /<\/head>/)
-    part1 = substr($0, 0, RSTART-1)
-    part2 = substr($0, RSTART, length($0))
-    charsetmeta = "<meta http-equiv=\"Content-Type\" "\
-                  "content=\"text/html; charset=UTF-8\">"
-    $0 =  part1 charsetmeta "\n" part2
-  }
-  if(inbodypre == 0 && $0 ~ /<title>.*<\/title>/){
-    match($0, /<title>.*<\/title>/)
-    part1 = substr($0, 0, RSTART-1)
-    mid = substr($0, RSTART, RLENGTH)
-    part2 = substr($0, RSTART + RLENGTH, length($0))
-    gsub(/<title>/, "", mid)
-    gsub(/<\/title>/, "", mid)
-    if (escapeheader) {
-        mid = escapehtml(mid)
-    }
-    mid = "<title>" mid "</title>"
-    $0 = part1 mid part2
-  }
-  # This matches all single-line meta fields
-  if(inbodypre == 0 && $0 ~ /content=".*"\/>/){
-    match($0, /content=".*"\/>/)
-    part1 = substr($0, 0, RSTART-1)
-    mid = substr($0, RSTART, RLENGTH)
-    part2 = substr($0, RSTART + RLENGTH, length($0))
-    gsub(/content="/, "", mid)
-    gsub(/"\/>/, "", mid)
-    if (escapeheader) {
-        mid = escapehtml(mid)
-    }
-    mid = "content=\"" mid "\"/>"
-    $0 = part1 mid part2
-  }
-
-  # Recoll treats "Subject" as a "title" element (based on emails). The PDF
-  # "Subject" metadata field is more like an HTML "description"
-  if(inbodypre == 0 && $0 ~ /<meta ?name="Subject"/){
-      gsub(/="Subject"/, "=\"Description\"", $0)
-  }
-
-  if ($0 == "<pre>"){
-    # Begin of body text.
-    inbodypre++
-    print $0
-    next
-  } else if ($0 ~ /<\/pre>/){
-    inbodypre--
-    print $0 
-    next
-  } else if ($0 ~ /[-]$/) {
-    # Note : soft-hyphen is iso8859 0xad
-    # Break at last whitespace
-    match($0, "[ \t][^ \t]+$")
-    line = substr($0, 0, RSTART)
-    cont = substr($0, RSTART, RLENGTH-1)
-    $0 = line
-    # print "LINE [" $0 "] CONT[" cont "]"
-  } 
-  if(inbodypre > 0 && escapebody){
-      $0 = escapehtml($0)
-  }
-  print $0
-}
-'
-}
-
-# If we're not equipped for ocr, just run pdftotext to stdout
-if test $ocrpossible -eq 0; then
-    runpdftotext
-    exit $?
-fi
-
-
-# tesseract is installed, prepare for running it.
-# We need to check the pdftotext output, but we don't want to run
-# it twice. Use a temporary file.
-if test z"$RECOLL_TMPDIR" != z; then
-    ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
-    ttdir=$TMPDIR
-else
-    ttdir=/tmp
-fi
-tmpdir=$ttdir/rclpdf_tmp$$
-mkdir $tmpdir || senderror mkdir $tmpdir failed
-mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed
-
-# Run pdftotext into the temp file
-pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
-runpdftotext > $pdftxtfile
-
-# If text is big, or small but not only tags and empty lines, output
-# it. Given the contents check which we perform, a file in which the
-# only text content is metadata (pdf description field), will be run
-# through OCR, which is not necessarily what we would want. It would
-# be possible to detect the situation if this proved an issue.
-txtsize=`ls -l $pdftxtfile | awk '{print $5}'`
-txtempty=0
-# Use grep to check if there is regular text in there. Only do it on
-# small outputs
-if test $txtsize -lt 5000 ; then
-    realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile`
-    test -z "$realtext" && txtempty=1
-fi
-
-if test $txtempty -eq 0; then
-    # pdftotext produced actual output, use it. No OCR
-    cat $pdftxtfile
-    exit 0
-fi
-
-# PDF has no text content and tesseract is available. Give it a try
-pdflangfile=`dirname "$infile"`/.ocrpdflang
-if test -f "$pdflangfile"; then
-    tesseractlang=`cat "$pdflangfile"`
-fi
-
-# Try to guess tesseract language. This should depend on the input
-# file, but we have no general way to determine it. So use the
-# environment and hope for the best.
-if test -z "$tesseractlang"; then
-    tesseractlang=${RECOLL_TESSERACT_LANG}
-    if test -z "$tesseractlang"; then
-        # Half assed trial to guess from LANG then default to english
-        localelang=`echo $LANG | awk -F_ '{print $1}'`
-        # echo localelang "$localelang" >&2
-        case "$localelang" in 
-        en) tesseractlang=eng;;
-        de) tesseractlang=deu;;
-        fr) tesseractlang=fra;;
-        # Someone will have to add more tesseract language codes here.
-        esac
-
-        test -z "$tessractlang" && tesseractlang=`cat "$confdir/ocrpdf"`
-
-        test -z "$tesseractlang" && tesseractlang="eng"
-    fi
-fi
-
-# echo tesseractlang "$tesseractlang" >&2
-
-TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
-TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
-
-# split pdf-pages
-ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
-if [ $? -ne 0 ] ; then
-    senderror "pdftoppm: $ERR_MSG"
-fi
-
-for i in $TMPFILE* ; do
-    if [ -s "$i" ] ; then
-
-        tesseract $i $i -l  $tesseractlang > $TESSERRORFILE 2>&1
-        TESSERR=$?
-        # ignore tesseract start message
-        LINECOUNT=$(wc -l < $TESSERRORFILE)
-        if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then
-            echo "tesseract-error $TESSERR page $i in $infile" >&2
-            # sort "compacts" leptonica-output
-            cat $TESSERRORFILE | sort -u >&2
-        fi
-        # else
-            # debugging purpose
-            # SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
-            # echo "no pdftoppm in $infile cp to $SICFILE" >&2
-            # cp -a $infile $SICFILE
-        # fi
-    fi
-done
-
-# don't output "empty" HTML-Files
-CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m) 
-if [ "$CHARS" -gt 0 ] ; then
-    echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>" 
-    cat "$TMPFILE"*.txt | \
-        awk '{
-  gsub(/&/, "\\&amp;", $0)
-  gsub(/</, "\\&lt;", $0)
-  gsub(/>/, "\\&gt;", $0)
-  print $0
-}
-'
-    echo "</pre></body></html>"
-fi
--- a/src/filters/rclppt
+++ b/src/filters/rclppt
@ -1,110 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $  (C) 2004 J.F.Dockes
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
-
-#================================================================
-# Handle powerpoint files for recoll. 
-# Use unoconv, this is very slow, but catppt just can't handle the majority
-# of semi-modern ppt files
-
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclppt"
-filetype=powerpoint
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-filtersdir=`dirname $0`
-checkcmds $filtersdir/ppt-dump.py
-
-mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text"
-
-cat <<EOF
-<html><head>
-<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
-</head><body><pre>
-EOF
-
-$mso "$infile"| sed -e 's/</&lt;/g' -e 's/&/&amp;/g' 
-
-echo '</pre></body></html>'
--- a/src/filters/rclppt.py
+++ b/src/filters/rclppt.py
@ -6,15 +6,12 @@ import re
 import sys
 import os

-# Processing the output from unrtf
 class PPTProcessData:
    def __init__(self, em):
        self.em = em
        self.out = ""
        self.gotdata = 0

-    # Some versions of unrtf put out a garbled charset line.
-    # Apart from this, we pass the data untouched.
    def takeLine(self, line):
        if not self.gotdata:
            self.out += '''<html><head>''' + \
@ -22,7 +19,7 @@ class PPTProcessData:
                        '''content="text/html;charset=UTF-8">''' + \
                        '''</head><body><pre>'''
            self.gotdata = True
-        self.out += self.em.htmlescape(line)
+        self.out += self.em.htmlescape(line) + "<br>\n"

    def wrapData(self):
        return self.out + '''</pre></body></html>'''
--- a/src/filters/rclrtf
+++ b/src/filters/rclrtf
@ -1,102 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclrtf,v 1.5 2007-06-08 13:51:09 dockes Exp $  (C) 2004 J.F.Dockes
-# Some inspiration from estraier
-#================================================================
-# convert rtf to html, by  executing the unrtf program:
-#    http://www.gnu.org/software/unrtf/unrtf.html
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclrtl"
-filetype=rtf
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds awk unrtf
-
-# output the result
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-# The thing about the charset is that unrtf outputs a garbled one.
-unrtf --nopict --html "$infile" 2> /dev/null |
-awk 'BEGIN'\
-' {
-  gothead = 0
-}
-/<\/head>/{
-    if (gothead == 0) {
-        printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
-	gothead = 1
-    }
-}
-/<meta http-equiv=/{next}
-{
-  print
-}
-' 
--- a/src/filters/rclsoff
+++ b/src/filters/rclsoff
@ -1,225 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $  (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
-#================================================================
-# Extract text from an openoffice/soffice file
-#
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclsoff"
-filetype=openoffice
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc unzip
-
-# We need a temporary directory
-if test z"$RECOLL_TMPDIR" != z; then
-   ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
-   ttdir=$TMPDIR
-else
-   ttdir=/tmp
-fi
-tmpdir=$ttdir/rclsoff_tmp$$
-mkdir $tmpdir || exit 1
-mkdir $tmpdir/rclsofftmp || exit 1
-
-cleanup()
-{
-    # Note that we're using a constant part (rclsofftmp), that hopefully
-    # guarantees that we can't do big mistakes here.
-    rm -rf $tmpdir/rclsofftmp
-    rmdir $tmpdir
-}
-    
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Unzip the input file and change to the unzipped directory
-unzip -q -d $tmpdir/rclsofftmp "$infile"
-cd $tmpdir/rclsofftmp
-
-echo '<html><head>
-<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
-
-xsltproc --novalid --nonet - meta.xml <<EOF
-<?xml version="1.0"?>
-<xsl:stylesheet version="1.0"
-  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-  xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" 
-  xmlns:xlink="http://www.w3.org/1999/xlink" 
-  xmlns:dc="http://purl.org/dc/elements/1.1/" 
-  xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0" 
-  xmlns:ooo="http://openoffice.org/2004/office"
-  exclude-result-prefixes="office xlink meta ooo dc"
-  >
-
-<xsl:output method="html" encoding="UTF-8"/>
-
-<xsl:template match="/office:document-meta">
-  <xsl:apply-templates select="office:meta/dc:description"/>
-  <xsl:apply-templates select="office:meta/dc:subject"/>
-  <xsl:apply-templates select="office:meta/dc:title"/>
-  <xsl:apply-templates select="office:meta/meta:keyword"/>
-  <xsl:apply-templates select="office:meta/dc:creator"/>
-</xsl:template>
-
-<xsl:template match="dc:title">
-<title> <xsl:value-of select="."/> </title><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="dc:description">
-  <meta>
-  <xsl:attribute name="name">abstract</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="dc:subject">
-  <meta>
-  <xsl:attribute name="name">keywords</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="dc:creator">
-  <meta>
-  <xsl:attribute name="name">author</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="meta:keyword">
-  <meta>
-  <xsl:attribute name="name">keywords</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-</xsl:stylesheet>
-EOF
-
-echo '</head><body>'
-
-xsltproc --novalid --nonet - content.xml <<EOF
-<?xml version="1.0"?>
-<xsl:stylesheet version="1.0"
-  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-  xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
-  exclude-result-prefixes="text"
->
-
-<xsl:output method="html" encoding="UTF-8"/>
-
-<xsl:template match="text:p">
-  <p><xsl:apply-templates/></p><xsl:text>
-  </xsl:text>
-</xsl:template>
-
-<xsl:template match="text:h">
-<p><xsl:apply-templates/></p><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="text:s">
-<xsl:text> </xsl:text>
-</xsl:template>
-
-<xsl:template match="text:line-break">
-<br />
-</xsl:template>
-
-<xsl:template match="text:tab">
-<xsl:text>    </xsl:text>
-</xsl:template>
-
-</xsl:stylesheet>
-EOF
-echo '</body></html>'
-cd /
-exit 0
--- a/src/filters/rclsvg
+++ b/src/filters/rclsvg
@ -1,161 +0,0 @@
-#!/bin/sh
-
-#================================================================
-# Extract text from a Scalable Vector Graphics file
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclsvg"
-filetype=svg
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc
-
-xsltproc --novalid --nonet - "$infile" <<EOF
-<?xml version="1.0"?>
-<xsl:stylesheet version="1.0"
-  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-  xmlns:svg="http://www.w3.org/2000/svg"
-  xmlns:dc="http://purl.org/dc/elements/1.1/"
-  exclude-result-prefixes="svg"
-  >
-
-<xsl:output method="html" encoding="UTF-8"/>
-
-<xsl:template match="/">
-  <html>
-  <head>
-  <xsl:apply-templates select="svg:svg/svg:title"/>
-  <xsl:apply-templates select="svg:svg/svg:desc"/>
-  <xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:creator"/>
-  <xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:subject"/>
-  <xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:description"/>
-  </head>
-  <body>
-  <xsl:apply-templates select="//svg:text"/>
-  </body>
-  </html>
-</xsl:template>
-
-<xsl:template match="svg:desc"> 
-  <meta>
-  <xsl:attribute name="name">keywords</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="dc:creator"> 
-  <meta>
-  <xsl:attribute name="name">author</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="dc:subject"> 
-  <meta>
-  <xsl:attribute name="name">keywords</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="dc:description"> 
-  <meta>
-  <xsl:attribute name="name">description</xsl:attribute>
-  <xsl:attribute name="content">
-     <xsl:value-of select="."/>
-  </xsl:attribute>
-  </meta><xsl:text>
-</xsl:text>
-</xsl:template>
-
-<xsl:template match="svg:title"> 
-  <title><xsl:value-of select="."/></title><xsl:text>
-  </xsl:text>
-</xsl:template>
-	    
-<xsl:template match="svg:text"> 
-  <p><xsl:value-of select="."/></p><xsl:text>
-  </xsl:text>
-</xsl:template>
-
-</xsl:stylesheet>
-EOF
-
-exit 0
--- a/src/filters/rclsvg.py
+++ b/src/filters/rclsvg.py
@ -116,7 +116,7 @@ class SVGExtractor:
            self.em.rclog("%s: bad data: " % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

-        return (True, docdata, "", rclexecm.RclExecM.eofnow)
+        return (True, docdata, "", rclexecm.RclExecM.eofnext)
    
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
--- a/src/filters/rcltext
+++ b/src/filters/rcltext
@ -1,91 +0,0 @@
-#!/bin/sh
-# @(#$Id: rcltext,v 1.1 2008-09-12 11:30:03 dockes Exp $  (C) 2004 J.F.Dockes
-# Parts taken from Estraier:
-#================================================================
-# Estraier: a personal full-text search system
-# Copyright (C) 2003-2004 Mikio Hirabayashi
-#================================================================
-#================================================================
-# Wrap generic text (ie: program text) in html
-# Assumes ascii or iso-8859-1
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rcltext"
-filetype=text
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds sed
-echo '<html><head><title></title></head><body><pre>'
-
-sed -e 's/\&/\&amp;/g' -e 's/</\&lt;/g' "$infile" 
-
-echo '</pre></body></html>'
--- a/src/filters/rclxls
+++ b/src/filters/rclxls
@ -1,116 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $  (C) 2004 J.F.Dockes
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
-
-#================================================================
-# Handle excel files for recoll. 
-#================================================================
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclxls"
-filetype=excel
-
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-top=`dirname $0`
-XLSDUMP="$top/xls-dump.py"
-XMLTOCSV="$top/xlsxmltocsv.py"
-
-checkcmds $XLSDUMP $XLSTOCSV
-
-# output the result
-echo '<html><head>'
-#echo '<title>' "$title" '</title>'
-echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
-echo '</head><body>'
-echo '<pre>'
-
-$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
-   $XMLTOCSV | \
-   sed -e 's/</&lt;/g' -e 's/&/&amp;/g' 
-
-echo '</pre>'
-echo '</body></html>'
-
-# exit normally
-exit 0
--- a/src/filters/rclxls.py
+++ b/src/filters/rclxls.py
@ -8,7 +8,6 @@ import sys
 import os
 import xml.sax

-# Processing the output from unrtf
 class XLSProcessData:
    def __init__(self, em):
        self.em = em
@ -16,8 +15,6 @@ class XLSProcessData:
        self.gotdata = 0
        self.xmldata = ""
        
-    # Some versions of unrtf put out a garbled charset line.
-    # Apart from this, we pass the data untouched.
    def takeLine(self, line):
        if not self.gotdata:
            self.out += '''<html><head>''' + \
--- a/src/filters/rclxml
+++ b/src/filters/rclxml
@ -1,119 +0,0 @@
-#!/bin/sh
-
-#================================================================
-# Extract text from a generic XML file (Justus Piater)
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclxml"
-filetype=xml
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc
-
-xsltproc --novalid --nonet - "$infile" <<EOF
-<?xml version="1.0"?>
-<xsl:stylesheet version="1.0"
-		xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-
-  <xsl:output method="html" encoding="UTF-8"/>
-
-  <xsl:template match="/">
-    <html>
-      <head>
-	<xsl:if test="//*[local-name() = 'title']">
-	  <title>
-	    <xsl:value-of select="//*[local-name() = 'title'][1]"/>
-	  </title>
-	</xsl:if>
-      </head>
-      <body>
-	<xsl:apply-templates/>
-      </body>
-    </html>
-  </xsl:template>
-
-  <xsl:template match="text()">
-    <xsl:if test="string-length(normalize-space(.)) &gt; 0">
-      <p><xsl:value-of select="."/></p>
-      <xsl:text>
-      </xsl:text>
-    </xsl:if>
-  </xsl:template>
-
-  <xsl:template match="*">
-    <xsl:apply-templates/>
-  </xsl:template>
-
-</xsl:stylesheet>
-EOF
-
-exit 0
--- a/src/filters/rclxml.py
+++ b/src/filters/rclxml.py
@ -74,7 +74,7 @@ class XMLExtractor:
            self.em.rclog("%s: bad data: " % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

-        return (True, docdata, "", rclexecm.RclExecM.eofnow)
+        return (True, docdata, "", rclexecm.RclExecM.eofnext)
    
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -51,7 +51,7 @@ application/javascript = internal text/plain
 #  - with unrtf: rtf files disguised as doc files.
 # The default is now again to use rcldoc. Use raw antiword if speed is more
 # important for you than catching all data.
-application/msword = exec rcldoc
+application/msword = execm rcldoc.py
 #application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
 # You can also use wvware directly but it's much slower.
 # application/msword = exec wvWare --charset=utf-8 --nographics
@ -59,41 +59,40 @@ application/msword = exec rcldoc
 # Also handle the mime type returned by "file -i" for a suffix-less word
 # file. This could probably just as well be an excel file, but we have to
 # choose one.
-application/vnd.ms-office = exec rcldoc
+application/vnd.ms-office = execm rcldoc.py

 application/ogg = execm rclaudio
-application/pdf = exec rclpdf
-# application/pdf = execm rclmpdf.py
+application/pdf = execm rclmpdf.py
 application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
-application/vnd.ms-excel = exec rclxls
-application/vnd.ms-powerpoint = exec rclppt
-application/vnd.oasis.opendocument.text = exec rclsoff
-application/vnd.oasis.opendocument.text-template = exec rclsoff
-application/vnd.oasis.opendocument.presentation = exec rclsoff 
-application/vnd.oasis.opendocument.spreadsheet = exec rclsoff
-application/vnd.oasis.opendocument.graphics = exec rclsoff
+application/vnd.ms-excel = execm rclxls.py
+application/vnd.ms-powerpoint = execm rclppt.py
+application/vnd.oasis.opendocument.text = execm rclsoff.py
+application/vnd.oasis.opendocument.text-template = execm rclsoff.py
+application/vnd.oasis.opendocument.presentation = execm rclsoff.py 
+application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py
+application/vnd.oasis.opendocument.graphics = execm rclsoff.py
 application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
- exec rclopxml
+ execm rclopxml.py
 application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
- exec rclopxml
+ execm rclopxml.py
 application/vnd.openxmlformats-officedocument.presentationml.template = \
- exec rclopxml
+ execm rclopxml.py
 application/vnd.openxmlformats-officedocument.presentationml.presentation = \
- exec rclopxml
+ execm rclopxml.py
 application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
- exec rclopxml
+ execm rclopxml.py
 application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
- exec rclopxml
-application/vnd.sun.xml.calc = exec rclsoff
-application/vnd.sun.xml.calc.template = exec rclsoff
-application/vnd.sun.xml.draw = exec rclsoff
-application/vnd.sun.xml.draw.template = exec rclsoff
-application/vnd.sun.xml.impress = exec rclsoff
-application/vnd.sun.xml.impress.template = exec rclsoff
-application/vnd.sun.xml.math = exec rclsoff
-application/vnd.sun.xml.writer = exec rclsoff
-application/vnd.sun.xml.writer.global = exec rclsoff
-application/vnd.sun.xml.writer.template = exec rclsoff
+ execm rclopxml.py
+application/vnd.sun.xml.calc = execm rclsoff.py
+application/vnd.sun.xml.calc.template = execm rclsoff.py
+application/vnd.sun.xml.draw = execm rclsoff.py
+application/vnd.sun.xml.draw.template = execm rclsoff.py
+application/vnd.sun.xml.impress = execm rclsoff.py
+application/vnd.sun.xml.impress.template = execm rclsoff.py
+application/vnd.sun.xml.math = execm rclsoff.py
+application/vnd.sun.xml.writer = execm rclsoff.py
+application/vnd.sun.xml.writer.global = execm rclsoff.py
+application/vnd.sun.xml.writer.template = execm rclsoff.py
 application/vnd.wordperfect = exec wpd2html;mimetype=text/html
 application/x-abiword = exec rclabw
 application/x-awk = internal text/plain
@ -101,7 +100,7 @@ application/x-chm = execm rclchm
 application/x-dia-diagram = execm rcldia;mimetype=text/plain
 application/x-dvi = exec rcldvi
 application/x-flac = execm rclaudio
-application/x-gnote = exec rclxml
+application/x-gnote = execm rclxml.py
 application/x-gnuinfo = execm rclinfo
 application/x-gnumeric = exec rclgnm
 application/x-kword = exec rclkwd
@ -124,14 +123,14 @@ audio/mpeg = execm rclaudio
 audio/mp4 = execm rclaudio
 audio/aac = execm rclaudio
 audio/x-karaoke = execm rclkar
-image/gif = execm rclimg
-image/jp2 = execm rclimg
-image/jpeg = execm rclimg
-image/png = execm rclimg
-image/tiff = execm rclimg
+image/gif = execm rclimg.py
+image/jp2 = execm rclimg.py
+image/jpeg = execm rclimg.py
+image/png = execm rclimg.py
+image/tiff = execm rclimg.py
 image/vnd.djvu = exec rcldjvu
-image/svg+xml = exec rclsvg
-image/x-xcf = execm rclimg
+image/svg+xml = execm rclsvg.py
+image/x-xcf = execm rclimg.py
 inode/symlink = internal
 application/x-zerosize = internal
 inode/x-empty = internal application/x-zerosize
@ -159,9 +158,8 @@ text/x-python = exec rclpython
 text/x-shellscript = internal text/plain
 text/x-srt = internal text/plain
 text/x-tex = exec rcltex
-
-application/xml = exec rclxml
-text/xml = exec rclxml
+application/xml = execm rclxml.py
+text/xml = execm rclxml.py
 # Using these instead of the two above would index all parameter and tag
 # names, attribute values etc, instead of just the text content.
 #application/xml = internal text/plain
--- a/src/windows/mimeconf
+++ b/src/windows/mimeconf
@ -135,7 +135,8 @@ message/rfc822 = internal
 text/calendar = execm python rclics;mimetype=text/plain
 text/html  = internal 
 text/plain = internal 
-text/rtf = execm python rclrtf.py
+text/rtf = exec unrtf --nopict --html;mimetype=text/html
+#text/rtf = execm python rclrtf.py
 text/x-c = internal
 text/x-c++ = internal
 text/x-c+ = internal