Use the python-based filters written for ms-win on Linux too
This commit is contained in:
parent
dc9d9900be
commit
4c3e112c27
@ -90,11 +90,8 @@ src/qtgui/Makefile
|
||||
src/qtgui/qrc_recoll.cpp
|
||||
src/qtgui/recoll
|
||||
src/qtgui/recoll.app
|
||||
src/query/alldeps
|
||||
src/query/recollq
|
||||
src/sampleconf/rclmon.sh
|
||||
src/sampleconf/recoll.conf
|
||||
src/utils/alldeps
|
||||
tests/casediac/aspdict.en.rws
|
||||
tests/casediac/idxstatus.txt
|
||||
tests/casediac/index.pid
|
||||
|
||||
@ -1,176 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rcldoc,v 1.8 2007-06-08 13:51:08 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from an msword file by executing either antiword
|
||||
# or wvware
|
||||
#
|
||||
#================================================================
|
||||
|
||||
|
||||
# Force the C locale for everything this filter runs, and name the filter
LANG=C
LC_ALL=C
export LANG LC_ALL
progname=rcldoc
filetype='ms-word'

# Decoder command line (the input file name is appended when it is run)
decoder='antiword -t -i 1 -m UTF-8'
|
||||
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the message (an error code, then details)
# Outputs:   "RECFILTERROR <arguments>" on stdout, and the same arguments
#            prefixed with ":2:$progname:::" on stderr
# Exits:     always, with status 1
senderror()
{
    # "$*" keeps the arguments as given: the unquoted form expanded glob
    # characters and collapsed runs of blanks found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a name designates an executable file which is not a directory.
# Arguments: $1 - command name. A name containing a slash is tested as a
#            path, any other name is looked up in each directory of $PATH.
# Returns:   0 when such a file exists, 1 otherwise
# Note:      sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; fi
        return 1 ;;
    *)
        # Split $PATH on colons into the positional parameters, then put
        # the field separator back.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Quoting keeps a directory name containing blanks in one piece.
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program needed by the filter can be found.
# Arguments: the command names (or paths) to look for
# Exits:     through "senderror HELPERNOTFOUND <name>" at the first name
#            rejected by iscmd; returns normally when all are found
checkcmds()
{
    # "$@" hands each name over unsplit, so that a path containing blanks
    # is checked as a whole.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
# Exactly one argument is expected: anything else, or --help, prints the
# usage text and exits with status 1. The conditions are separate test
# commands because the -o and -a operators are obsolescent and become
# ambiguous when an operand looks like an operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds awk antiword iconv

# We need to do some strange stuff to retrieve the status from antiword. Things
# would be simpler if we relied on using bash.
# Explanations:
#http://stackoverflow.com/questions/1221833/bash-pipe-output-and-capture-exit-status

# Read a number on stdin and return it as the exit status. An empty input
# is reported as a failure (status 1).
stdintoexitstatus() {
    read -r exitstatus
    return "${exitstatus:-1}"
}

# Filter: turn the plain text read on stdin into an HTML document on stdout.
# - The HTML header is written when the first non-blank line is seen; an
#   input without any data produces no output at all.
# - When a line ends with a hyphen, its last word (minus the hyphen) is
#   carried over and glued to the beginning of the next line.
# - A line made of a single form feed becomes a horizontal rule.
# - &, < and > are replaced by the corresponding HTML entities.
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
# is an awk program
antiword_text_to_html()
{
    awk 'BEGIN'\
' {
    cont = ""
    gotdata = 0
}
function escapehtml(s)
{
    # The ampersand goes first, else the entities written for < and >
    # would be escaped a second time.
    gsub(/&/, "\\&amp;", s)
    gsub(/</, "\\&lt;", s)
    gsub(/>/, "\\&gt;", s)
    return s
}
{
    if (!($0 ~ /^[ ]*$/) && gotdata == 0) {
        print "<html><head><title></title>"
        print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
        print "</head>\n<body>\n<p>"
        gotdata = 1
    }
    $0 = cont $0
    cont = ""

    # Note : soft-hyphen is iso8859 0xad
    # Break at last whitespace. A line without any whitespace is left
    # alone: there is nothing to split and it must not be lost.
    if ($0 ~ /[-]$/ && match($0, "[ \t][^ \t]+$")) {
        # substr() positions start at 1: a start of 0 does not behave
        # the same in all awk versions.
        line = substr($0, 1, RSTART - 1)
        cont = substr($0, RSTART, RLENGTH - 1)
        $0 = line
    }

    if ($0 == "\f") {
        print "</p><hr><p>"
        next
    }

    if (gotdata == 1) {
        print escapehtml($0) "<br>"
    }
}
END {
    if (gotdata == 1) {
        # Do not lose a word carried over from the very last line
        if (cont != "")
            print escapehtml(cont) "<br>"
        print "</p></body></html>"
    }
}'
}

# The decoder status is sent through descriptor 3 to stdintoexitstatus, which
# turns it into the status of the whole construct, while the HTML travels
# through descriptor 4 to the real stdout.
(((($decoder "$infile"; echo $? >&3) |
    antiword_text_to_html >&4) 3>&1) | stdintoexitstatus) 4>&1
|
||||
|
||||
|
||||
# Antiword rarely fails, we try to catch the most common reasons:
# this must directly follow the conversion, as it tests its exit status.
if test $? -eq 1 ; then
    # The other filters are looked for in our own directory. The quotes
    # keep an installation path containing blanks usable.
    filtersdir=$(dirname "$0")

    # Check actual document type
    mtype=$(file -b -i "$infile" | awk '{sub(";", "", $1);print $1}')

    case "$mtype" in
    text/rtf)
        # RTF document disguising as msword either because it has a .doc
        # extension or because it's an attachment with a wrong mime.
        exec "$filtersdir/rclrtf" "$infile" ;;
    text/plain)
        # Someone gave a .doc ext to their texts. Happens...
        exec "$filtersdir/rcltext" "$infile" ;;
    application/msword)
        # Actually application/msword: try wvWare, which is much
        # slower and we don't use it by default, but it handles some
        # files that antiword won't, so use it as a last resort.
        if iscmd wvWare ; then
            exec wvWare --nographics --charset=utf-8 "$infile"
        fi ;;
    esac

    # else let the error be...
    exit 1
fi
|
||||
@ -1,238 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
#================================================================
|
||||
# Extract text from an openxml file (msword, spreadsheet or presentation)
|
||||
# TODO: Also process word/endnotes.xml
|
||||
#================================================================
|
||||
|
||||
# Force the C locale for everything this filter runs, and name the filter
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rclopxml"
filetype="openxml"
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the message (an error code, then details)
# Outputs:   "RECFILTERROR <arguments>" on stdout, and the same arguments
#            prefixed with ":2:$progname:::" on stderr
# Exits:     always, with status 1
senderror()
{
    # "$*" keeps the arguments as given: the unquoted form expanded glob
    # characters and collapsed runs of blanks found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a name designates an executable file which is not a directory.
# Arguments: $1 - command name. A name containing a slash is tested as a
#            path, any other name is looked up in each directory of $PATH.
# Returns:   0 when such a file exists, 1 otherwise
# Note:      sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; fi
        return 1 ;;
    *)
        # Split $PATH on colons into the positional parameters, then put
        # the field separator back.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Quoting keeps a directory name containing blanks in one piece.
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program needed by the filter can be found.
# Arguments: the command names (or paths) to look for
# Exits:     through "senderror HELPERNOTFOUND <name>" at the first name
#            rejected by iscmd; returns normally when all are found
checkcmds()
{
    # "$@" hands each name over unsplit, so that a path containing blanks
    # is checked as a whole.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
# Exactly one argument is expected: anything else, or --help, prints the
# usage text and exits with status 1. The conditions are separate test
# commands because the -o and -a operators are obsolescent and become
# ambiguous when an operand looks like an operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc unzip

# We need a temporary directory: created under $RECOLL_TMPDIR, else under
# $TMPDIR, else under /tmp. All uses are quoted so that a location
# containing blanks works.
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
tmpdir=$ttdir/rclopxml_tmp$$
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclopxmltmp" || exit 1

# Remove the temporary directory and its contents.
cleanup()
{
    # Note that we're using a constant part (rclopxmltmp), that hopefully
    # guarantees that we can't do big mistakes here.
    rm -rf "$tmpdir/rclopxmltmp"
    if test -d "$tmpdir"; then
        rmdir "$tmpdir"
    fi
}

# Clean up whenever the script ends. A signal has to end the script
# explicitly: with cleanup as the only handler, execution would resume
# after it, with the work directory gone. The exit runs the EXIT trap.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM
|
||||
|
||||
# Unzip the input file and change to the unzipped directory
unzip -q -d "$tmpdir/rclopxmltmp" "$infile"
# Give up if the directory cannot be entered: the file tests and transforms
# which follow use relative paths and must not run from somewhere else.
cd "$tmpdir/rclopxmltmp" || exit 1
|
||||
|
||||
# Open the HTML document. The header content is produced by running the
# stylesheet below on docProps/core.xml: a charset declaration, plus
# "author" and "date" meta elements taken from dc:creator and
# dcterms:modified. All other elements are discarded.
cat <<'EOF'
<html>
<head>
EOF

xsltproc --novalid --nonet - docProps/core.xml <<'EOF'
<?xml version="1.0"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<!-- <xsl:output method="text"/> -->
<xsl:output omit-xml-declaration="yes"/>

<xsl:template match="cp:coreProperties">
<xsl:text> </xsl:text>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<xsl:text> </xsl:text>
<xsl:apply-templates/>
</xsl:template>

<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">
<!-- <xsl:value-of select="name()"/> pour sortir tous les meta avec
le meme nom que dans le xml (si on devenait dc-natif) -->
<xsl:text>author</xsl:text>
</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta>
<xsl:text> </xsl:text>
</xsl:template>

<xsl:template match="dcterms:modified">
<meta>
<xsl:attribute name="name">
<xsl:text>date</xsl:text>
</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta>
<xsl:text> </xsl:text>
</xsl:template>

<xsl:template match="*">
</xsl:template>

</xsl:stylesheet>
EOF

# End of the header, the document text follows
cat <<'EOF'
</head>
<body>
EOF
|
||||
|
||||
# Choose what to extract according to the kind of document, recognized from
# the files found in the unzipped tree:
#   filenames     - the XML part(s) to transform
#   tagmatch      - the element whose text is output as a paragraph
#   xmlns_decls   - namespace declarations for the stylesheet element
#   moretemplates - additional templates (presentations only)
# All of them are reset first: they are expanded inside the stylesheet, and a
# value inherited from the environment must not end up in there.
filename=''
filenames=''
tagmatch=''
xmlns_decls=''
moretemplates=''
if test -f word/document.xml ; then
    filenames=word/document.xml
    tagmatch="w:p"
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
'

elif test -f xl/sharedStrings.xml ; then
    filenames=xl/sharedStrings.xml
    tagmatch='x:t'
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
'

elif test -f ppt/slides/slide1.xml ; then
    # One part per slide, in the order of the shell expansion
    filenames=$(echo ppt/slides/slide*.xml)
    tagmatch='a:t'
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
'
    # I want to suppress text output for all except a:t, don't know how to do it
    # help ! At least get rid of these:
    moretemplates='
<xsl:template match="p:attrName">
</xsl:template>
'
else
    # None of the known document kinds
    exit 1
fi
|
||||
|
||||
|
||||
# Transform each selected part: the text of every $tagmatch element is output
# as a paragraph, inside one div per part. $filenames is expanded unquoted on
# purpose, as it holds a blank-separated list. The stylesheet is a here
# document in which $xmlns_decls, $tagmatch and $moretemplates get expanded.
for filename in $filenames;do
    xsltproc --novalid --nonet - "$filename" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet $xmlns_decls >

<xsl:output omit-xml-declaration="yes"/>

<xsl:template match="/">
<div>
<xsl:apply-templates/>
</div>
</xsl:template>

<xsl:template match="$tagmatch">
<p>
<xsl:value-of select="."/>
</p>
</xsl:template>

$moretemplates

</xsl:stylesheet>
EOF
done

# Close the body element, which was opened before the document text but
# never closed, then the document itself.
echo '</body>
</html>'
|
||||
@ -1,351 +0,0 @@
|
||||
#!/bin/bash
|
||||
#================================================================
|
||||
# Copyright (C) 2015 J.F. Dockes
|
||||
# There used to be Estraier content in there, but I quite believe that is not
|
||||
# the case any more.
|
||||
# This file is licensed under the GPL v2
|
||||
#================================================================
|
||||
# Convert a pdf file to HTML.
|
||||
#
|
||||
# We use pdftotext from the xpdf/poppler-utils package.
|
||||
#
|
||||
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
||||
# We try to correct.
|
||||
#
|
||||
# If pdftotext produces no text and tesseract is available, we try to
|
||||
# perform OCR. As this can be very slow and the result not always
|
||||
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
|
||||
#
|
||||
# We guess the OCR language in order of preference:
|
||||
# - From the content of a ".ocrpdflang" file if it exists in the same
|
||||
# directory as the PDF
|
||||
# - From the RECOLL_TESSERACT_LANG environment variable
|
||||
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
||||
# - Default to "eng"
|
||||
#
|
||||
# Uncomment the following if you get better results without. The
|
||||
# pdftotext manual says that the option is no longer recommended. The
|
||||
# difference in output seems mostly the removal of soft-hyphens when
|
||||
# -raw is not set
|
||||
# optionraw=-raw
|
||||
|
||||
# Name of this filter and kind of document it handles (used in messages)
progname=rclpdf
filetype="pdf"
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the message (an error code, then details)
# Outputs:   "RECFILTERROR <arguments>" on stdout, and the same arguments
#            prefixed with ":2:$progname:::" on stderr
# Exits:     always, with status 1
senderror()
{
    # "$*" keeps the arguments as given: the unquoted form expanded glob
    # characters and collapsed runs of blanks found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a name designates an executable file which is not a directory.
# Arguments: $1 - command name. A name containing a slash is tested as a
#            path, any other name is looked up in each directory of $PATH.
# Returns:   0 when such a file exists, 1 otherwise
# Note:      sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; fi
        return 1 ;;
    *)
        # Split $PATH on colons into the positional parameters, then put
        # the field separator back.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Quoting keeps a directory name containing blanks in one piece.
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program needed by the filter can be found.
# Arguments: the command names (or paths) to look for
# Exits:     through "senderror HELPERNOTFOUND <name>" at the first name
#            rejected by iscmd; returns normally when all are found
checkcmds()
{
    # "$@" hands each name over unsplit, so that a path containing blanks
    # is checked as a whole.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
# Exactly one argument is expected: anything else, or --help, prints the
# usage text and exits with status 1. The conditions are separate test
# commands because the -o and -a operators are obsolescent and become
# ambiguous when an operand looks like an operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds pdftotext iconv awk

# OCR is only attempted when both tesseract and pdftoppm are found, and
# a file named ocrpdf exists in the configuration directory.
if iscmd tesseract && iscmd pdftoppm; then
    ocrpossible=1
else
    ocrpossible=0
fi
confdir=${RECOLL_CONFDIR:-~/.recoll}
if test ! -f "$confdir/ocrpdf"; then
    ocrpossible=0
fi

# Stays empty until a temporary directory is actually needed
tmpdir=
|
||||
|
||||
# Remove the temporary directory, if one was set in $tmpdir.
cleanup()
{
    # Note that we're using a constant part (rclpdftmp), that hopefully
    # guarantees that we can't do big mistakes with the -rf here.
    # The quotes keep a temporary location containing blanks in one piece.
    if test -n "$tmpdir"; then
        rm -rf "$tmpdir/rclpdftmp"
        rmdir "$tmpdir"
    fi
}

# Clean up whenever the script ends. A signal has to end the script
# explicitly: with cleanup as the only handler, execution would resume
# after it, with the work directory gone. The exit runs the EXIT trap.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM
|
||||
|
||||
runpdftotext()
|
||||
{
|
||||
# Test poppler version: at some point before 0.24, poppler began
|
||||
# to properly escape text inside the header (but not the body).
|
||||
XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'`
|
||||
MAJOR=`echo $XYZ | cut -d. -f 1`
|
||||
MINOR=`echo $XYZ | cut -d. -f 2`
|
||||
escapeheader=1
|
||||
escapebody=1
|
||||
if test "$MAJOR" -gt 0 ; then
|
||||
escapeheader=0
|
||||
elif test "$MINOR" -ge 24; then
|
||||
escapeheader=0;
|
||||
fi
|
||||
|
||||
# Run pdftotext and fix the result (add a charset tag and fix the
|
||||
# html escaping). The escaping is a half-hearted job. We do try to
|
||||
# fix some header fields, only for those which are single-line.
|
||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
||||
awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\
|
||||
' {
|
||||
inbodypre = 0
|
||||
cont = ""
|
||||
}
|
||||
function escapehtml(s)
|
||||
{
|
||||
gsub(/&/, "\\&", s)
|
||||
gsub(/</, "\\<", s)
|
||||
gsub(/>/, "\\>", s)
|
||||
gsub(/"/, "\\"", s)
|
||||
return s
|
||||
}
|
||||
{
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
# Insert charset meta tag at end of header
|
||||
if(inbodypre == 0 && $0 ~ /<\/head>/) {
|
||||
match($0, /<\/head>/)
|
||||
part1 = substr($0, 0, RSTART-1)
|
||||
part2 = substr($0, RSTART, length($0))
|
||||
charsetmeta = "<meta http-equiv=\"Content-Type\" "\
|
||||
"content=\"text/html; charset=UTF-8\">"
|
||||
$0 = part1 charsetmeta "\n" part2
|
||||
}
|
||||
if(inbodypre == 0 && $0 ~ /<title>.*<\/title>/){
|
||||
match($0, /<title>.*<\/title>/)
|
||||
part1 = substr($0, 0, RSTART-1)
|
||||
mid = substr($0, RSTART, RLENGTH)
|
||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
||||
gsub(/<title>/, "", mid)
|
||||
gsub(/<\/title>/, "", mid)
|
||||
if (escapeheader) {
|
||||
mid = escapehtml(mid)
|
||||
}
|
||||
mid = "<title>" mid "</title>"
|
||||
$0 = part1 mid part2
|
||||
}
|
||||
# This matches all single-line meta fields
|
||||
if(inbodypre == 0 && $0 ~ /content=".*"\/>/){
|
||||
match($0, /content=".*"\/>/)
|
||||
part1 = substr($0, 0, RSTART-1)
|
||||
mid = substr($0, RSTART, RLENGTH)
|
||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
||||
gsub(/content="/, "", mid)
|
||||
gsub(/"\/>/, "", mid)
|
||||
if (escapeheader) {
|
||||
mid = escapehtml(mid)
|
||||
}
|
||||
mid = "content=\"" mid "\"/>"
|
||||
$0 = part1 mid part2
|
||||
}
|
||||
|
||||
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
||||
# "Subject" metadata field is more like an HTML "description"
|
||||
if(inbodypre == 0 && $0 ~ /<meta ?name="Subject"/){
|
||||
gsub(/="Subject"/, "=\"Description\"", $0)
|
||||
}
|
||||
|
||||
if ($0 == "<pre>"){
|
||||
# Begin of body text.
|
||||
inbodypre++
|
||||
print $0
|
||||
next
|
||||
} else if ($0 ~ /<\/pre>/){
|
||||
inbodypre--
|
||||
print $0
|
||||
next
|
||||
} else if ($0 ~ /[-]$/) {
|
||||
# Note : soft-hyphen is iso8859 0xad
|
||||
# Break at last whitespace
|
||||
match($0, "[ \t][^ \t]+$")
|
||||
line = substr($0, 0, RSTART)
|
||||
cont = substr($0, RSTART, RLENGTH-1)
|
||||
$0 = line
|
||||
# print "LINE [" $0 "] CONT[" cont "]"
|
||||
}
|
||||
if(inbodypre > 0 && escapebody){
|
||||
$0 = escapehtml($0)
|
||||
}
|
||||
print $0
|
||||
}
|
||||
'
|
||||
}
|
||||
|
||||
# If we're not equipped for ocr, just run pdftotext to stdout
if test "$ocrpossible" -eq 0; then
    runpdftotext
    exit $?
fi

# tesseract is installed, prepare for running it.
# We need to check the pdftotext output, but we don't want to run
# it twice. Use a temporary file, inside a directory created under
# $RECOLL_TMPDIR, else under $TMPDIR, else under /tmp.
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
# tmpdir is what the exit-time cleanup removes: only set it once the
# directory has been created by us, so that a failed mkdir can never lead to
# the removal of a directory which already existed.
newtmpdir=$ttdir/rclpdf_tmp$$
mkdir "$newtmpdir" || senderror mkdir "$newtmpdir" failed
tmpdir=$newtmpdir
mkdir "$tmpdir/rclpdftmp" || senderror mkdir "$tmpdir/rclpdftmp" failed

# Run pdftotext into the temp file
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
runpdftotext > "$pdftxtfile"

# If text is big, or small but not only tags and empty lines, output
# it. Given the contents check which we perform, a file in which the
# only text content is metadata (pdf description field), will be run
# through OCR, which is not necessarily what we would want. It would
# be possible to detect the situation if this proved an issue.
# (wc gives the size in bytes without having to pick a column of ls -l)
txtsize=$(wc -c < "$pdftxtfile")
txtempty=0
# Use grep to check if there is regular text in there. Only do it on
# small outputs. grep -v -q succeeds as soon as one line is neither blank
# nor made of a tag only.
if test "$txtsize" -lt 5000 ; then
    if ! grep -E -v -q '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' "$pdftxtfile"
    then
        txtempty=1
    fi
fi

if test $txtempty -eq 0; then
    # pdftotext produced actual output, use it. No OCR
    cat "$pdftxtfile"
    exit 0
fi
|
||||
|
||||
# PDF has no text content and tesseract is available. Give it a try

# Print the tesseract language to use. The first non-empty value wins:
#  - the content of the file named by $pdflangfile
#  - the RECOLL_TESSERACT_LANG environment variable
#  - the content of $confdir/ocrpdf
#  - a guess from the language part of $LANG (en, de and fr only)
#  - "eng"
# The configuration file comes before the $LANG guess: a misspelled variable
# name used to make the file content always replace the guess, so this keeps
# the result unchanged for a non-empty file, while the guess now gets used
# when the file is empty.
guess_tesseract_lang()
{
    local lang=
    if test -f "$pdflangfile"; then
        lang=$(cat "$pdflangfile")
    fi
    if test -z "$lang"; then
        lang=${RECOLL_TESSERACT_LANG}
    fi
    if test -z "$lang"; then
        lang=$(cat "$confdir/ocrpdf")
    fi
    if test -z "$lang"; then
        # Try to guess tesseract language. This should depend on the input
        # file, but we have no general way to determine it. So use the
        # environment and hope for the best.
        case "${LANG%%_*}" in
        en) lang=eng;;
        de) lang=deu;;
        fr) lang=fra;;
        # Someone will have to add more tesseract language codes here.
        esac
    fi
    if test -z "$lang"; then
        lang=eng
    fi
    printf '%s\n' "$lang"
}

# Per-directory language file, next to the PDF
pdflangfile=$(dirname "$infile")/.ocrpdflang
tesseractlang=$(guess_tesseract_lang)
|
||||
|
||||
# echo tesseractlang "$tesseractlang" >&2
|
||||
|
||||
# Both files live in our private temporary directory. TMPFILE is used as a
# name prefix by pdftoppm.
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"

# split pdf-pages
ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
if [ $? -ne 0 ] ; then
    senderror "pdftoppm: $ERR_MSG"
fi

# Run tesseract on each non-empty page image. The text goes to a file named
# after the image, the messages to $TESSERRORFILE. All paths are quoted: the
# temporary location may contain blanks.
for i in "$TMPFILE"* ; do
    if [ -s "$i" ] ; then

        tesseract "$i" "$i" -l "$tesseractlang" > "$TESSERRORFILE" 2>&1
        TESSERR=$?
        # ignore tesseract start message
        LINECOUNT=$(wc -l < "$TESSERRORFILE")
        if [ "$TESSERR" -ne 0 ] || [ "$LINECOUNT" -gt 1 ] ; then
            echo "tesseract-error $TESSERR page $i in $infile" >&2
            # sort "compacts" leptonica-output
            sort -u "$TESSERRORFILE" >&2
        fi
    # else
    #   debugging purpose
    #   SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
    #   echo "no pdftoppm in $infile cp to $SICFILE" >&2
    #   cp -a $infile $SICFILE
    # fi
    fi
done

# don't output "empty" HTML-Files
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
if [ "$CHARS" -gt 0 ] ; then
    echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>"
    # Replace the characters which are special in HTML by entities, the
    # ampersand first so that the other entities are not escaped again.
    cat "$TMPFILE"*.txt | \
    awk '{
        gsub(/&/, "\\&amp;", $0)
        gsub(/</, "\\&lt;", $0)
        gsub(/>/, "\\&gt;", $0)
        print $0
    }
    '
    echo "</pre></body></html>"
fi
|
||||
@ -1,110 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
#================================================================
|
||||
# Handle powerpoint files for recoll.
|
||||
# Use the ppt-dump.py script found next to this filter: catppt just can't
|
||||
# handle the majority of semi-modern ppt files
|
||||
|
||||
#================================================================
|
||||
|
||||
# Force the C locale for everything this filter runs, and name the filter
LANG=C
LC_ALL=C
export LANG LC_ALL
progname=rclppt
filetype="powerpoint"
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the message (an error code, then details)
# Outputs:   "RECFILTERROR <arguments>" on stdout, and the same arguments
#            prefixed with ":2:$progname:::" on stderr
# Exits:     always, with status 1
senderror()
{
    # "$*" keeps the arguments as given: the unquoted form expanded glob
    # characters and collapsed runs of blanks found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a name designates an executable file which is not a directory.
# Arguments: $1 - command name. A name containing a slash is tested as a
#            path, any other name is looked up in each directory of $PATH.
# Returns:   0 when such a file exists, 1 otherwise
# Note:      sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; fi
        return 1 ;;
    *)
        # Split $PATH on colons into the positional parameters, then put
        # the field separator back.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Quoting keeps a directory name containing blanks in one piece.
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program needed by the filter can be found.
# Arguments: the command names (or paths) to look for
# Exits:     through "senderror HELPERNOTFOUND <name>" at the first name
#            rejected by iscmd; returns normally when all are found
checkcmds()
{
    # "$@" hands each name over unsplit, so that a path containing blanks
    # is checked as a whole.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
# Exactly one argument is expected: anything else, or --help, prints the
# usage text and exits with status 1. The conditions are separate test
# commands because the -o and -a operators are obsolescent and become
# ambiguous when an operand looks like an operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
# The dump script is looked for in the directory of this filter. The quotes
# keep an installation path containing blanks usable.
filtersdir=$(dirname "$0")
checkcmds "$filtersdir/ppt-dump.py"

# Filter: replace the characters which are special in HTML text by the
# corresponding entities. The ampersand has to be processed first, else the
# entities written for < and > would be escaped a second time.
ppt_escape_html()
{
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

cat <<EOF
<html><head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
</head><body><pre>
EOF

# The text dumped from the presentation becomes the preformatted body
"$filtersdir/ppt-dump.py" --no-struct-output --dump-text "$infile" |
    ppt_escape_html

echo '</pre></body></html>'
|
||||
@ -6,15 +6,12 @@ import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Processing the output from unrtf
|
||||
class PPTProcessData:
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.out = ""
|
||||
self.gotdata = 0
|
||||
|
||||
# Some versions of unrtf put out a garbled charset line.
|
||||
# Apart from this, we pass the data untouched.
|
||||
def takeLine(self, line):
|
||||
if not self.gotdata:
|
||||
self.out += '''<html><head>''' + \
|
||||
@ -22,7 +19,7 @@ class PPTProcessData:
|
||||
'''content="text/html;charset=UTF-8">''' + \
|
||||
'''</head><body><pre>'''
|
||||
self.gotdata = True
|
||||
self.out += self.em.htmlescape(line)
|
||||
self.out += self.em.htmlescape(line) + "<br>\n"
|
||||
|
||||
def wrapData(self):
|
||||
return self.out + '''</pre></body></html>'''
|
||||
|
||||
@ -1,102 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclrtf,v 1.5 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Some inspiration from estraier
|
||||
#================================================================
|
||||
# convert rtf to html, by executing the unrtf program:
|
||||
# http://www.gnu.org/software/unrtf/unrtf.html
|
||||
#================================================================
|
||||
|
||||
# set variables
# The C locale is forced for everything this filter runs. progname is the
# name shown in the usage and error messages: it was misspelled "rclrtl".
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rclrtf"
filetype=rtf
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then
# terminate the filter.
# Arguments: the words of the message (an error code, then details)
# Outputs:   "RECFILTERROR <arguments>" on stdout, and the same arguments
#            prefixed with ":2:$progname:::" on stderr
# Exits:     always, with status 1
senderror()
{
    # "$*" keeps the arguments as given: the unquoted form expanded glob
    # characters and collapsed runs of blanks found in file names.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a name designates an executable file which is not a directory.
# Arguments: $1 - command name. A name containing a slash is tested as a
#            path, any other name is looked up in each directory of $PATH.
# Returns:   0 when such a file exists, 1 otherwise
# Note:      sets the global variables cmd, oldifs and d.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; fi
        return 1 ;;
    *)
        # Split $PATH on colons into the positional parameters, then put
        # the field separator back.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # Quoting keeps a directory name containing blanks in one piece.
        for d in "$@"; do
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program needed by the filter can be found.
# Arguments: the command names (or paths) to look for
# Exits:     through "senderror HELPERNOTFOUND <name>" at the first name
#            rejected by iscmd; returns normally when all are found
checkcmds()
{
    # "$@" hands each name over unsplit, so that a path containing blanks
    # is checked as a whole.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
# Exactly one argument is expected: anything else, or --help, prints the
# usage text and exits with status 1. The conditions are separate test
# commands because the -o and -a operators are obsolescent and become
# ambiguous when an operand looks like an operator.
if test $# -ne 1 || test "$1" = "--help"
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if test "X$infile" != X- && test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds awk unrtf
|
||||
|
||||
# output the result
|
||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
||||
# is an awk program
|
||||
# The thing about the charset is that unrtf outputs a garbled one.
|
||||
unrtf --nopict --html "$infile" 2> /dev/null |
|
||||
awk 'BEGIN'\
|
||||
' {
|
||||
gothead = 0
|
||||
}
|
||||
/<\/head>/{
|
||||
if (gothead == 0) {
|
||||
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
|
||||
gothead = 1
|
||||
}
|
||||
}
|
||||
/<meta http-equiv=/{next}
|
||||
{
|
||||
print
|
||||
}
|
||||
'
|
||||
@ -1,225 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from an openoffice/soffice file
|
||||
#
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclsoff"
|
||||
filetype=openoffice
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller.
# Arguments: the words of the error message, e.g. HELPERNOTFOUND <cmd>.
# Outputs:   "RECFILTERROR <message>" on stdout, and a copy tagged with
#            $progname on stderr.
# Returns:   never; terminates the script with status 1.
senderror()
{
    # "$*" is quoted so that file names in the message are neither
    # glob-expanded nor have their inner whitespace squeezed.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command can be executed.
# Arguments: $1 - command name; if it contains a slash it is checked as a
#            path, otherwise it is looked up in each directory of $PATH.
# Returns:   0 if an executable non-directory was found, 1 otherwise.
# Note: sets the global variables cmd, oldifs and d (POSIX sh has no `local`).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into this function's positional parameters.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH component stands for the current directory.
            test -n "$d" || d=.
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Verify that every helper program needed by the filter is available.
# Arguments: command names or paths, each checked with iscmd.
# Returns:   0 if all were found; otherwise does not return, because
#            senderror reports "HELPERNOTFOUND <cmd>" and exits.
checkcmds()
{
    # "$@" keeps each argument intact, even if it contains whitespace.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc unzip

# We need a temporary directory: use $RECOLL_TMPDIR if set, else $TMPDIR,
# else /tmp.
if test z"$RECOLL_TMPDIR" != z; then
    ttdir=$RECOLL_TMPDIR
elif test z"$TMPDIR" != z ; then
    ttdir=$TMPDIR
else
    ttdir=/tmp
fi
tmpdir=$ttdir/rclsoff_tmp$$
# mkdir fails if the name already exists, so a directory created by
# somebody else is never reused. The expansions are quoted so that a
# temporary directory path containing whitespace works.
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclsofftmp" || exit 1

# Remove our temporary directory tree.
cleanup()
{
    # Note that we're using a constant part (rclsofftmp), which hopefully
    # guarantees that we can't make big mistakes here.
    rm -rf "$tmpdir/rclsofftmp"
    rmdir "$tmpdir"
}

# Clean up on exit. On a signal, exit instead of resuming the script with
# the temporary directory gone; exiting then runs the EXIT trap.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM

# Unzip the input file and change to the unzipped directory
unzip -q -d "$tmpdir/rclsofftmp" "$infile"
# Never run the xsltproc steps from an unexpected working directory.
cd "$tmpdir/rclsofftmp" || senderror "cannot cd to $tmpdir/rclsofftmp"
|
||||
|
||||
echo '<html><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
|
||||
|
||||
xsltproc --novalid --nonet - meta.xml <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
||||
xmlns:ooo="http://openoffice.org/2004/office"
|
||||
exclude-result-prefixes="office xlink meta ooo dc"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/office:document-meta">
|
||||
<xsl:apply-templates select="office:meta/dc:description"/>
|
||||
<xsl:apply-templates select="office:meta/dc:subject"/>
|
||||
<xsl:apply-templates select="office:meta/dc:title"/>
|
||||
<xsl:apply-templates select="office:meta/meta:keyword"/>
|
||||
<xsl:apply-templates select="office:meta/dc:creator"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:title">
|
||||
<title> <xsl:value-of select="."/> </title><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:description">
|
||||
<meta>
|
||||
<xsl:attribute name="name">abstract</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:subject">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:creator">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="meta:keyword">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
|
||||
echo '</head><body>'
|
||||
|
||||
xsltproc --novalid --nonet - content.xml <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
||||
exclude-result-prefixes="text"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="text:p">
|
||||
<p><xsl:apply-templates/></p><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:h">
|
||||
<p><xsl:apply-templates/></p><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:s">
|
||||
<xsl:text> </xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:line-break">
|
||||
<br />
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:tab">
|
||||
<xsl:text> </xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
echo '</body></html>'
|
||||
cd /
|
||||
exit 0
|
||||
@ -1,161 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
#================================================================
|
||||
# Extract text from a Scalable Vector Graphics file
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclsvg"
|
||||
filetype=svg
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller.
# Arguments: the words of the error message, e.g. HELPERNOTFOUND <cmd>.
# Outputs:   "RECFILTERROR <message>" on stdout, and a copy tagged with
#            $progname on stderr.
# Returns:   never; terminates the script with status 1.
senderror()
{
    # "$*" is quoted so that file names in the message are neither
    # glob-expanded nor have their inner whitespace squeezed.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command can be executed.
# Arguments: $1 - command name; if it contains a slash it is checked as a
#            path, otherwise it is looked up in each directory of $PATH.
# Returns:   0 if an executable non-directory was found, 1 otherwise.
# Note: sets the global variables cmd, oldifs and d (POSIX sh has no `local`).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into this function's positional parameters.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH component stands for the current directory.
            test -n "$d" || d=.
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Verify that every helper program needed by the filter is available.
# Arguments: command names or paths, each checked with iscmd.
# Returns:   0 if all were found; otherwise does not return, because
#            senderror reports "HELPERNOTFOUND <cmd>" and exits.
checkcmds()
{
    # "$@" keeps each argument intact, even if it contains whitespace.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc
|
||||
|
||||
xsltproc --novalid --nonet - "$infile" <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
exclude-result-prefixes="svg"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/">
|
||||
<html>
|
||||
<head>
|
||||
<xsl:apply-templates select="svg:svg/svg:title"/>
|
||||
<xsl:apply-templates select="svg:svg/svg:desc"/>
|
||||
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:creator"/>
|
||||
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:subject"/>
|
||||
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:description"/>
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates select="//svg:text"/>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="svg:desc">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:creator">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:subject">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:description">
|
||||
<meta>
|
||||
<xsl:attribute name="name">description</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="svg:title">
|
||||
<title><xsl:value-of select="."/></title><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="svg:text">
|
||||
<p><xsl:value-of select="."/></p><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
|
||||
exit 0
|
||||
@ -116,7 +116,7 @@ class SVGExtractor:
|
||||
self.em.rclog("%s: bad data: " % (fn, err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
|
||||
@ -1,91 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rcltext,v 1.1 2008-09-12 11:30:03 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Wrap generic text (ie: program text) in html
|
||||
# Assumes ascii or iso-8859-1
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rcltext"
|
||||
filetype=text
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller.
# Arguments: the words of the error message, e.g. HELPERNOTFOUND <cmd>.
# Outputs:   "RECFILTERROR <message>" on stdout, and a copy tagged with
#            $progname on stderr.
# Returns:   never; terminates the script with status 1.
senderror()
{
    # "$*" is quoted so that file names in the message are neither
    # glob-expanded nor have their inner whitespace squeezed.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command can be executed.
# Arguments: $1 - command name; if it contains a slash it is checked as a
#            path, otherwise it is looked up in each directory of $PATH.
# Returns:   0 if an executable non-directory was found, 1 otherwise.
# Note: sets the global variables cmd, oldifs and d (POSIX sh has no `local`).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into this function's positional parameters.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH component stands for the current directory.
            test -n "$d" || d=.
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Verify that every helper program needed by the filter is available.
# Arguments: command names or paths, each checked with iscmd.
# Returns:   0 if all were found; otherwise does not return, because
#            senderror reports "HELPERNOTFOUND <cmd>" and exits.
checkcmds()
{
    # "$@" keeps each argument intact, even if it contains whitespace.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds sed

# Wrap the text in a minimal HTML document, escaping the characters that
# are special in HTML. '&' is escaped first so that the '&' introduced by
# "&lt;" is not escaped a second time. In a sed replacement a bare '&'
# means "the matched text", hence the backslash before each literal '&'.
echo '<html><head><title></title></head><body><pre>'

sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' "$infile"

echo '</pre></body></html>'
|
||||
@ -1,116 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
#================================================================
|
||||
# Handle excel files for recoll.
|
||||
#================================================================
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclxls"
|
||||
filetype=excel
|
||||
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller.
# Arguments: the words of the error message, e.g. HELPERNOTFOUND <cmd>.
# Outputs:   "RECFILTERROR <message>" on stdout, and a copy tagged with
#            $progname on stderr.
# Returns:   never; terminates the script with status 1.
senderror()
{
    # "$*" is quoted so that file names in the message are neither
    # glob-expanded nor have their inner whitespace squeezed.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command can be executed.
# Arguments: $1 - command name; if it contains a slash it is checked as a
#            path, otherwise it is looked up in each directory of $PATH.
# Returns:   0 if an executable non-directory was found, 1 otherwise.
# Note: sets the global variables cmd, oldifs and d (POSIX sh has no `local`).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into this function's positional parameters.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH component stands for the current directory.
            test -n "$d" || d=.
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Verify that every helper program needed by the filter is available.
# Arguments: command names or paths, each checked with iscmd.
# Returns:   0 if all were found; otherwise does not return, because
#            senderror reports "HELPERNOTFOUND <cmd>" and exits.
checkcmds()
{
    # "$@" keeps each argument intact, even if it contains whitespace.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
# The helper scripts are looked up in the directory containing this filter.
top=$(dirname "$0")
XLSDUMP="$top/xls-dump.py"
XMLTOCSV="$top/xlsxmltocsv.py"

# Check both helpers. The variable holding the second one is XMLTOCSV;
# the paths are quoted in case the install directory contains whitespace.
checkcmds "$XLSDUMP" "$XMLTOCSV"

# output the result
echo '<html><head>'
#echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>'
echo '<pre>'

# Pipe the xls-dump.py output through xlsxmltocsv.py, then escape the
# characters that are special inside the HTML <pre> element. '&' must be
# escaped first so that the '&' of "&lt;" is not escaped again; in a sed
# replacement a literal '&' has to be written '\&'.
"$XLSDUMP" --dump-mode=canonical-xml --utf-8 --catch "$infile" |
    "$XMLTOCSV" |
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'

echo '</pre>'
echo '</body></html>'

# exit normally
exit 0
|
||||
@ -8,7 +8,6 @@ import sys
|
||||
import os
|
||||
import xml.sax
|
||||
|
||||
# Processing the output from unrtf
|
||||
class XLSProcessData:
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
@ -16,8 +15,6 @@ class XLSProcessData:
|
||||
self.gotdata = 0
|
||||
self.xmldata = ""
|
||||
|
||||
# Some versions of unrtf put out a garbled charset line.
|
||||
# Apart from this, we pass the data untouched.
|
||||
def takeLine(self, line):
|
||||
if not self.gotdata:
|
||||
self.out += '''<html><head>''' + \
|
||||
|
||||
@ -1,119 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
#================================================================
|
||||
# Extract text from a generic XML file (Justus Piater)
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclxml"
|
||||
filetype=xml
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller.
# Arguments: the words of the error message, e.g. HELPERNOTFOUND <cmd>.
# Outputs:   "RECFILTERROR <message>" on stdout, and a copy tagged with
#            $progname on stderr.
# Returns:   never; terminates the script with status 1.
senderror()
{
    # "$*" is quoted so that file names in the message are neither
    # glob-expanded nor have their inner whitespace squeezed.
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}

# Tell whether a command can be executed.
# Arguments: $1 - command name; if it contains a slash it is checked as a
#            path, otherwise it is looked up in each directory of $PATH.
# Returns:   0 if an executable non-directory was found, 1 otherwise.
# Note: sets the global variables cmd, oldifs and d (POSIX sh has no `local`).
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" && test ! -d "$cmd"; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into this function's positional parameters.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH component stands for the current directory.
            test -n "$d" || d=.
            if test -x "$d/$cmd" && test ! -d "$d/$cmd"; then return 0; fi
        done
        return 1 ;;
    esac
}

# Verify that every helper program needed by the filter is available.
# Arguments: command names or paths, each checked with iscmd.
# Returns:   0 if all were found; otherwise does not return, because
#            senderror reports "HELPERNOTFOUND <cmd>" and exits.
checkcmds()
{
    # "$@" keeps each argument intact, even if it contains whitespace.
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc
|
||||
|
||||
xsltproc --novalid --nonet - "$infile" <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/">
|
||||
<html>
|
||||
<head>
|
||||
<xsl:if test="//*[local-name() = 'title']">
|
||||
<title>
|
||||
<xsl:value-of select="//*[local-name() = 'title'][1]"/>
|
||||
</title>
|
||||
</xsl:if>
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates/>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()">
|
||||
<xsl:if test="string-length(normalize-space(.)) > 0">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
<xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*">
|
||||
<xsl:apply-templates/>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
|
||||
exit 0
|
||||
@ -74,7 +74,7 @@ class XMLExtractor:
|
||||
self.em.rclog("%s: bad data: " % (fn, err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
|
||||
@ -51,7 +51,7 @@ application/javascript = internal text/plain
|
||||
# - with unrtf: rtf files disguising as doc files.
|
||||
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
||||
# important for you than catching all data,
|
||||
application/msword = exec rcldoc
|
||||
application/msword = execm rcldoc.py
|
||||
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
|
||||
# You can also use wvware directly but it's much slower.
|
||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||
@ -59,41 +59,40 @@ application/msword = exec rcldoc
|
||||
# Also Handle the mime type returned by "file -i" for a suffix-less word
|
||||
# file. This could probably just as well be an excel file, but we have to
|
||||
# choose one.
|
||||
application/vnd.ms-office = exec rcldoc
|
||||
application/vnd.ms-office = execm rcldoc.py
|
||||
|
||||
application/ogg = execm rclaudio
|
||||
application/pdf = exec rclpdf
|
||||
# application/pdf = execm rclmpdf.py
|
||||
application/pdf = execm rclmpdf.py
|
||||
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
||||
application/vnd.ms-excel = exec rclxls
|
||||
application/vnd.ms-powerpoint = exec rclppt
|
||||
application/vnd.oasis.opendocument.text = exec rclsoff
|
||||
application/vnd.oasis.opendocument.text-template = exec rclsoff
|
||||
application/vnd.oasis.opendocument.presentation = exec rclsoff
|
||||
application/vnd.oasis.opendocument.spreadsheet = exec rclsoff
|
||||
application/vnd.oasis.opendocument.graphics = exec rclsoff
|
||||
application/vnd.ms-excel = execm rclxls.py
|
||||
application/vnd.ms-powerpoint = execm rclppt.py
|
||||
application/vnd.oasis.opendocument.text = execm rclsoff.py
|
||||
application/vnd.oasis.opendocument.text-template = execm rclsoff.py
|
||||
application/vnd.oasis.opendocument.presentation = execm rclsoff.py
|
||||
application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py
|
||||
application/vnd.oasis.opendocument.graphics = execm rclsoff.py
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||
exec rclopxml
|
||||
execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
||||
exec rclopxml
|
||||
execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||
exec rclopxml
|
||||
execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||
exec rclopxml
|
||||
execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||
exec rclopxml
|
||||
execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
|
||||
exec rclopxml
|
||||
application/vnd.sun.xml.calc = exec rclsoff
|
||||
application/vnd.sun.xml.calc.template = exec rclsoff
|
||||
application/vnd.sun.xml.draw = exec rclsoff
|
||||
application/vnd.sun.xml.draw.template = exec rclsoff
|
||||
application/vnd.sun.xml.impress = exec rclsoff
|
||||
application/vnd.sun.xml.impress.template = exec rclsoff
|
||||
application/vnd.sun.xml.math = exec rclsoff
|
||||
application/vnd.sun.xml.writer = exec rclsoff
|
||||
application/vnd.sun.xml.writer.global = exec rclsoff
|
||||
application/vnd.sun.xml.writer.template = exec rclsoff
|
||||
execm rclopxml.py
|
||||
application/vnd.sun.xml.calc = execm rclsoff.py
|
||||
application/vnd.sun.xml.calc.template = execm rclsoff.py
|
||||
application/vnd.sun.xml.draw = execm rclsoff.py
|
||||
application/vnd.sun.xml.draw.template = execm rclsoff.py
|
||||
application/vnd.sun.xml.impress = execm rclsoff.py
|
||||
application/vnd.sun.xml.impress.template = execm rclsoff.py
|
||||
application/vnd.sun.xml.math = execm rclsoff.py
|
||||
application/vnd.sun.xml.writer = execm rclsoff.py
|
||||
application/vnd.sun.xml.writer.global = execm rclsoff.py
|
||||
application/vnd.sun.xml.writer.template = execm rclsoff.py
|
||||
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
|
||||
application/x-abiword = exec rclabw
|
||||
application/x-awk = internal text/plain
|
||||
@ -101,7 +100,7 @@ application/x-chm = execm rclchm
|
||||
application/x-dia-diagram = execm rcldia;mimetype=text/plain
|
||||
application/x-dvi = exec rcldvi
|
||||
application/x-flac = execm rclaudio
|
||||
application/x-gnote = exec rclxml
|
||||
application/x-gnote = execm rclxml.py
|
||||
application/x-gnuinfo = execm rclinfo
|
||||
application/x-gnumeric = exec rclgnm
|
||||
application/x-kword = exec rclkwd
|
||||
@ -124,14 +123,14 @@ audio/mpeg = execm rclaudio
|
||||
audio/mp4 = execm rclaudio
|
||||
audio/aac = execm rclaudio
|
||||
audio/x-karaoke = execm rclkar
|
||||
image/gif = execm rclimg
|
||||
image/jp2 = execm rclimg
|
||||
image/jpeg = execm rclimg
|
||||
image/png = execm rclimg
|
||||
image/tiff = execm rclimg
|
||||
image/gif = execm rclimg.py
|
||||
image/jp2 = execm rclimg.py
|
||||
image/jpeg = execm rclimg.py
|
||||
image/png = execm rclimg.py
|
||||
image/tiff = execm rclimg.py
|
||||
image/vnd.djvu = exec rcldjvu
|
||||
image/svg+xml = exec rclsvg
|
||||
image/x-xcf = execm rclimg
|
||||
image/svg+xml = execm rclsvg.py
|
||||
image/x-xcf = execm rclimg.py
|
||||
inode/symlink = internal
|
||||
application/x-zerosize = internal
|
||||
inode/x-empty = internal application/x-zerosize
|
||||
@ -159,9 +158,8 @@ text/x-python = exec rclpython
|
||||
text/x-shellscript = internal text/plain
|
||||
text/x-srt = internal text/plain
|
||||
text/x-tex = exec rcltex
|
||||
|
||||
application/xml = exec rclxml
|
||||
text/xml = exec rclxml
|
||||
application/xml = execm rclxml.py
|
||||
text/xml = execm rclxml.py
|
||||
# Using these instead of the two above would index all parameter and tag
|
||||
# names, attribute values etc, instead of just the text content.
|
||||
#application/xml = internal text/plain
|
||||
|
||||
@ -135,7 +135,8 @@ message/rfc822 = internal
|
||||
text/calendar = execm python rclics;mimetype=text/plain
|
||||
text/html = internal
|
||||
text/plain = internal
|
||||
text/rtf = execm python rclrtf.py
|
||||
text/rtf = exec unrtf --nopict --html;mimetype=text/html
|
||||
#text/rtf = execm python rclrtf.py
|
||||
text/x-c = internal
|
||||
text/x-c++ = internal
|
||||
text/x-c+ = internal
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user