Use the python-based filters written for ms-win on Linux too
This commit is contained in:
parent
dc9d9900be
commit
4c3e112c27
@ -90,11 +90,8 @@ src/qtgui/Makefile
|
|||||||
src/qtgui/qrc_recoll.cpp
|
src/qtgui/qrc_recoll.cpp
|
||||||
src/qtgui/recoll
|
src/qtgui/recoll
|
||||||
src/qtgui/recoll.app
|
src/qtgui/recoll.app
|
||||||
src/query/alldeps
|
|
||||||
src/query/recollq
|
|
||||||
src/sampleconf/rclmon.sh
|
src/sampleconf/rclmon.sh
|
||||||
src/sampleconf/recoll.conf
|
src/sampleconf/recoll.conf
|
||||||
src/utils/alldeps
|
|
||||||
tests/casediac/aspdict.en.rws
|
tests/casediac/aspdict.en.rws
|
||||||
tests/casediac/idxstatus.txt
|
tests/casediac/idxstatus.txt
|
||||||
tests/casediac/index.pid
|
tests/casediac/index.pid
|
||||||
|
|||||||
@ -1,176 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rcldoc,v 1.8 2007-06-08 13:51:08 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# Parts taken from Estraier:
|
|
||||||
#================================================================
|
|
||||||
# Estraier: a personal full-text search system
|
|
||||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
|
||||||
#================================================================
|
|
||||||
#================================================================
|
|
||||||
# Extract text from an msword file by executing either antiword
|
|
||||||
# or wvware
|
|
||||||
#
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rcldoc"
|
|
||||||
filetype=ms-word
|
|
||||||
|
|
||||||
decoder="antiword -t -i 1 -m UTF-8"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds awk antiword iconv
|
|
||||||
|
|
||||||
# We need to do some strange stuff to retrieve the status from antiword. Things
|
|
||||||
# would be simpler if we relied on using bash.
|
|
||||||
# Explanations:
|
|
||||||
#http://stackoverflow.com/questions/1221833/bash-pipe-output-and-capture-exit-status
|
|
||||||
|
|
||||||
stdintoexitstatus() {
|
|
||||||
read exitstatus
|
|
||||||
return $exitstatus
|
|
||||||
}
|
|
||||||
|
|
||||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
|
||||||
# is an awk program
|
|
||||||
(((($decoder "$infile"; echo $? >&3) |
|
|
||||||
awk 'BEGIN'\
|
|
||||||
' {
|
|
||||||
cont = ""
|
|
||||||
gotdata = 0
|
|
||||||
}
|
|
||||||
{
|
|
||||||
if (!($0 ~ /^[ ]*$/) && gotdata == 0) {
|
|
||||||
print "<html><head><title></title>"
|
|
||||||
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
|
|
||||||
print "</head>\n<body>\n<p>"
|
|
||||||
gotdata = 1
|
|
||||||
}
|
|
||||||
$0 = cont $0
|
|
||||||
cont = ""
|
|
||||||
|
|
||||||
if ($0 ~ /[-]$/) {
|
|
||||||
# Note : soft-hyphen is iso8859 0xad
|
|
||||||
# Break at last whitespace
|
|
||||||
match($0, "[ \t][^ \t]+$")
|
|
||||||
line = substr($0, 0, RSTART)
|
|
||||||
cont = substr($0, RSTART, RLENGTH-1)
|
|
||||||
$0 = line
|
|
||||||
}
|
|
||||||
|
|
||||||
if($0 == "\f") {
|
|
||||||
print "</p><hr><p>"
|
|
||||||
next
|
|
||||||
}
|
|
||||||
|
|
||||||
if (gotdata == 1) {
|
|
||||||
gsub(/&/, "\\&amp;", $0)
|
|
||||||
gsub(/</, "\\&lt;", $0)
|
|
||||||
gsub(/>/, "\\&gt;", $0)
|
|
||||||
|
|
||||||
print $0 "<br>"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
END {
|
|
||||||
if (gotdata == 1)
|
|
||||||
print "</p></body></html>"
|
|
||||||
}' >&4) 3>&1) | stdintoexitstatus) 4>&1
|
|
||||||
|
|
||||||
|
|
||||||
# Antiword rarely fails, we try to catch the most common reasons:
|
|
||||||
if test $? -eq 1 ; then
|
|
||||||
# Check actual document type
|
|
||||||
mtype=`file -b -i "$infile" | awk '{sub(";", "", $1);print $1}'`
|
|
||||||
|
|
||||||
if test X"$mtype" = Xtext/rtf; then
|
|
||||||
# RTF document disguising as msword either because it has a .doc
|
|
||||||
# extension or because it's an attachment with a wrong mime.
|
|
||||||
exec `dirname $0`/rclrtf "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test X"$mtype" = Xtext/plain; then
|
|
||||||
# Someone gave a .doc ext to their texts. Happens...
|
|
||||||
exec `dirname $0`/rcltext "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test X"$mtype" = Xapplication/msword; then
|
|
||||||
# Actually application/msword: try wvWare, which is much
|
|
||||||
# slower and we don't use it by default, but it handles some
|
|
||||||
# files that antiword won't, so use it as a last resort.
|
|
||||||
if iscmd wvWare ; then
|
|
||||||
exec wvWare --nographics --charset=utf-8 "$infile"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# else let the error be...
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
@ -1,238 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
#================================================================
|
|
||||||
# Extract text from an openxml msword file (will be extended for spreadsheets)
|
|
||||||
# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname=rclopxml
|
|
||||||
filetype=openxml
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds xsltproc unzip
|
|
||||||
|
|
||||||
# We need a temporary directory
|
|
||||||
if test z"$RECOLL_TMPDIR" != z; then
|
|
||||||
ttdir=$RECOLL_TMPDIR
|
|
||||||
elif test z"$TMPDIR" != z ; then
|
|
||||||
ttdir=$TMPDIR
|
|
||||||
else
|
|
||||||
ttdir=/tmp
|
|
||||||
fi
|
|
||||||
tmpdir=$ttdir/rclopxml_tmp$$
|
|
||||||
mkdir $tmpdir || exit 1
|
|
||||||
mkdir $tmpdir/rclopxmltmp || exit 1
|
|
||||||
|
|
||||||
cleanup()
|
|
||||||
{
|
|
||||||
# Note that we're using a constant part (rclopxmltmp), that hopefully
|
|
||||||
# guarantees that we can't do big mistakes here.
|
|
||||||
rm -rf $tmpdir/rclopxmltmp
|
|
||||||
rmdir $tmpdir
|
|
||||||
}
|
|
||||||
|
|
||||||
trap cleanup EXIT HUP QUIT INT TERM
|
|
||||||
|
|
||||||
# Unzip the input file and change to the unzipped directory
|
|
||||||
unzip -q -d $tmpdir/rclopxmltmp "$infile"
|
|
||||||
cd $tmpdir/rclopxmltmp
|
|
||||||
|
|
||||||
echo '<html>
|
|
||||||
<head>'
|
|
||||||
|
|
||||||
xsltproc --novalid --nonet - docProps/core.xml <<EOF
|
|
||||||
<?xml version="1.0"?>
|
|
||||||
<xsl:stylesheet
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
|
||||||
xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
||||||
xmlns:dcterms="http://purl.org/dc/terms/"
|
|
||||||
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
|
||||||
|
|
||||||
<!-- <xsl:output method="text"/> -->
|
|
||||||
<xsl:output omit-xml-declaration="yes"/>
|
|
||||||
|
|
||||||
<xsl:template match="cp:coreProperties">
|
|
||||||
<xsl:text> </xsl:text>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
|
||||||
<xsl:text> </xsl:text>
|
|
||||||
<xsl:apply-templates/>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:creator">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">
|
|
||||||
<!-- <xsl:value-of select="name()"/> pour sortir tous les meta avec
|
|
||||||
le meme nom que dans le xml (si on devenait dc-natif) -->
|
|
||||||
<xsl:text>author</xsl:text>
|
|
||||||
</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta>
|
|
||||||
<xsl:text> </xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dcterms:modified">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">
|
|
||||||
<xsl:text>date</xsl:text>
|
|
||||||
</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta>
|
|
||||||
<xsl:text> </xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="*">
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
</xsl:stylesheet>
|
|
||||||
EOF
|
|
||||||
|
|
||||||
echo '</head>
|
|
||||||
<body>'
|
|
||||||
|
|
||||||
filename=''
|
|
||||||
if test -f word/document.xml ; then
|
|
||||||
filenames=word/document.xml
|
|
||||||
tagmatch="w:p"
|
|
||||||
xmlns_decls='
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
|
||||||
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
|
||||||
xmlns:o="urn:schemas-microsoft-com:office:office"
|
|
||||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
||||||
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
|
|
||||||
xmlns:v="urn:schemas-microsoft-com:vml"
|
|
||||||
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
|
|
||||||
xmlns:w10="urn:schemas-microsoft-com:office:word"
|
|
||||||
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
||||||
xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
|
|
||||||
'
|
|
||||||
|
|
||||||
elif test -f xl/sharedStrings.xml ; then
|
|
||||||
filenames=xl/sharedStrings.xml
|
|
||||||
tagmatch='x:t'
|
|
||||||
xmlns_decls='
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
|
||||||
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
|
||||||
'
|
|
||||||
|
|
||||||
elif test -f ppt/slides/slide1.xml ; then
|
|
||||||
filenames=`echo ppt/slides/slide*.xml`
|
|
||||||
tagmatch='a:t'
|
|
||||||
xmlns_decls='
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
|
||||||
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
||||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
||||||
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
||||||
'
|
|
||||||
# I want to suppress text output for all except a:t, don't know how to do it
|
|
||||||
# help ! At least get rid of these:
|
|
||||||
moretemplates='
|
|
||||||
<xsl:template match="p:attrName">
|
|
||||||
</xsl:template>
|
|
||||||
'
|
|
||||||
else
|
|
||||||
# ??
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
for filename in $filenames;do
|
|
||||||
xsltproc --novalid --nonet - $filename <<EOF
|
|
||||||
<?xml version="1.0"?>
|
|
||||||
<xsl:stylesheet $xmlns_decls >
|
|
||||||
|
|
||||||
<xsl:output omit-xml-declaration="yes"/>
|
|
||||||
|
|
||||||
<xsl:template match="/">
|
|
||||||
<div>
|
|
||||||
<xsl:apply-templates/>
|
|
||||||
</div>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="$tagmatch">
|
|
||||||
<p>
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</p>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
$moretemplates
|
|
||||||
|
|
||||||
</xsl:stylesheet>
|
|
||||||
EOF
|
|
||||||
done
|
|
||||||
|
|
||||||
echo '</html>'
|
|
||||||
@ -1,351 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
#================================================================
|
|
||||||
# Copyright (C) 2015 J.F. Dockes
|
|
||||||
# There used to be Estraier content in there, but I quite believe that is not
|
|
||||||
# the case any more.
|
|
||||||
# This file is licensed under the GPL v2
|
|
||||||
#================================================================
|
|
||||||
# Convert a pdf file to HTML.
|
|
||||||
#
|
|
||||||
# We use pdftotext from the xpdf/poppler-utils package.
|
|
||||||
#
|
|
||||||
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
|
||||||
# We try to correct.
|
|
||||||
#
|
|
||||||
# If pdftotext produces no text and tesseract is available, we try to
|
|
||||||
# perform OCR. As this can be very slow and the result not always
|
|
||||||
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
|
|
||||||
#
|
|
||||||
# We guess the OCR language in order of preference:
|
|
||||||
# - From the content of a ".ocrpdflang" file if it exists in the same
|
|
||||||
# directory as the PDF
|
|
||||||
# - From an RECOLL_TESSERACT_LANG environment variable
|
|
||||||
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
|
||||||
# - Default to "eng"
|
|
||||||
#
|
|
||||||
# Uncomment the following if you get better results without. The
|
|
||||||
# pdftotext manual says that the option is no longer recommended The
|
|
||||||
# difference in output seems mostly the removal of soft-hyphens when
|
|
||||||
# -raw is not set
|
|
||||||
# optionraw=-raw
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
progname="rclpdf"
|
|
||||||
filetype=pdf
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds pdftotext iconv awk
|
|
||||||
|
|
||||||
ocrpossible=0
|
|
||||||
if iscmd tesseract; then
|
|
||||||
if iscmd pdftoppm; then
|
|
||||||
ocrpossible=1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
confdir=${RECOLL_CONFDIR:-~/.recoll}
|
|
||||||
test ! -f "$confdir/ocrpdf" && ocrpossible=0
|
|
||||||
|
|
||||||
tmpdir=
|
|
||||||
|
|
||||||
cleanup()
|
|
||||||
{
|
|
||||||
# Note that we're using a constant part (rclpdftmp), that hopefully
|
|
||||||
# guarantees that we can't do big mistakes with the -rf here.
|
|
||||||
if test ! -z "$tmpdir"; then
|
|
||||||
rm -rf $tmpdir/rclpdftmp
|
|
||||||
rmdir $tmpdir
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
trap cleanup EXIT HUP QUIT INT TERM
|
|
||||||
|
|
||||||
runpdftotext()
|
|
||||||
{
|
|
||||||
# Test poppler version: at some point before 0.24, poppler began
|
|
||||||
# to properly escape text inside the header (but not the body).
|
|
||||||
XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'`
|
|
||||||
MAJOR=`echo $XYZ | cut -d. -f 1`
|
|
||||||
MINOR=`echo $XYZ | cut -d. -f 2`
|
|
||||||
escapeheader=1
|
|
||||||
escapebody=1
|
|
||||||
if test "$MAJOR" -gt 0 ; then
|
|
||||||
escapeheader=0
|
|
||||||
elif test "$MINOR" -ge 24; then
|
|
||||||
escapeheader=0;
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Run pdftotext and fix the result (add a charset tag and fix the
|
|
||||||
# html escaping). The escaping is a half-hearted job. We do try to
|
|
||||||
# fix some header fields, only for those which are single-line.
|
|
||||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
|
||||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
|
||||||
awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\
|
|
||||||
' {
|
|
||||||
inbodypre = 0
|
|
||||||
cont = ""
|
|
||||||
}
|
|
||||||
function escapehtml(s)
|
|
||||||
{
|
|
||||||
gsub(/&/, "\\&amp;", s)
|
|
||||||
gsub(/</, "\\&lt;", s)
|
|
||||||
gsub(/>/, "\\&gt;", s)
|
|
||||||
gsub(/"/, "\\&quot;", s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
{
|
|
||||||
$0 = cont $0
|
|
||||||
cont = ""
|
|
||||||
# Insert charset meta tag at end of header
|
|
||||||
if(inbodypre == 0 && $0 ~ /<\/head>/) {
|
|
||||||
match($0, /<\/head>/)
|
|
||||||
part1 = substr($0, 0, RSTART-1)
|
|
||||||
part2 = substr($0, RSTART, length($0))
|
|
||||||
charsetmeta = "<meta http-equiv=\"Content-Type\" "\
|
|
||||||
"content=\"text/html; charset=UTF-8\">"
|
|
||||||
$0 = part1 charsetmeta "\n" part2
|
|
||||||
}
|
|
||||||
if(inbodypre == 0 && $0 ~ /<title>.*<\/title>/){
|
|
||||||
match($0, /<title>.*<\/title>/)
|
|
||||||
part1 = substr($0, 0, RSTART-1)
|
|
||||||
mid = substr($0, RSTART, RLENGTH)
|
|
||||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
|
||||||
gsub(/<title>/, "", mid)
|
|
||||||
gsub(/<\/title>/, "", mid)
|
|
||||||
if (escapeheader) {
|
|
||||||
mid = escapehtml(mid)
|
|
||||||
}
|
|
||||||
mid = "<title>" mid "</title>"
|
|
||||||
$0 = part1 mid part2
|
|
||||||
}
|
|
||||||
# This matches all single-line meta fields
|
|
||||||
if(inbodypre == 0 && $0 ~ /content=".*"\/>/){
|
|
||||||
match($0, /content=".*"\/>/)
|
|
||||||
part1 = substr($0, 0, RSTART-1)
|
|
||||||
mid = substr($0, RSTART, RLENGTH)
|
|
||||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
|
||||||
gsub(/content="/, "", mid)
|
|
||||||
gsub(/"\/>/, "", mid)
|
|
||||||
if (escapeheader) {
|
|
||||||
mid = escapehtml(mid)
|
|
||||||
}
|
|
||||||
mid = "content=\"" mid "\"/>"
|
|
||||||
$0 = part1 mid part2
|
|
||||||
}
|
|
||||||
|
|
||||||
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
|
||||||
# "Subject" metadata field is more like an HTML "description"
|
|
||||||
if(inbodypre == 0 && $0 ~ /<meta ?name="Subject"/){
|
|
||||||
gsub(/="Subject"/, "=\"Description\"", $0)
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($0 == "<pre>"){
|
|
||||||
# Begin of body text.
|
|
||||||
inbodypre++
|
|
||||||
print $0
|
|
||||||
next
|
|
||||||
} else if ($0 ~ /<\/pre>/){
|
|
||||||
inbodypre--
|
|
||||||
print $0
|
|
||||||
next
|
|
||||||
} else if ($0 ~ /[-]$/) {
|
|
||||||
# Note : soft-hyphen is iso8859 0xad
|
|
||||||
# Break at last whitespace
|
|
||||||
match($0, "[ \t][^ \t]+$")
|
|
||||||
line = substr($0, 0, RSTART)
|
|
||||||
cont = substr($0, RSTART, RLENGTH-1)
|
|
||||||
$0 = line
|
|
||||||
# print "LINE [" $0 "] CONT[" cont "]"
|
|
||||||
}
|
|
||||||
if(inbodypre > 0 && escapebody){
|
|
||||||
$0 = escapehtml($0)
|
|
||||||
}
|
|
||||||
print $0
|
|
||||||
}
|
|
||||||
'
|
|
||||||
}
|
|
||||||
|
|
||||||
# If we're not equipped for ocr, just run pdftotext to stdout
|
|
||||||
if test $ocrpossible -eq 0; then
|
|
||||||
runpdftotext
|
|
||||||
exit $?
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
# tesseract is installed, prepare for running it.
|
|
||||||
# We need to check the pdftotext output, but we don't want to run
|
|
||||||
# it twice. Use a temporary file.
|
|
||||||
if test z"$RECOLL_TMPDIR" != z; then
|
|
||||||
ttdir=$RECOLL_TMPDIR
|
|
||||||
elif test z"$TMPDIR" != z ; then
|
|
||||||
ttdir=$TMPDIR
|
|
||||||
else
|
|
||||||
ttdir=/tmp
|
|
||||||
fi
|
|
||||||
tmpdir=$ttdir/rclpdf_tmp$$
|
|
||||||
mkdir $tmpdir || senderror mkdir $tmpdir failed
|
|
||||||
mkdir $tmpdir/rclpdftmp || senderror mkdir $tmpdir/rclpdftmp failed
|
|
||||||
|
|
||||||
# Run pdftotext into the temp file
|
|
||||||
pdftxtfile=$tmpdir/rclpdftmp/pdftxtfile
|
|
||||||
runpdftotext > $pdftxtfile
|
|
||||||
|
|
||||||
# If text is big, or small but not only tags and empty lines, output
|
|
||||||
# it. Given the contents check which we perform, a file in which the
|
|
||||||
# only text content is metadata (pdf description field), will be run
|
|
||||||
# through OCR, which is not necessarily what we would want. It would
|
|
||||||
# be possible to detect the situation if this proved an issue.
|
|
||||||
txtsize=`ls -l $pdftxtfile | awk '{print $5}'`
|
|
||||||
txtempty=0
|
|
||||||
# Use grep to check if there is regular text in there. Only do it on
|
|
||||||
# small outputs
|
|
||||||
if test $txtsize -lt 5000 ; then
|
|
||||||
realtext=`egrep -v '^[[:space:]]*$|^[[:space:]]*<.*>[[:space:]]*$' $pdftxtfile`
|
|
||||||
test -z "$realtext" && txtempty=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test $txtempty -eq 0; then
|
|
||||||
# pdftotext produced actual output, use it. No OCR
|
|
||||||
cat $pdftxtfile
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# PDF has no text content and tesseract is available. Give it a try
|
|
||||||
pdflangfile=`dirname "$infile"`/.ocrpdflang
|
|
||||||
if test -f "$pdflangfile"; then
|
|
||||||
tesseractlang=`cat "$pdflangfile"`
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Try to guess tesseract language. This should depend on the input
|
|
||||||
# file, but we have no general way to determine it. So use the
|
|
||||||
# environment and hope for the best.
|
|
||||||
if test -z "$tesseractlang"; then
|
|
||||||
tesseractlang=${RECOLL_TESSERACT_LANG}
|
|
||||||
if test -z "$tesseractlang"; then
|
|
||||||
# Half assed trial to guess from LANG then default to english
|
|
||||||
localelang=`echo $LANG | awk -F_ '{print $1}'`
|
|
||||||
# echo localelang "$localelang" >&2
|
|
||||||
case "$localelang" in
|
|
||||||
en) tesseractlang=eng;;
|
|
||||||
de) tesseractlang=deu;;
|
|
||||||
fr) tesseractlang=fra;;
|
|
||||||
# Someone will have to add more tesseract language codes here.
|
|
||||||
esac
|
|
||||||
|
|
||||||
# Only fall back to the language stored in $confdir/ocrpdf when the locale guess above found nothing
test -z "$tesseractlang" && tesseractlang=`cat "$confdir/ocrpdf"`
|
|
||||||
|
|
||||||
test -z "$tesseractlang" && tesseractlang="eng"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# echo tesseractlang "$tesseractlang" >&2
|
|
||||||
|
|
||||||
TESSERRORFILE="$tmpdir/rclpdftmp/tesserrorfile"
|
|
||||||
TMPFILE="$tmpdir/rclpdftmp/ocrXXXXXX"
|
|
||||||
|
|
||||||
# split pdf-pages
|
|
||||||
ERR_MSG=$(pdftoppm -r 300 "$infile" "$TMPFILE" 2>&1)
|
|
||||||
if [ $? -ne 0 ] ; then
|
|
||||||
senderror "pdftoppm: $ERR_MSG"
|
|
||||||
fi
|
|
||||||
|
|
||||||
for i in $TMPFILE* ; do
|
|
||||||
if [ -s "$i" ] ; then
|
|
||||||
|
|
||||||
tesseract $i $i -l $tesseractlang > $TESSERRORFILE 2>&1
|
|
||||||
TESSERR=$?
|
|
||||||
# ignore tesseract start message
|
|
||||||
LINECOUNT=$(wc -l < $TESSERRORFILE)
|
|
||||||
if [ $TESSERR -ne 0 -o $LINECOUNT -gt 1 ] ; then
|
|
||||||
echo "tesseract-error $TESSERR page $i in $infile" >&2
|
|
||||||
# sort "compacts" leptonica-output
|
|
||||||
cat $TESSERRORFILE | sort -u >&2
|
|
||||||
fi
|
|
||||||
# else
|
|
||||||
# debugging purpose
|
|
||||||
# SICFILE=$(mktemp -p $tmpdir -t sicXXXXXX)
|
|
||||||
# echo "no pdftoppm in $infile cp to $SICFILE" >&2
|
|
||||||
# cp -a $infile $SICFILE
|
|
||||||
# fi
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# don't output "empty" HTML-Files
|
|
||||||
CHARS=$(cat "$TMPFILE"*.txt 2>/dev/null | wc -m)
|
|
||||||
if [ "$CHARS" -gt 0 ] ; then
|
|
||||||
echo "<HTML><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>"
|
|
||||||
cat "$TMPFILE"*.txt | \
|
|
||||||
awk '{
|
|
||||||
gsub(/&/, "\\&amp;", $0)
|
|
||||||
gsub(/</, "\\&lt;", $0)
|
|
||||||
gsub(/>/, "\\&gt;", $0)
|
|
||||||
print $0
|
|
||||||
}
|
|
||||||
'
|
|
||||||
echo "</pre></body></html>"
|
|
||||||
fi
|
|
||||||
@ -1,110 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# This program is free software; you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License as published by
|
|
||||||
# the Free Software Foundation; either version 2 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU General Public License for more details.
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU General Public License
|
|
||||||
# along with this program; if not, write to the
|
|
||||||
# Free Software Foundation, Inc.,
|
|
||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
||||||
|
|
||||||
#================================================================
|
|
||||||
# Handle powerpoint files for recoll.
|
|
||||||
# Use unoconv, this is very slow, but catppt just can't handle the majority
|
|
||||||
# of semi-modern ppt files
|
|
||||||
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclppt"
|
|
||||||
filetype=powerpoint
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
filtersdir=`dirname $0`
|
|
||||||
checkcmds $filtersdir/ppt-dump.py
|
|
||||||
|
|
||||||
mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text"
|
|
||||||
|
|
||||||
cat <<EOF
|
|
||||||
<html><head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
|
|
||||||
</head><body><pre>
|
|
||||||
EOF
|
|
||||||
|
|
||||||
$mso "$infile"| sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
|
|
||||||
|
|
||||||
echo '</pre></body></html>'
|
|
||||||
@ -6,15 +6,12 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Processing the output from unrtf
|
|
||||||
class PPTProcessData:
|
class PPTProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = ""
|
self.out = ""
|
||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
|
|
||||||
# Some versions of unrtf put out a garbled charset line.
|
|
||||||
# Apart from this, we pass the data untouched.
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += '''<html><head>''' + \
|
self.out += '''<html><head>''' + \
|
||||||
@ -22,7 +19,7 @@ class PPTProcessData:
|
|||||||
'''content="text/html;charset=UTF-8">''' + \
|
'''content="text/html;charset=UTF-8">''' + \
|
||||||
'''</head><body><pre>'''
|
'''</head><body><pre>'''
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
self.out += self.em.htmlescape(line)
|
self.out += self.em.htmlescape(line) + "<br>\n"
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
return self.out + '''</pre></body></html>'''
|
return self.out + '''</pre></body></html>'''
|
||||||
|
|||||||
@ -1,102 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rclrtf,v 1.5 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# Some inspiration from estraier
|
|
||||||
#================================================================
|
|
||||||
# convert rtf to html, by executing the unrtf program:
|
|
||||||
# http://www.gnu.org/software/unrtf/unrtf.html
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclrtl"
|
|
||||||
filetype=rtf
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds awk unrtf
|
|
||||||
|
|
||||||
# output the result
|
|
||||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
|
||||||
# is an awk program
|
|
||||||
# The thing about the charset is that unrtf outputs a garbled one.
|
|
||||||
unrtf --nopict --html "$infile" 2> /dev/null |
|
|
||||||
awk 'BEGIN'\
|
|
||||||
' {
|
|
||||||
gothead = 0
|
|
||||||
}
|
|
||||||
/<\/head>/{
|
|
||||||
if (gothead == 0) {
|
|
||||||
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
|
|
||||||
gothead = 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/<meta http-equiv=/{next}
|
|
||||||
{
|
|
||||||
print
|
|
||||||
}
|
|
||||||
'
|
|
||||||
@ -1,225 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# Parts taken from Estraier:
|
|
||||||
#================================================================
|
|
||||||
# Estraier: a personal full-text search system
|
|
||||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
|
||||||
#================================================================
|
|
||||||
#================================================================
|
|
||||||
# Extract text from an openoffice/soffice file
|
|
||||||
#
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclsoff"
|
|
||||||
filetype=openoffice
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds xsltproc unzip
|
|
||||||
|
|
||||||
# We need a temporary directory
|
|
||||||
if test z"$RECOLL_TMPDIR" != z; then
|
|
||||||
ttdir=$RECOLL_TMPDIR
|
|
||||||
elif test z"$TMPDIR" != z ; then
|
|
||||||
ttdir=$TMPDIR
|
|
||||||
else
|
|
||||||
ttdir=/tmp
|
|
||||||
fi
|
|
||||||
tmpdir=$ttdir/rclsoff_tmp$$
|
|
||||||
mkdir $tmpdir || exit 1
|
|
||||||
mkdir $tmpdir/rclsofftmp || exit 1
|
|
||||||
|
|
||||||
cleanup()
|
|
||||||
{
|
|
||||||
# Note that we're using a constant part (rclsofftmp), which hopefully
|
|
||||||
# guarantees that we can't make big mistakes here.
|
|
||||||
rm -rf $tmpdir/rclsofftmp
|
|
||||||
rmdir $tmpdir
|
|
||||||
}
|
|
||||||
|
|
||||||
trap cleanup EXIT HUP QUIT INT TERM
|
|
||||||
|
|
||||||
# Unzip the input file and change to the unzipped directory
|
|
||||||
unzip -q -d $tmpdir/rclsofftmp "$infile"
|
|
||||||
cd $tmpdir/rclsofftmp
|
|
||||||
|
|
||||||
echo '<html><head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
|
|
||||||
|
|
||||||
xsltproc --novalid --nonet - meta.xml <<EOF
|
|
||||||
<?xml version="1.0"?>
|
|
||||||
<xsl:stylesheet version="1.0"
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
|
||||||
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
|
||||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
||||||
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
|
||||||
xmlns:ooo="http://openoffice.org/2004/office"
|
|
||||||
exclude-result-prefixes="office xlink meta ooo dc"
|
|
||||||
>
|
|
||||||
|
|
||||||
<xsl:output method="html" encoding="UTF-8"/>
|
|
||||||
|
|
||||||
<xsl:template match="/office:document-meta">
|
|
||||||
<xsl:apply-templates select="office:meta/dc:description"/>
|
|
||||||
<xsl:apply-templates select="office:meta/dc:subject"/>
|
|
||||||
<xsl:apply-templates select="office:meta/dc:title"/>
|
|
||||||
<xsl:apply-templates select="office:meta/meta:keyword"/>
|
|
||||||
<xsl:apply-templates select="office:meta/dc:creator"/>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:title">
|
|
||||||
<title> <xsl:value-of select="."/> </title><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:description">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">abstract</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:subject">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:creator">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">author</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="meta:keyword">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
</xsl:stylesheet>
|
|
||||||
EOF
|
|
||||||
|
|
||||||
echo '</head><body>'
|
|
||||||
|
|
||||||
xsltproc --novalid --nonet - content.xml <<EOF
|
|
||||||
<?xml version="1.0"?>
|
|
||||||
<xsl:stylesheet version="1.0"
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
|
||||||
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
|
||||||
exclude-result-prefixes="text"
|
|
||||||
>
|
|
||||||
|
|
||||||
<xsl:output method="html" encoding="UTF-8"/>
|
|
||||||
|
|
||||||
<xsl:template match="text:p">
|
|
||||||
<p><xsl:apply-templates/></p><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="text:h">
|
|
||||||
<p><xsl:apply-templates/></p><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="text:s">
|
|
||||||
<xsl:text> </xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="text:line-break">
|
|
||||||
<br />
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="text:tab">
|
|
||||||
<xsl:text> </xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
</xsl:stylesheet>
|
|
||||||
EOF
|
|
||||||
echo '</body></html>'
|
|
||||||
cd /
|
|
||||||
exit 0
|
|
||||||
@ -1,161 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
#================================================================
|
|
||||||
# Extract text from a Scalable Vector Graphics file
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclsvg"
|
|
||||||
filetype=svg
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds xsltproc
|
|
||||||
|
|
||||||
xsltproc --novalid --nonet - "$infile" <<EOF
|
|
||||||
<?xml version="1.0"?>
|
|
||||||
<xsl:stylesheet version="1.0"
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
|
||||||
xmlns:svg="http://www.w3.org/2000/svg"
|
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
||||||
exclude-result-prefixes="svg"
|
|
||||||
>
|
|
||||||
|
|
||||||
<xsl:output method="html" encoding="UTF-8"/>
|
|
||||||
|
|
||||||
<xsl:template match="/">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<xsl:apply-templates select="svg:svg/svg:title"/>
|
|
||||||
<xsl:apply-templates select="svg:svg/svg:desc"/>
|
|
||||||
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:creator"/>
|
|
||||||
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:subject"/>
|
|
||||||
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:description"/>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<xsl:apply-templates select="//svg:text"/>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="svg:desc">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:creator">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">author</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:subject">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="dc:description">
|
|
||||||
<meta>
|
|
||||||
<xsl:attribute name="name">description</xsl:attribute>
|
|
||||||
<xsl:attribute name="content">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</xsl:attribute>
|
|
||||||
</meta><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="svg:title">
|
|
||||||
<title><xsl:value-of select="."/></title><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="svg:text">
|
|
||||||
<p><xsl:value-of select="."/></p><xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
</xsl:stylesheet>
|
|
||||||
EOF
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
@ -116,7 +116,7 @@ class SVGExtractor:
|
|||||||
self.em.rclog("%s: bad data: " % (fn, err))
|
self.em.rclog("%s: bad data: " % (fn, err))
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
return (True, docdata, "", rclexecm.RclExecM.eofnow)
|
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
###### File type handler api, used by rclexecm ---------->
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
|
|||||||
@ -1,91 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rcltext,v 1.1 2008-09-12 11:30:03 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# Parts taken from Estraier:
|
|
||||||
#================================================================
|
|
||||||
# Estraier: a personal full-text search system
|
|
||||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
|
||||||
#================================================================
|
|
||||||
#================================================================
|
|
||||||
# Wrap generic text (ie: program text) in html
|
|
||||||
# Assumes ascii or iso-8859-1
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rcltext"
|
|
||||||
filetype=text
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds sed
|
|
||||||
echo '<html><head><title></title></head><body><pre>'
|
|
||||||
|
|
||||||
sed -e 's/\&/\&amp;/g' -e 's/</\&lt;/g' "$infile"
|
|
||||||
|
|
||||||
echo '</pre></body></html>'
|
|
||||||
@ -1,116 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# This program is free software; you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License as published by
|
|
||||||
# the Free Software Foundation; either version 2 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU General Public License for more details.
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU General Public License
|
|
||||||
# along with this program; if not, write to the
|
|
||||||
# Free Software Foundation, Inc.,
|
|
||||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
||||||
|
|
||||||
#================================================================
|
|
||||||
# Handle excel files for recoll.
|
|
||||||
#================================================================
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclxls"
|
|
||||||
filetype=excel
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
top=`dirname $0`
|
|
||||||
XLSDUMP="$top/xls-dump.py"
|
|
||||||
XMLTOCSV="$top/xlsxmltocsv.py"
|
|
||||||
|
|
||||||
checkcmds $XLSDUMP $XLSTOCSV
|
|
||||||
|
|
||||||
# output the result
|
|
||||||
echo '<html><head>'
|
|
||||||
#echo '<title>' "$title" '</title>'
|
|
||||||
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
|
|
||||||
echo '</head><body>'
|
|
||||||
echo '<pre>'
|
|
||||||
|
|
||||||
$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
|
|
||||||
$XMLTOCSV | \
|
|
||||||
sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
|
|
||||||
|
|
||||||
echo '</pre>'
|
|
||||||
echo '</body></html>'
|
|
||||||
|
|
||||||
# exit normally
|
|
||||||
exit 0
|
|
||||||
@ -8,7 +8,6 @@ import sys
|
|||||||
import os
|
import os
|
||||||
import xml.sax
|
import xml.sax
|
||||||
|
|
||||||
# Processing the output from unrtf
|
|
||||||
class XLSProcessData:
|
class XLSProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
@ -16,8 +15,6 @@ class XLSProcessData:
|
|||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
self.xmldata = ""
|
self.xmldata = ""
|
||||||
|
|
||||||
# Some versions of unrtf put out a garbled charset line.
|
|
||||||
# Apart from this, we pass the data untouched.
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += '''<html><head>''' + \
|
self.out += '''<html><head>''' + \
|
||||||
|
|||||||
@ -1,119 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
#================================================================
|
|
||||||
# Extract text from a generic XML file (Justus Piater)
|
|
||||||
#================================================================
|
|
||||||
|
|
||||||
# set variables
|
|
||||||
LANG=C ; export LANG
|
|
||||||
LC_ALL=C ; export LC_ALL
|
|
||||||
progname="rclxml"
|
|
||||||
filetype=xml
|
|
||||||
|
|
||||||
|
|
||||||
#RECFILTCOMMONCODE
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the previous line unmodified!! Code imported from the
|
|
||||||
# recfiltcommon file
|
|
||||||
|
|
||||||
# Utility code common to all shell filters. This could be sourced at run
|
|
||||||
# time, but it's slightly more efficient to include the code in the
|
|
||||||
# filters at build time (with a sed script).
|
|
||||||
|
|
||||||
# Describe error in a way that can be interpreted by our caller
|
|
||||||
senderror()
|
|
||||||
{
|
|
||||||
echo RECFILTERROR $*
|
|
||||||
# Also alert on stderr just in case
|
|
||||||
echo ":2:$progname::: $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
iscmd()
|
|
||||||
{
|
|
||||||
cmd=$1
|
|
||||||
case $cmd in
|
|
||||||
*/*)
|
|
||||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
|
||||||
*)
|
|
||||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
|
||||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
|
||||||
return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
checkcmds()
|
|
||||||
{
|
|
||||||
for cmd in $*;do
|
|
||||||
if iscmd $cmd
|
|
||||||
then
|
|
||||||
a=1
|
|
||||||
else
|
|
||||||
senderror HELPERNOTFOUND $cmd
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# show help message
|
|
||||||
if test $# -ne 1 -o "$1" = "--help"
|
|
||||||
then
|
|
||||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
|
||||||
echo "Usage: $progname [infile]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
infile="$1"
|
|
||||||
|
|
||||||
# check the input file existence (may be '-' for stdin)
|
|
||||||
if test "X$infile" != X- -a ! -f "$infile"
|
|
||||||
then
|
|
||||||
senderror INPUTNOSUCHFILE "$infile"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# protect access to our temp files and directories
|
|
||||||
umask 77
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# !! Leave the following line unmodified !
|
|
||||||
#ENDRECFILTCOMMONCODE
|
|
||||||
|
|
||||||
checkcmds xsltproc
|
|
||||||
|
|
||||||
xsltproc --novalid --nonet - "$infile" <<EOF
|
|
||||||
<?xml version="1.0"?>
|
|
||||||
<xsl:stylesheet version="1.0"
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
|
||||||
|
|
||||||
<xsl:output method="html" encoding="UTF-8"/>
|
|
||||||
|
|
||||||
<xsl:template match="/">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<xsl:if test="//*[local-name() = 'title']">
|
|
||||||
<title>
|
|
||||||
<xsl:value-of select="//*[local-name() = 'title'][1]"/>
|
|
||||||
</title>
|
|
||||||
</xsl:if>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<xsl:apply-templates/>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="text()">
|
|
||||||
<xsl:if test="string-length(normalize-space(.)) > 0">
|
|
||||||
<p><xsl:value-of select="."/></p>
|
|
||||||
<xsl:text>
|
|
||||||
</xsl:text>
|
|
||||||
</xsl:if>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
<xsl:template match="*">
|
|
||||||
<xsl:apply-templates/>
|
|
||||||
</xsl:template>
|
|
||||||
|
|
||||||
</xsl:stylesheet>
|
|
||||||
EOF
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
@ -74,7 +74,7 @@ class XMLExtractor:
|
|||||||
self.em.rclog("%s: bad data: " % (fn, err))
|
self.em.rclog("%s: bad data: " % (fn, err))
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
return (True, docdata, "", rclexecm.RclExecM.eofnow)
|
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
###### File type handler api, used by rclexecm ---------->
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
|
|||||||
@ -51,7 +51,7 @@ application/javascript = internal text/plain
|
|||||||
# - with unrtf: rtf files disguising as doc files.
|
# - with unrtf: rtf files disguising as doc files.
|
||||||
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
||||||
# important for you than catching all data,
|
# important for you than catching all data,
|
||||||
application/msword = exec rcldoc
|
application/msword = execm rcldoc.py
|
||||||
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
|
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
|
||||||
# You can also use wvware directly but it's much slower.
|
# You can also use wvware directly but it's much slower.
|
||||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||||
@ -59,41 +59,40 @@ application/msword = exec rcldoc
|
|||||||
# Also Handle the mime type returned by "file -i" for a suffix-less word
|
# Also Handle the mime type returned by "file -i" for a suffix-less word
|
||||||
# file. This could probably just as well be an excel file, but we have to
|
# file. This could probably just as well be an excel file, but we have to
|
||||||
# choose one.
|
# choose one.
|
||||||
application/vnd.ms-office = exec rcldoc
|
application/vnd.ms-office = execm rcldoc.py
|
||||||
|
|
||||||
application/ogg = execm rclaudio
|
application/ogg = execm rclaudio
|
||||||
application/pdf = exec rclpdf
|
application/pdf = execm rclmpdf.py
|
||||||
# application/pdf = execm rclmpdf.py
|
|
||||||
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
||||||
application/vnd.ms-excel = exec rclxls
|
application/vnd.ms-excel = execm rclxls.py
|
||||||
application/vnd.ms-powerpoint = exec rclppt
|
application/vnd.ms-powerpoint = execm rclppt.py
|
||||||
application/vnd.oasis.opendocument.text = exec rclsoff
|
application/vnd.oasis.opendocument.text = execm rclsoff.py
|
||||||
application/vnd.oasis.opendocument.text-template = exec rclsoff
|
application/vnd.oasis.opendocument.text-template = execm rclsoff.py
|
||||||
application/vnd.oasis.opendocument.presentation = exec rclsoff
|
application/vnd.oasis.opendocument.presentation = execm rclsoff.py
|
||||||
application/vnd.oasis.opendocument.spreadsheet = exec rclsoff
|
application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py
|
||||||
application/vnd.oasis.opendocument.graphics = exec rclsoff
|
application/vnd.oasis.opendocument.graphics = execm rclsoff.py
|
||||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||||
exec rclopxml
|
execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
||||||
exec rclopxml
|
execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||||
exec rclopxml
|
execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||||
exec rclopxml
|
execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||||
exec rclopxml
|
execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
|
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
|
||||||
exec rclopxml
|
execm rclopxml.py
|
||||||
application/vnd.sun.xml.calc = exec rclsoff
|
application/vnd.sun.xml.calc = execm rclsoff.py
|
||||||
application/vnd.sun.xml.calc.template = exec rclsoff
|
application/vnd.sun.xml.calc.template = execm rclsoff.py
|
||||||
application/vnd.sun.xml.draw = exec rclsoff
|
application/vnd.sun.xml.draw = execm rclsoff.py
|
||||||
application/vnd.sun.xml.draw.template = exec rclsoff
|
application/vnd.sun.xml.draw.template = execm rclsoff.py
|
||||||
application/vnd.sun.xml.impress = exec rclsoff
|
application/vnd.sun.xml.impress = execm rclsoff.py
|
||||||
application/vnd.sun.xml.impress.template = exec rclsoff
|
application/vnd.sun.xml.impress.template = execm rclsoff.py
|
||||||
application/vnd.sun.xml.math = exec rclsoff
|
application/vnd.sun.xml.math = execm rclsoff.py
|
||||||
application/vnd.sun.xml.writer = exec rclsoff
|
application/vnd.sun.xml.writer = execm rclsoff.py
|
||||||
application/vnd.sun.xml.writer.global = exec rclsoff
|
application/vnd.sun.xml.writer.global = execm rclsoff.py
|
||||||
application/vnd.sun.xml.writer.template = exec rclsoff
|
application/vnd.sun.xml.writer.template = execm rclsoff.py
|
||||||
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
|
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
|
||||||
application/x-abiword = exec rclabw
|
application/x-abiword = exec rclabw
|
||||||
application/x-awk = internal text/plain
|
application/x-awk = internal text/plain
|
||||||
@ -101,7 +100,7 @@ application/x-chm = execm rclchm
|
|||||||
application/x-dia-diagram = execm rcldia;mimetype=text/plain
|
application/x-dia-diagram = execm rcldia;mimetype=text/plain
|
||||||
application/x-dvi = exec rcldvi
|
application/x-dvi = exec rcldvi
|
||||||
application/x-flac = execm rclaudio
|
application/x-flac = execm rclaudio
|
||||||
application/x-gnote = exec rclxml
|
application/x-gnote = execm rclxml.py
|
||||||
application/x-gnuinfo = execm rclinfo
|
application/x-gnuinfo = execm rclinfo
|
||||||
application/x-gnumeric = exec rclgnm
|
application/x-gnumeric = exec rclgnm
|
||||||
application/x-kword = exec rclkwd
|
application/x-kword = exec rclkwd
|
||||||
@ -124,14 +123,14 @@ audio/mpeg = execm rclaudio
|
|||||||
audio/mp4 = execm rclaudio
|
audio/mp4 = execm rclaudio
|
||||||
audio/aac = execm rclaudio
|
audio/aac = execm rclaudio
|
||||||
audio/x-karaoke = execm rclkar
|
audio/x-karaoke = execm rclkar
|
||||||
image/gif = execm rclimg
|
image/gif = execm rclimg.py
|
||||||
image/jp2 = execm rclimg
|
image/jp2 = execm rclimg.py
|
||||||
image/jpeg = execm rclimg
|
image/jpeg = execm rclimg.py
|
||||||
image/png = execm rclimg
|
image/png = execm rclimg.py
|
||||||
image/tiff = execm rclimg
|
image/tiff = execm rclimg.py
|
||||||
image/vnd.djvu = exec rcldjvu
|
image/vnd.djvu = exec rcldjvu
|
||||||
image/svg+xml = exec rclsvg
|
image/svg+xml = execm rclsvg.py
|
||||||
image/x-xcf = execm rclimg
|
image/x-xcf = execm rclimg.py
|
||||||
inode/symlink = internal
|
inode/symlink = internal
|
||||||
application/x-zerosize = internal
|
application/x-zerosize = internal
|
||||||
inode/x-empty = internal application/x-zerosize
|
inode/x-empty = internal application/x-zerosize
|
||||||
@ -159,9 +158,8 @@ text/x-python = exec rclpython
|
|||||||
text/x-shellscript = internal text/plain
|
text/x-shellscript = internal text/plain
|
||||||
text/x-srt = internal text/plain
|
text/x-srt = internal text/plain
|
||||||
text/x-tex = exec rcltex
|
text/x-tex = exec rcltex
|
||||||
|
application/xml = execm rclxml.py
|
||||||
application/xml = exec rclxml
|
text/xml = execm rclxml.py
|
||||||
text/xml = exec rclxml
|
|
||||||
# Using these instead of the two above would index all parameter and tag
|
# Using these instead of the two above would index all parameter and tag
|
||||||
# names, attribute values etc, instead of just the text content.
|
# names, attribute values etc, instead of just the text content.
|
||||||
#application/xml = internal text/plain
|
#application/xml = internal text/plain
|
||||||
|
|||||||
@ -135,7 +135,8 @@ message/rfc822 = internal
|
|||||||
text/calendar = execm python rclics;mimetype=text/plain
|
text/calendar = execm python rclics;mimetype=text/plain
|
||||||
text/html = internal
|
text/html = internal
|
||||||
text/plain = internal
|
text/plain = internal
|
||||||
text/rtf = execm python rclrtf.py
|
text/rtf = exec unrtf --nopict --html;mimetype=text/html
|
||||||
|
#text/rtf = execm python rclrtf.py
|
||||||
text/x-c = internal
|
text/x-c = internal
|
||||||
text/x-c++ = internal
|
text/x-c++ = internal
|
||||||
text/x-c+ = internal
|
text/x-c+ = internal
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user