diff --git a/src/filters/rcldjvu b/src/filters/rcldjvu new file mode 100755 index 00000000..647c98a7 --- /dev/null +++ b/src/filters/rcldjvu @@ -0,0 +1,129 @@ +#!/bin/sh +# @(#$Id: rcldjvu,v 1.1 2006-02-03 10:53:34 dockes Exp $ (C) 2005 J.F.Dockes + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +#================================================================ +# rcldjvu +# Extract text from a djvu file by executing djvused and djvutxt +# +# We use djvused to extract a possible title, djvutxt for the text +# +# Of course this only means anything if the djvu document actually has +# a text layer ! +# +#================================================================ + +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rcldjvu" + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Convert a djvu file to HTML text for recoll indexation.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds djvutxt djvused awk + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# Title: we try to extract it from the annotations. djvused outputs string +# in C/awk \-escaped notation. Awk can only process this in string +# constants, so we have a first awk pass to create an awk program to parse +# the string as a constant (...). This is not exactly robust or nice +title=`djvused $infile -e 'select 1;output-ant' | \ +grep ' (title ' | sed -e 's/^.* (title //' -e 's/)$//' |\ +awk ' +{ + printf("BEGIN {s = %s; print s}\n", $0) +}' | awk -f -` + + +cat < + + $title + + + +
+EOF
+
+djvutxt $infile | sed -e 's/[ 	][ 	]*$//' | \
+awk '
+BEGIN {
+  cont = ""
+}
+{
+    $0 = cont $0
+    cont = ""
+
+    if ($0 == "\f") {
+       print "

\n
\n

" + next + } else if ($0 ~ /[-]$/) { + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH) + $0 = line + gsub("-", "", cont) + } + gsub(/&/, "\\&", $0) + gsub(//, "\\>", $0) + print $0 +}' + +cat < + + +EOF diff --git a/src/filters/rcldvi b/src/filters/rcldvi new file mode 100755 index 00000000..f2655ce8 --- /dev/null +++ b/src/filters/rcldvi @@ -0,0 +1,58 @@ +#!/bin/sh +# @(#$Id: rcldvi,v 1.1 2006-02-03 10:53:34 dockes Exp $ (C) 2004 J.F.Dockes +#================================================================ +# rcldvi +# Extract text from a dvi file by executing dvitops and rclps +# +#================================================================ + +# Show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Convert a dvi file to unformatted HTML text for recoll indexation.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +decoder=dvips + +# Find rclps. Note: this only works because we are always executed with a +# full path +infile="$1" +rclps=`dirname $0`/rclps + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds $decoder + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# output the result +$decoder -f < "$infile" 2> /dev/null | $rclps - diff --git a/src/filters/rclps b/src/filters/rclps index ffde92d7..0d6860c9 100755 --- a/src/filters/rclps +++ b/src/filters/rclps @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclps,v 1.6 2006-02-03 10:53:35 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -34,6 +34,11 @@ then fi infile="$1" +if test X$infile = X- ; then + cmd=$decoder +else + cmd="$decoder $1" +fi iscmd() { @@ -62,7 +67,7 @@ checkcmds() checkcmds $decoder iconv awk # check the input file existence -if test ! -f "$infile" +if test X$infile != X- -a ! -f "$infile" then printf '%s: %s: no such file\n' "$progname" "$infile" exit 1 diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index c745bd68..814deab3 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -1,4 +1,4 @@ -# @(#$Id: mimeconf,v 1.12 2005-11-30 09:46:48 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeconf,v 1.13 2006-02-03 10:53:35 dockes Exp $ (C) 2004 J.F.Dockes # Recoll : associations of mime types to processing filters. # There are different sections for decompression, 'interning' for indexing @@ -38,6 +38,9 @@ application/vnd.sun.xml.math = exec rclsoff application/vnd.sun.xml.writer = exec rclsoff application/vnd.sun.xml.writer.global = exec rclsoff application/vnd.sun.xml.writer.template = exec rclsoff +application/x-dvi = exec rcldvi + +image/vnd.djvu = exec rcldjvu message/rfc822 = internal @@ -66,6 +69,8 @@ application/vnd.sun.xml.math = openoffice %f application/vnd.sun.xml.writer = openoffice %f application/vnd.sun.xml.writer.global = openoffice %f application/vnd.sun.xml.writer.template = openoffice %f +image/vnd.djvu = djview %f +application/x-dvi = xdvi %f # Icons to be used in the result list. [icons] diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index 08bd9df1..9c07da7a 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -1,4 +1,4 @@ -# @(#$Id: mimemap,v 1.10 2005-11-21 16:05:07 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimemap,v 1.11 2006-02-03 10:53:35 dockes Exp $ (C) 2004 J.F.Dockes # Recoll: associations of file name extensions to mime types .txt = text/plain @@ -18,6 +18,10 @@ .eps = application/postscript .ai = application/postscript +.dvi = application/x-dvi + +.djvu = image/vnd.djvu + .gz = application/x-gzip .Z = application/x-gzip .bz2 = application/x-bzip2