From 87216c22f59d73279d3698b7410c478fbe8b5352 Mon Sep 17 00:00:00 2001
From: dockes
Date: Fri, 3 Feb 2006 10:53:35 +0000
Subject: [PATCH] added dvi and djvu support
---
src/filters/rcldjvu | 129 ++++++++++++++++++++++++++++++++++++++++
src/filters/rcldvi | 58 ++++++++++++++++++
src/filters/rclps | 9 ++-
src/sampleconf/mimeconf | 7 ++-
src/sampleconf/mimemap | 6 +-
5 files changed, 205 insertions(+), 4 deletions(-)
create mode 100755 src/filters/rcldjvu
create mode 100755 src/filters/rcldvi
diff --git a/src/filters/rcldjvu b/src/filters/rcldjvu
new file mode 100755
index 00000000..647c98a7
--- /dev/null
+++ b/src/filters/rcldjvu
@@ -0,0 +1,129 @@
+#!/bin/sh
+# @(#$Id: rcldjvu,v 1.1 2006-02-03 10:53:34 dockes Exp $ (C) 2005 J.F.Dockes
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+#================================================================
+# rcldjvu
+# Extract text from a djvu file by executing djvused and djvutxt
+#
+# We use djvused to extract a possible title, djvutxt for the text
+#
+# Of course, this is only useful if the djvu document actually has
+# a text layer.
+#
+#================================================================
+
+# Export the C locale so that the external tools run below do not depend on
+# the locale settings of the caller.
+LANG=C ; export LANG
+LC_ALL=C ; export LC_ALL
+progname="rcldjvu"
+
+# Show the help message and exit unless called with exactly one argument
+# that is not --help. Two test commands joined by || replace the
+# obsolescent "-o" operator, whose parsing is unspecified by POSIX for more
+# than four arguments (for instance when "$1" is "(" or "!").
+if test $# -ne 1 || test "$1" = "--help"
+then
+  printf 'Convert a djvu file to HTML text for recoll indexation.\n'
+  printf 'Usage: %s [infile]\n' "$progname"
+  exit 1
+fi
+
+infile="$1"
+
+# iscmd CMD
+# Return 0 if CMD names an executable regular file, 1 otherwise.
+# A CMD containing a slash is tested as given; any other CMD is searched
+# for in each directory listed in PATH.
+# Sets the global variables cmd, oldifs and d (plain sh has no "local").
+iscmd()
+{
+  cmd=$1
+  case $cmd in
+  */*)
+    # -f rejects directories, which usually carry the x bit too
+    if test -f "$cmd" && test -x "$cmd" ; then return 0; else return 1; fi ;;
+  *)
+    # Split PATH on ":" into the positional parameters of the function
+    oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
+    # "$@" keeps each directory intact even if it contains blanks
+    for d in "$@"; do
+      # An empty PATH element stands for the current directory
+      test -f "${d:-.}/$cmd" && test -x "${d:-.}/$cmd" && return 0
+    done
+    return 1 ;;
+  esac
+}
+# checkcmds CMD...
+# Exit the script with status 1, after a message on stderr, as soon as one
+# of the given commands is not found by iscmd. Return normally otherwise.
+checkcmds()
+{
+  for cmd in "$@"; do
+    iscmd "$cmd" || {
+      printf '%s not found\n' "$cmd" 1>&2
+      exit 1
+    }
+  done
+}
+# The external helper programs must be available before doing any work
+checkcmds djvutxt djvused awk
+
+# Check the input file existence. The diagnostic goes to stderr, like the
+# one printed by checkcmds, because stdout carries the converted document.
+if test ! -f "$infile"
+then
+  printf '%s: %s: no such file\n' "$progname" "$infile" 1>&2
+  exit 1
+fi
+
+# Title: we try to extract it from the annotations. djvused outputs strings
+# in C/awk \-escaped notation. Awk only decodes that notation in string
+# constants, so a first awk pass generates an awk program holding the title
+# as a constant, and a second awk runs that program to print the decoded
+# value.
+#
+# The annotation text comes from the document, so it is untrusted: the
+# first pass only accepts a line that is exactly one well-formed
+# double-quoted string constant (optionally followed by blanks). Anything
+# else would be run as awk code by the second pass, so it is dropped and
+# the title stays empty. The decoded title is finally HTML-escaped because
+# it is inserted into the HTML output.
+#
+# $(...) is used instead of backquotes so that the backslashes in the awk
+# and sed programs reach those tools unmodified.
+title=$(djvused "$infile" -e 'select 1;output-ant' | \
+grep ' (title ' | sed -e 's/^.* (title //' -e 's/)$//' |\
+awk '
+/^"([^"\\]|\\.)*"[ \t]*$/ {
+  printf("BEGIN {s = %s; print s}\n", $0)
+}' | awk -f - | \
+sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')
+
+
+# Start of the HTML document. The here-document delimiter is unquoted so
+# that $title is expanded.
+# NOTE(review): the "<<EOF" operator and the HTML tags were missing here
+# and have been reconstructed around $title; the charset assumes that
+# djvutxt writes UTF-8 text. Confirm both against the upstream filter.
+cat <<EOF
+<html><head>
+<title>$title</title>
+<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+</head>
+<body>
+<p>
+EOF
+
+# Body: dump the text layer, strip trailing spaces and tabs from each line,
+# then let awk rejoin words hyphenated across line ends and print the text
+# with the HTML special characters escaped.
+# NOTE(review): the page break markup and the entity names in the awk
+# program were damaged and have been reconstructed; confirm against the
+# upstream filter.
+djvutxt "$infile" | sed -e 's/[[:blank:]][[:blank:]]*$//' | \
+awk '
+# Return s with the HTML special characters replaced by entities. "&" is
+# handled first so that the entities produced for "<" and ">" are not
+# escaped a second time.
+function htmlescape(s)
+{
+  gsub(/&/, "\\&amp;", s)
+  gsub(/</, "\\&lt;", s)
+  gsub(/>/, "\\&gt;", s)
+  return s
+}
+BEGIN {
+  # Word fragment held back from a line that ended with a hyphen
+  cont = ""
+}
+{
+  if ($0 == "\f") {
+    # A lone form feed marks a page break. A pending fragment is kept so
+    # that a word split across two pages is still rejoined.
+    print "</p>\n<hr>\n<p>"
+    next
+  }
+
+  $0 = cont $0
+  cont = ""
+
+  if ($0 ~ /-$/) {
+    if (match($0, /[ \t][^ \t]+$/)) {
+      # Break at last whitespace: the last word moves to the next line
+      cont = substr($0, RSTART)
+      $0 = substr($0, 1, RSTART - 1)
+      # Drop only the hyphen that ended the line, not the inner ones
+      sub(/-$/, "", cont)
+    } else {
+      # No whitespace at all: the whole line is the word fragment
+      cont = $0
+      sub(/-$/, "", cont)
+      next
+    }
+  }
+  print htmlescape($0)
+}
+END {
+  # Flush a fragment left over from a hyphen on the very last line
+  if (cont != "")
+    print htmlescape(cont)
+}'
+
+cat <
+
+