recoll/src/filters/rcldoc

109 lines
2.3 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/sh
# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# rcldoc
# Extract text from an MS Word file by running antiword and wrapping its
# output in minimal HTML (wvWare could be used instead if we need it one day).
#
# The default is to use antiword; the code would need modifications to
# work with wvWare.
#
#================================================================
# set variables
# Everything below runs in the C locale.
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rcldoc"
# Decoder command line. It is expanded unquoted where it is run, so that it
# is split into the command name and its options.
decoder="antiword -t -i 1 -m UTF-8"
# Not ready to use this for now (it outputs html, so the code below has to
# be simplified.)
#decoder="wvWare -1 -c UTF-8"
# show help message
# Exactly one argument is required, and it must not be --help.
# Two separate tests joined by || are used instead of "test ... -o ...":
# the -o operator is obsolescent and its parsing is unspecified when "$1"
# happens to look like a test operator.
if test $# -ne 1 || test "$1" = "--help"
then
    printf 'Convert a word file to unformatted HTML text.\n'
    printf 'Usage: %s [infile]\n' "$progname"
    exit 1
fi
infile="$1"
# iscmd CMD
# Return 0 if CMD designates an executable regular file, 1 otherwise.
# A CMD containing a slash is tested as a path name. Any other CMD is looked
# up in each directory listed in $PATH; an empty entry stands for the
# current directory.
iscmd()
{
    case $1 in
    */*)
        # Directories also pass "test -x", hence the additional -f.
        if test -f "$1" && test -x "$1"; then return 0; else return 1; fi ;;
    esac
    # Walk $PATH with parameter expansion rather than IFS splitting: IFS and
    # the positional parameters stay untouched, and directory names holding
    # blanks or glob characters are taken literally.
    # The appended ":" guarantees that every entry is colon-terminated.
    _iscmd_rest=$PATH:
    while test -n "$_iscmd_rest"; do
        _iscmd_dir=${_iscmd_rest%%:*}
        _iscmd_rest=${_iscmd_rest#*:}
        test -n "$_iscmd_dir" || _iscmd_dir=.
        if test -f "$_iscmd_dir/$1" && test -x "$_iscmd_dir/$1"; then
            return 0
        fi
    done
    return 1
}
# checkcmds CMD...
# Check each CMD with iscmd, in order. On the first one that iscmd rejects,
# print "CMD not found" on stderr and terminate the script with status 1.
# Returns normally when every CMD is accepted (or when none is given).
checkcmds()
{
    # "$@" keeps each argument as one word, whatever characters it holds.
    for cmd in "$@"; do
        if ! iscmd "$cmd"; then
            printf '%s not found\n' "$cmd" 1>&2
            exit 1
        fi
    done
}
# All three external tools used by the conversion pipeline must be available.
checkcmds awk antiword iconv
# check the input file existence
if test ! -f "$infile"
then
    # Diagnostics go to stderr, like those of checkcmds: stdout is reserved
    # for the converted document.
    printf '%s: %s: no such file\n' "$progname" "$infile" 1>&2
    exit 1
fi
# output the result

# rcldoc_text2html
# Read plain UTF-8 text on stdin and write a minimal HTML document on stdout:
#  - every input line becomes one "<br>"-terminated line, with &, < and >
#    turned into entities;
#  - a line made of a single form feed becomes a "</p><hr><p>" page break;
#  - when a line ends with "-" or with a soft hyphen (U+00AD), its last word,
#    minus the hyphen, is moved to the front of the next line.
# awk is run in the C locale so that substr/length count bytes; the soft
# hyphen is therefore matched as its two UTF-8 bytes (octal 302 255).
rcldoc_text2html()
{
    LC_ALL=C awk '
function html_escape(s) {
    gsub(/&/, "\\&amp;", s)
    gsub(/</, "\\&lt;", s)
    gsub(/>/, "\\&gt;", s)
    return s
}
BEGIN {
    print "<html><head><title></title>"
    print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
    print "</head>\n<body>\n<p>"
    cont = ""
}
{
    line = cont $0
    cont = ""
    # Length in bytes of the trailing hyphen, 0 when there is none.
    # Matching the full 2-byte sequence avoids mistaking other characters
    # whose last byte is 0xAD for a soft hyphen.
    hlen = 0
    if (line ~ /\302\255$/)
        hlen = 2
    else if (line ~ /-$/)
        hlen = 1
    # Break at last whitespace. Without any whitespace there is no word
    # boundary to break at and the line is emitted unchanged.
    if (hlen > 0 && match(line, "[ \t][^ \t]+$")) {
        cont = substr(line, RSTART, RLENGTH - hlen)
        line = substr(line, 1, RSTART - 1)
    }
    if (line == "\f") {
        print "</p><hr><p>"
        next
    }
    print html_escape(line) "<br>"
}
END {
    # Flush a word still held back from a hyphenated last line.
    if (cont != "")
        print html_escape(cont) "<br>"
    print "</p></body></html>"
}'
}

# $decoder is deliberately unquoted: it holds the command and its options.
# The final iconv pass drops any byte sequence that is not valid UTF-8.
$decoder "$infile" | rcldoc_text2html | iconv -f UTF-8 -t UTF-8 -c -s
# exit normally: the status reported here is always 0, whatever the exit
# status of the commands run before this point
exit 0