diff --git a/src/filters/rcllyx b/src/filters/rcllyx new file mode 100755 index 00000000..a33f1ff0 --- /dev/null +++ b/src/filters/rcllyx @@ -0,0 +1,195 @@ +#!/bin/sh +# @(#$Id: rcllyx,v 1.1 2007-01-23 07:14:16 dockes Exp $ (C) 2004 J.F.Dockes +# There may still be code from Estraier in here: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rcllyx +# Convert a lyx file to recoll HTML. +# +# We use lyx --export. It was suggested to use untex, but it doesn't give +# good results on raw lyx (of course, this is not TeX), and exporting to +# LaTex then using untex doesn't look nice when we can use the native lyx +# text export. +# The character encoding of the exported text is defined by the +# \inputencoding directive in the lyx file header and, in quite an obscure +# way, by the \language parameter. We use a heuristic to divine the output +# text encoding and it is guaranteed not to work in all cases. Trials using +# an intermediary dvi, pdf or ps file gave worse results. This needs +# improvement. It doesn't even take into account the fact that the language +# can change inside the doc (does this change the encoding or not ?). To be +# frank, this is not entirely my fault, the lyx format is a joke. +# +# As there is unfortunately no way to define the output file name, we have +# to use a temporary directory and link the input file in there. + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rcllyx" + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Extract lyx text as basic HTML.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} + +checkcmds lyx iconv + +# We need a temporary directory +if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi + +tmpdir=$ttdir/rcllyx_tmp$$ +mkdir $tmpdir || exit 1 +mkdir $tmpdir/rcllyxtmp || exit 1 + +cleanup() +{ + # Note that we're using a constant part (rcllyxtmp), that hopefully + # guarantees that we can't do big mistakes here. + rm -rf $tmpdir/rcllyxtmp + rmdir $tmpdir +} + +#trap cleanup EXIT HUP QUIT INT TERM + +workdir=$tmpdir/rcllyxtmp +case "$infile" in + */*) ;; + *) infile=`pwd`/$infile;; +esac + +binfile=`basename $infile` +ln -s "$infile" "$workdir/$binfile" || exit 1 +lyxfile=$workdir/$binfile +textfile=$workdir/`basename $binfile .lyx`.txt + +#echo binfile: $binfile;echo lyxfile: $lyxfile ; ls -l $lyxfile; echo textfile: $textfile + +# Run lyx --export +lyx --export text $lyxfile + +# Charset and language +formatline=`egrep '^\\\lyxformat ' $lyxfile` +if test -n "$formatline" ; then + set $formatline + format=$2 +fi +charsetline=`egrep '^\\\inputencoding ' $lyxfile` +if test -n "$charsetline" ; then + set $charsetline + charset=$2 +fi +langline=`egrep '^\\\language ' $lyxfile` +if test -n "$langline" ; then + set $langline + lang=$2 +fi +#echo format: [$format] charset: [$charset] lang [$lang] + +if test "$format" -ge 249 ; then + charset=utf-8 +else + # try to guess the charset from the language: this is in no way guaranteed + # to work, the logic has built-in inconsistencies even beyond the numerous + # external ones (what if the ukrainian writer prefers koi8-r ?). This is a + # joke. + if test -n "$charset" -o "$charset" = default -o "$charset" = auto ; then + case "$lang" in + american|afrikaans|basque|catalan|danish|dutch|english|faeroese|finnish|french|galician|german|icelandic|irish|italian|norwegian|portuguese|spanish|swedish) + charset=iso-8859-1;; + czech|german|hungarian|polish|romanian|croatian|slovak|slovene) + charset=iso-8859-2;; + esperanto|galician|maltese|Turkish) + charset=iso-8859-3;; + estonian|latvian|lithuanian) + charset=iso-8859-4;; + bulgarian|byelorussian|macedonian|russian|serbian|ukrainian) + charset=iso-8859-5;; + arabic) + charset=iso-8859-6;; + greek) + charset=iso-8859-7;; + hebrew) + charset=iso-8859-8;; + #ISO-8859-9 - Latin 5 Same as 8859-1 except for Turkish instead of + #Icelandic. ? What is one to do :) + #ISO-8859-10 - Latin 6 + lappish|nordic|eskimo|inuit|sami) + charset=iso-8859-10;; + albanian|german|english|basque|breton|catalan|danish|spanish|estonian|esthonian|faeroese|faroese|finnish|french|frisian|friesian|scottish|goidelic|irish|gaelic|galician|welsh|greenlandic|inuit|icelandic|italian|latin|dutch|norvegian|portuguese|romansch|romansh|friulian|ladin|swedish) + charset=iso-8859-15;; + *) + charset=iso-8859-1;; + esac + fi +fi + +if test -n "$charset" ; then + inputcmd="iconv -f $charset -t UTF-8 -c -s" +else + inputcmd=cat +fi +#echo inputcmd: [$inputcmd] + +cat < + + $title + + + +
+EOF
+
+$inputcmd < $textfile
+
+cat <
+
+
+EOF