From 538264db954f3f2704f29250ec6b57caea3eea69 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 4 May 2010 09:06:52 +0200 Subject: [PATCH] added the old filters page which had been forgotten --- website/filters/filters.html | 294 +++++++++++++++++++++++++++++++++++ website/filters/mimeconf | 215 +++++++++++++++++++++++++ website/filters/mimemap | 132 ++++++++++++++++ website/filters/mimeview | 61 ++++++++ website/filters/rclabw | 175 +++++++++++++++++++++ website/filters/rclimg | 95 +++++++++++ website/filters/rclkwd | 204 ++++++++++++++++++++++++ website/filters/rcllyx | 195 +++++++++++++++++++++++ website/filters/rclopxml | 245 +++++++++++++++++++++++++++++ website/filters/rclscribus | 151 ++++++++++++++++++ website/filters/rclsoff | 156 +++++++++++++++++++ website/filters/rclsvg | 143 +++++++++++++++++ website/filters/rcltex | 106 +++++++++++++ website/filters/rclwpd | 87 +++++++++++ 14 files changed, 2259 insertions(+) create mode 100644 website/filters/filters.html create mode 100644 website/filters/mimeconf create mode 100644 website/filters/mimemap create mode 100644 website/filters/mimeview create mode 100755 website/filters/rclabw create mode 100755 website/filters/rclimg create mode 100755 website/filters/rclkwd create mode 100755 website/filters/rcllyx create mode 100755 website/filters/rclopxml create mode 100755 website/filters/rclscribus create mode 100755 website/filters/rclsoff create mode 100755 website/filters/rclsvg create mode 100755 website/filters/rcltex create mode 100755 website/filters/rclwpd diff --git a/website/filters/filters.html b/website/filters/filters.html new file mode 100644 index 00000000..ebdba757 --- /dev/null +++ b/website/filters/filters.html @@ -0,0 +1,294 @@ + + + + + Recoll updated filters + + + + + + + + + + + + + + + + +
+ +

Updated filters for Recoll

+ +

The following describes new and updated filters, which will be + part of the next release, but can be installed on the current + release if you need them.

+ +

For updated filters, you just need to copy the script to the + filters directory, which is typically either /usr/share/recoll/filters or /usr/local/share/recoll/filters.

+ +

For new filters, you'll need to copy the script file as + above, possibly install the supporting application, and usually + edit the + mimemap, + mimeview and + mimeconf files, either in the + shared directory + ( + /usr[/local]/share/recoll/examples), or + in your personal configuration directory + ($HOME/.recoll or + $RECOLL_CONFDIR).

+ +

Alternatively, you can replace your 1.[8,9,10] system files with + these updated and complete versions: + mimemap + mimeconf + mimeview

+ +

Notes:

+
+

All filters are up to date in Recoll 1.10.5

+ +

Recoll 1.10.0: only rclsvg for + Scalable Vector Graphic files is missing.

+ +

Recoll 1.9: all filters are up to date in the release, + except the rclimg image + filter and the rcltex TeX filter.

+ +

Recoll 1.8: The image, kword, + abiword and wordperfect filters can be installed in + addition.

+
+ +

Open XML Office formats

+ +

Filter: rclopxml.

+

This needs xsltproc to be + installed (if you run a decently recent Linux, this is + probably on your system already).

+ +

The filters are certainly not perfect, but extract a good + part of the text, which is probably better than nothing.

+ +

There are quite a few added lines in the configuration + files, just fetch the new ones: + mimemap + mimeconf + mimeview

+ + +

Scalable Vector Graphics filter

+ +

A new filter for SVG files: + rclsvg. + You'll have to add the following lines in the configuration + files:

+ +

In mimemap:

+
.svg = image/svg+xml
+
+

In mimeconf, [index] section:

+
image/svg+xml = exec rclsvg
+

mimeconf, [icons] section:

+
image/svg+xml = drawing
+

mimeconf, [categories] section, also add + image/svg+xml to the other list.

+ +

The filter is based on sed, so + you don't need to install any external application.

+ +

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
    image/svg+xml = inkview %f
+

(Or substitute your favorite editor).

+ + + +

TeX filter

+ +

A new filter for TeX files: + rcltex. + You'll have to add the following lines in the configuration + files:

+ +

In mimemap:

+
.tex = application/x-tex
+
+

In mimeconf, [index] section:

+
    application/x-tex = exec rcltex
+

mimeconf, [icons] section:

+
application/x-tex = wordprocessing
+

mimeconf, [categories] section, also add + application/x-tex to the text list.

+ +

This filter uses either untex + or detex + if the command is available. A copy of the + source code for untex is stored + here

+ +

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
    application/x-tex = gnuclient -q %f
+

(Or substitute your favorite editor).

+ + +

A filter for image tags

+ +

A new filter for extracting tags from image and picture files: + rclimg, by Cedric Scott. It is based on + the Exiftool Perl application and library. + You'll have to add the following lines in the configuration + files:

+ +

In mimemap:

+
.jpeg = image/jpeg
+.gif = image/gif
+.tiff = image/tiff
+.tif  = image/tiff
+
+

In mimeconf, [index] section:

+
image/gif = exec rclimg
+image/jpeg = exec rclimg
+image/png = exec rclimg
+image/tiff = exec rclimg
+      
+

And remove the image/jpeg = exec rcljpeg line.

+ +

Exiftool supports many other image formats, just enter any + additional ones like above.

+ +

Wordperfect filter

+ +

A new filter for Wordperfect files: + rclwpd. + You'll have to add the following lines in the configuration + files:

+ +

In mimemap:

+
.wpd = application/vnd.wordperfect
+
+

In mimeconf, [index] section:

+
    application/vnd.wordperfect = exec rclwpd
+

mimeconf, [icons] section:

+
application/vnd.wordperfect = wordprocessing
+

mimeconf, [categories] section, also add + application/vnd.wordperfect to the text list.

+ +

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
    application/vnd.wordperfect = openoffice %f
+ +

Abiword filter

+ +

A new filter for + abiword files: + rclabw. + You'll have to add the following lines in the configuration + files:

+ +

In mimemap:

+
    .abw = application/x-abiword
+ +

In mimeconf:

+
    application/x-abiword = exec rclabw
+ +

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
    application/x-abiword = abiword %f
+ +

Kword filter

+ +

A new filter for + kword files: + rclkwd. + You'll have to add the following lines in the configuration + files:

+ +

In mimemap:

+
    .kwd = application/x-kword
+

In mimeconf:

+
    application/x-kword = exec rclkwd
+

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
    application/x-kword = kword %f
+ + +

Openoffice filter

+

The filter script for all releases up to and including 1.7.5 had + a bug on Debian and Ubuntu systems. You can download the corrected script.

+ +

Scribus filter

+ +

A new filter for + Scribus files: + rclscribus. This is only for the newer + .sla files. I am willing to add support for the older + .scd format if someone sends me a sample... You'll + have to add the following lines in the configuration files:

+ +

In mimemap:

+
      .sla = application/x-scribus
+

In mimeconf:

+
      application/x-scribus = exec rclscribus
+

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
       application/x-scribus = scribus %f
+ +

Do *not* add entries for .sla.gz, the normal recoll + decompression process will handle them (hopefully...).

+ + +

Lyx filter

+ +

A new filter for + Lyx files: rcllyx. + This probably has quite a few issues with character encoding, + but it's also probably better than handling lyx documents as + text files.

+ +

In mimemap:

+
      .lyx = application/x-lyx
+

In mimeconf:

+
      application/x-lyx = exec rcllyx
+

In + mimeview, or the [view] + section of + mimeconf for older recoll versions:

+
       application/x-lyx = lyx %f
+ +
+ + diff --git a/website/filters/mimeconf b/website/filters/mimeconf new file mode 100644 index 00000000..b555121c --- /dev/null +++ b/website/filters/mimeconf @@ -0,0 +1,215 @@ +# @(#$Id: mimeconf,v 1.41 2008/09/01 20:39:40 dockes Exp $ (C) 2004 J.F.Dockes + +# Recoll : associations of mime types to processing filters. +# There are different sections for decompression, 'interning' for indexing +# and preview, and external viewers + +## ####################################### +# Decompression: these types need a first pass to create a temp file to +# work with. We use a script because uncompress utilities usually work in +# place, which is not suitable. +# +# The %t parameter will be substituted to the name of a temporary directory +# by recoll. This directory is guaranteed empty when calling the filter +# +# The %f parameter will be substituted with the input file. +# +# The script (ie: rcluncomp) must output the uncompressed file name on +# stdout. +application/x-gzip = uncompress rcluncomp gunzip %f %t +application/x-compress = uncompress rcluncomp gunzip %f %t +application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t + +## ################################### +# Filters for indexing and internal preview. +# The "internal" filters are hardwired in the c++ code. +# The external "exec" filters are typically scripts. They output the +# document in simple html format, have a look at the scripts. 
+[index] +application/msword = exec rcldoc +application/ogg = exec rclogg +application/pdf = exec rclpdf +application/postscript = exec rclps +application/vnd.ms-excel = exec rclxls +application/vnd.ms-powerpoint = exec rclppt +application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ + exec rclopxml +application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ + exec rclopxml +application/vnd.openxmlformats-officedocument.presentationml.template = \ + exec rclopxml +application/vnd.openxmlformats-officedocument.presentationml.presentation = \ + exec rclopxml +application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ + exec rclopxml +application/vnd.openxmlformats-officedocument.spreadsheetml.template =\ + exec rclopxml +application/vnd.sun.xml.calc = exec rclsoff +application/vnd.sun.xml.calc.template = exec rclsoff +application/vnd.sun.xml.draw = exec rclsoff +application/vnd.sun.xml.draw.template = exec rclsoff +application/vnd.sun.xml.impress = exec rclsoff +application/vnd.sun.xml.impress.template = exec rclsoff +application/vnd.sun.xml.math = exec rclsoff +application/vnd.sun.xml.writer = exec rclsoff +application/vnd.sun.xml.writer.global = exec rclsoff +application/vnd.sun.xml.writer.template = exec rclsoff +application/vnd.wordperfect = exec rclwpd +application/x-abiword = exec rclabw +application/x-dvi = exec rcldvi +application/x-flac = exec rclflac +application/x-kword = exec rclkwd +application/x-lyx = exec rcllyx +application/x-scribus = exec rclscribus +application/x-tex = exec rcltex +audio/mpeg = exec rclid3 +image/gif = exec rclimg +image/jpeg = exec rclimg +image/png = exec rclimg +image/tiff = exec rclimg +image/vnd.djvu = exec rcldjvu +image/svg+xml = exec rclsvg +message/rfc822 = internal +text/html = internal +text/plain = internal +text/rtf = exec rclrtf +text/x-gaim-log = exec rclgaim +text/x-html-sidux-man = exec rclsiduxman +text/x-mail = internal +text/x-man = exec rclman + +## 
############################################# +# Icons to be used in the result list if required by gui config +[icons] +application/msword = wordprocessing +application/ogg = sownd +application/pdf = pdf +application/postscript = postscript +application/vnd.ms-excel = spreadsheet +application/vnd.ms-powerpoint = presentation +application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ + wordprocessing +application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ + wordprocessing +application/vnd.openxmlformats-officedocument.presentationml.template = \ + presentation +application/vnd.openxmlformats-officedocument.presentationml.presentation = \ + presentation +application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ + spreadsheet +application/vnd.openxmlformats-officedocument.spreadsheetml.template =\ + spreadsheet +application/vnd.sun.xml.calc = spreadsheet +application/vnd.sun.xml.calc.template = spreadsheet +application/vnd.sun.xml.draw = drawing +application/vnd.sun.xml.draw.template = drawing +application/vnd.sun.xml.impress = presentation +application/vnd.sun.xml.impress.template = presentation +application/vnd.sun.xml.writer = wordprocessing +application/vnd.sun.xml.writer.global = wordprocessing +application/vnd.sun.xml.writer.template = wordprocessing +application/vnd.wordperfect = wordprocessing +application/x-abiword = wordprocessing +application/x-dvi = document +application/x-flac = sownd +application/x-fsdirectory = folder +application/x-kword = wordprocessing +application/x-lyx = wordprocessing +application/x-scribus = document +application/x-tex = wordprocessing +audio/mpeg = sownd +image/gif = image +image/jpeg = image +image/png = image +image/tiff = image +image/vnd.djvu = document +image/svg+xml = drawing +message/rfc822 = message +text/html = html +text/plain = txt +text/x-c = source +text/x-html-sidux-man = sidux-book +text/x-mail = message +text/x-man = document + +[categories] + +text = \ + 
application/msword \ + application/pdf \ + application/postscript \ + application/vnd.openxmlformats-officedocument.wordprocessingml.document \ + application/vnd.openxmlformats-officedocument.wordprocessingml.template \ + application/vnd.sun.xml.writer \ + application/vnd.sun.xml.writer.global \ + application/vnd.sun.xml.writer.template \ + application/vnd.wordperfect \ + application/x-abiword \ + application/x-dvi \ + application/x-kword \ + application/x-lyx \ + application/x-scribus \ + application/x-tex \ + image/vnd.djvu \ + text/html \ + text/plain \ + text/rtf \ + text/x-html-sidux-man \ + text/x-man + +spreadsheet = \ + application/vnd.ms-excel \ + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet \ + application/vnd.openxmlformats-officedocument.spreadsheetml.template \ + application/vnd.sun.xml.calc \ + application/vnd.sun.xml.calc.template + +presentation = application/vnd.ms-powerpoint \ + application/vnd.openxmlformats-officedocument.presentationml.template \ + application/vnd.openxmlformats-officedocument.presentationml.presentation \ + application/vnd.sun.xml.impress \ + application/vnd.sun.xml.impress.template + +media = \ + audio/mpeg \ + application/ogg \ + application/x-flac \ + image/jpeg \ + image/png \ + image/tiff \ + image/gif \ + +message = message/rfc822 \ + text/x-gaim-log \ + text/x-mail \ + +other = application/vnd.sun.xml.draw \ + application/vnd.sun.xml.draw.template \ + application/vnd.sun.xml.math \ + application/x-fsdirectory \ + image/svg+xml \ + + +[prefixes] + +# This allows extending the set of fields that recoll understand/searches. +# See the manual for exact usage. +# Important: +# - the field names MUST be all lowercase here. They can be anycased +# in the documents: +# - The extension field prefixes MUST begin with X and be all UPPERCASE. 
+title = S +caption = S +subject = S + +author = A +creator = A +from = A + +keyword = K +tag = K +keywords = K +tags = K + +# testing /example : +recollspecialfield = XRCLSF diff --git a/website/filters/mimemap b/website/filters/mimemap new file mode 100644 index 00000000..bc811db5 --- /dev/null +++ b/website/filters/mimemap @@ -0,0 +1,132 @@ +# @(#$Id: mimemap,v 1.31 2008/08/25 16:12:16 dockes Exp $ (C) 2004 J.F.Dockes +# Recoll: associations of file name extensions to mime types + +.txt = text/plain +.text = text/plain +.d = text/plain + +# Source files. Defining them as text/x-c will enable ext viewer. If +# text/plain they will be somewhat indexed +.cpp = text/x-c +.h = text/x-c +.c = text/x-c +.cc = text/x-c + +.rtf = text/rtf + +.html = text/html +.htm = text/html +.shtml = text/html +.php = text/html + +.pdf = application/pdf + +.ps = application/postscript +.eps = application/postscript +.ai = application/postscript + +.tex = application/x-tex +.dvi = application/x-dvi + +.djvu = image/vnd.djvu +.svg = image/svg+xml + +.gz = application/x-gzip +.Z = application/x-gzip +.bz2 = application/x-bzip2 +#.Z = application/x-compress + +.doc = application/msword +.ppt = application/vnd.ms-powerpoint +.xls = application/vnd.ms-excel + +# OpenOffice / opendocument. 
We handle opendocument as old openoffice files +# for now +.sxc = application/vnd.sun.xml.calc +.ods = application/vnd.sun.xml.calc +.stc = application/vnd.sun.xml.calc.template +.sxd = application/vnd.sun.xml.draw +.std = application/vnd.sun.xml.draw.template +.sxi = application/vnd.sun.xml.impress +.odp = application/vnd.sun.xml.impress +.sti = application/vnd.sun.xml.impress.template +.sxm = application/vnd.sun.xml.math +.sxw = application/vnd.sun.xml.writer +.odt = application/vnd.sun.xml.writer +.sxg = application/vnd.sun.xml.writer.global +.stw = application/vnd.sun.xml.writer.template + +# ms openxml +.docm = application/vnd.ms-word.document.macroEnabled.12 +.docx = application/vnd.openxmlformats-officedocument.wordprocessingml.document +.dotm = application/vnd.ms-word.template.macroEnabled.12 +.dotx = application/vnd.openxmlformats-officedocument.wordprocessingml.template +.potm = application/vnd.ms-powerpoint.template.macroEnabled.12 +.potx = application/vnd.openxmlformats-officedocument.presentationml.template +.ppam = application/vnd.ms-powerpoint.addin.macroEnabled.12 +.ppsm = application/vnd.ms-powerpoint.slideshow.macroEnabled.12 +.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow +.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12 +.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation +.xlam = application/vnd.ms-excel.addin.macroEnabled.12 +.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12 +.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12 +.xlsx = application/vnd.openxmlformats-officedocument.spreadsheetml.sheet +.xltm = application/vnd.ms-excel.template.macroEnabled.12 +.xltx = application/vnd.openxmlformats-officedocument.spreadsheetml.template + +.abw = application/x-abiword +.lyx = application/x-lyx +.sla = application/x-scribus +.scd = application/x-scribus + +.kwd = application/x-kword + +.wpd = application/vnd.wordperfect + +.rtf = text/rtf + +.mp3 = 
audio/mpeg +.flac = application/x-flac +.ogg = application/ogg + +.png = image/png +.jpg = image/jpeg +.jpeg = image/jpeg +.gif = image/gif +.tiff = image/tiff +.tif = image/tiff + +# A list of stuff that we don't want to touch at all (for now). Having the +# suffixes listed in there speeds up things quite a lot by avoiding +# unneeded decompression or 'file' calls. File names still get indexed if +# indexallfilenames is set +recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \ + .m4 .tcl .js .sh .pl .awk \ + .o .lib .dll .a \ + .dat .bak .rdf .log .db .ini .msf .pid \ + .gnm .gnumeric \ + .gif .bmp .xpm \ + ,v ~ # + +# Special handling of .txt files inside ~/.gaim directory +[~/.gaim] +.txt = text/x-gaim-log + +# Special handling of sidux manual menu system +[/usr/share/sidux-manual] +.htm = text/x-html-sidux-man +.html = text/x-html-sidux-man + +# Manual files. You may want to adjust the location for your system +# We can't use the default text/troff type because this doesn't say +# what macro set to use (groff -man) +[/usr/share/man] +.1 = text/x-man +.2 = text/x-man +.3 = text/x-man +.4 = text/x-man +.5 = text/x-man +.6 = text/x-man +.7 = text/x-man +.8 = text/x-man diff --git a/website/filters/mimeview b/website/filters/mimeview new file mode 100644 index 00000000..ef441825 --- /dev/null +++ b/website/filters/mimeview @@ -0,0 +1,61 @@ +# @(#$Id: mimeview,v 1.15 2008/09/01 20:39:40 dockes Exp $ (C) 2004 J.F.Dockes + +## ########################################## +# External viewers, launched by the recoll GUI when you click on a result +# 'edit' link + +[view] +# Pseudo entry used if the 'use desktop' preference is set in the GUI +application/x-all = xdg-open %f + +application/x-kword = kword %f +application/x-abiword = abiword %f + +application/msword = openoffice %f +application/ogg = xmms %f +application/pdf = xpdf %f +application/postscript = gv %f +application/vnd.ms-excel = openoffice %f +application/vnd.ms-powerpoint = openoffice %f 
+application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ + openoffice %f +application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ + openoffice %f +application/vnd.openxmlformats-officedocument.presentationml.template = \ + openoffice %f +application/vnd.openxmlformats-officedocument.presentationml.presentation = \ + openoffice %f +application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ + openoffice %f +application/vnd.openxmlformats-officedocument.spreadsheetml.template =\ + openoffice %f +application/vnd.sun.xml.calc = openoffice %f +application/vnd.sun.xml.calc.template = openoffice %f +application/vnd.sun.xml.draw = openoffice %f +application/vnd.sun.xml.draw.template = openoffice %f +application/vnd.sun.xml.impress = openoffice %f +application/vnd.sun.xml.impress.template = openoffice %f +application/vnd.sun.xml.math = openoffice %f +application/vnd.sun.xml.writer = openoffice %f +application/vnd.sun.xml.writer.global = openoffice %f +application/vnd.sun.xml.writer.template = openoffice %f +application/vnd.wordperfect = openoffice %f +application/x-fsdirectory = rox %f +application/x-dvi = xdvi %f +application/x-flac = xmms %f +application/x-lyx = lyx %f +application/x-scribus = scribus %f +application/x-tex = gnuclient -q %f +audio/mpeg = xmms %f +image/jpeg = xv %f +image/png = xv %f +image/tiff = xv %f +image/gif = xv %f +image/svg+xml = inkview %f +image/vnd.djvu = djview %f +# Or firefox -remote "openFile(%u)" +text/html = firefox %u +text/plain = gnuclient -q %f +text/x-c = gnuclient -q %f +text/x-html-sidux-man = konqueror %f +#text/x-html-sidux-man = iceweasel %f diff --git a/website/filters/rclabw b/website/filters/rclabw new file mode 100755 index 00000000..77478127 --- /dev/null +++ b/website/filters/rclabw @@ -0,0 +1,175 @@ +#!/bin/sh +# @(#$Id: rclabw,v 1.2 2007/06/15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: 
+#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# Extract text from an abiword file +#================================================================ + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclabw" +filetype=abiword + +#RECFILTCOMMONCODE +############################################################################## +# !! Leave the previous line unmodified!! Code imported from the +# recfiltcommon file + +# Utility code common to all shell filters. This could be sourced at run +# time, but it's slightly more efficient to include the code in the +# filters at build time (with a sed script). + +# Describe error in a way that can be interpreted by our caller +senderror() +{ + echo RECFILTERROR $* + # Also alert on stderr just in case + echo ":2:$progname::: $*" 1>&2 + exit 1 +} + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + senderror HELPERNOTFOUND $cmd + fi + done +} + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + echo "Convert a $filetype file to HTML text for Recoll indexing." + echo "Usage: $progname [infile]" + exit 1 +fi + +infile="$1" + +# check the input file existence (may be '-' for stdin) +if test "X$infile" != X- -a ! -f "$infile" +then + senderror INPUTNOSUCHFILE "$infile" +fi + +# protect access to our temp files and directories +umask 77 + +############################################################################## +# !! Leave the following line unmodified ! 
+#ENDRECFILTCOMMONCODE + +checkcmds iconv sed + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +encoding=`sed -e '/$//' \ + -e '/^ for all meta +# tags) +descsedprog=' +/\([^<]*\)<\/m>/ { +s//\1/ +p +q +} +//,/<\/m>/ { +s!.*!! +s!.*!! +H +} +${ +g +s/\n/ /g +p +} +' + +description=`sed -n -e "$descsedprog" < "$infile"` +#echo description: "$description" + +# Set program for the single line meta elements. Takes element name as +# parameter +setmetasedprog() { +metasedprog='//{ +s/.*\([^<]*\).*/\1/ +'"s/\"/'/g"' +p +}' +} + +setmetasedprog dc.subject +subject=`sed -n -e "$metasedprog" "$infile"` +#echo subject: "$subject" + +setmetasedprog dc.title +title=`sed -n -e "$metasedprog" "$infile"` +#echo titre: "$title" + +setmetasedprog abiword.keywords +keywords=`sed -n -e "$metasedprog" "$infile"` +#echo keywords: "$keywords" + +setmetasedprog dc.creator +creator=`sed -n -e "$metasedprog" "$infile"` +#echo creator: "$creator" + +# Note: next expr supposes that paragraphs are always all by themselves on +# a single line in the xml (no multiple

per line, no embedded newlines +# in text). +contentsedprog=' +/]/{ +s/<[^>]*>/ /g +p +} +' +content=`sed -n -e "$contentsedprog" "$infile"` +#echo content: "$content" + +# output the result +(echo '' "$title" '' +echo '' +echo '' +echo '' +echo '' +echo '

'
+echo "$content" 
+echo '
') \ +| iconv -f $encoding -t UTF-8 -c -s + + +# exit normally +exit 0 diff --git a/website/filters/rclimg b/website/filters/rclimg new file mode 100755 index 00000000..ced61f98 --- /dev/null +++ b/website/filters/rclimg @@ -0,0 +1,95 @@ +#! /usr/bin/perl -w +# @(#$Id: rclimg,v 1.2 2007/10/02 13:56:42 dockes Exp $ (C) 2007 Cedric Scott +####################################################### +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +###################################################### + +# +# rclimg: extract image tags with exiftool and convert the data to html for +# recoll indexing. 
+# + +# +# maps image file tags to xapian tags +# +$tagMap = { + 'subject' => 'subject', + 'title' => 'title', + 'headline' => 'title', + 'caption' => 'caption', + 'caption-abstract' => 'caption', + 'author' => 'author', + 'creator' => 'creator', + 'from' => 'from', + 'keywords' => 'keywords', + 'keyword' => 'keyword', + 'tag' => 'tag', +}; + +# set to non-zero if tags which map to xapian tags are to output +# in the body as well as the header +# +$headAndBody = 1; + +# xapianTag +# returns a xapian tag to be used for this tag +# +sub xapianTag { + my $imgtag = shift; + while ( ( $tagre, $xapiantag) = each %{$tagMap} ) { + return $xapiantag if $imgtag =~ /$tagre/i; + } + return undef; +} + +# +# start here +# +use Image::ExifTool qw(:Public); + +$imageFile = shift; +$imageFile = '-' if $imageFile eq ''; +unless ( open(IMGF, $imageFile) ) { + print STDERR "$0: can't open file $imageFile\n"; + exit(1); # file doesn't exist or can't be read +} +$info = ImageInfo(\*IMGF); +die unless $info; +$fields = []; +$other = []; +$titleHtmlTag = ""; +foreach $tagname ( sort keys %{$info} ) { + $xapiantag = xapianTag($tagname); + if (defined $xapiantag ) { + push @{$fields}, [ $xapiantag, $info->{$tagname} ]; + $titleHtmlTag = "$info->{$tagname}" if $xapiantag eq 'title'; + push @{$other}, [ $tagname, $info->{$tagname} ] if $headAndBody; + } else { + push @{$other}, [ $tagname, $info->{$tagname} ]; + } +} +print "\n\n$titleHtmlTag\n"; +print "\n"; +foreach $tagpair ( @{$fields} ) { + ($tagname, $value) = @{$tagpair}; + print "\n"; +} +print "\n"; +foreach $tagpair (@{$other} ) { + ($tagname, $value) = @{$tagpair}; + printf "%30s : %s
\n", $tagname, $value; +} +print "\n\n"; diff --git a/website/filters/rclkwd b/website/filters/rclkwd new file mode 100755 index 00000000..a4aad5ad --- /dev/null +++ b/website/filters/rclkwd @@ -0,0 +1,204 @@ +#!/bin/sh +# @(#$Id: rclkwd,v 1.1 2007/06/08 14:01:30 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rclkword +# Extract text from a kword file +# +#================================================================ + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclkwd" +filetype=kword + + + +#RECFILTCOMMONCODE +############################################################################## +# !! Leave the previous line unmodified!! Code imported from the +# recfiltcommon file + +# Utility code common to all shell filters. This could be sourced at run +# time, but it's slightly more efficient to include the code in the +# filters at build time (with a sed script). + +# Describe error in a way that can be interpreted by our caller +senderror() +{ + echo RECFILTERROR $* + # Also alert on stderr just in case + echo ":2:$progname::: $*" 1>&2 + exit 1 +} + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + senderror HELPERNOTFOUND $cmd + fi + done +} + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + echo "Convert a $filetype file to HTML text for Recoll indexing." 
+ echo "Usage: $progname [infile]" + exit 1 +fi + +infile="$1" + +# check the input file existence (may be '-' for stdin) +if test "X$infile" != X- -a ! -f "$infile" +then + senderror INPUTNOSUCHFILE "$infile" +fi + +# protect access to our temp files and directories +umask 77 + +############################################################################## +# !! Leave the following line unmodified ! +#ENDRECFILTCOMMONCODE + +checkcmds awk unzip gunzip tar + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# We need a temporary directory +if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi +tmpdir=$ttdir/rclkwd_tmp$$ +mkdir $tmpdir || exit 1 +mkdir $tmpdir/rclkwdtmp || exit 1 + +cleanup() +{ + # Note that we're using a constant part (rclkwdtmp), that hopefully + # guarantees that we can't do big mistakes here. + rm -rf $tmpdir/rclkwdtmp + rmdir $tmpdir +} + +trap cleanup EXIT HUP QUIT INT TERM + +# Old kwd files are gzip/tar archibes. Newer ones are zip archives. +if file $infile | grep -qi gzip ; then + # Unzip the input file and change to the unzipped directory + gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -) +else + echo new kwd + # Unzip the input file and change to the unzipped directory + unzip -q -d $tmpdir/rclkwdtmp "$infile" +fi +cd $tmpdir/rclkwdtmp + +metafile=documentinfo.xml +contentfile=maindoc.xml + +if test -f $metafile ; then + + # Note: there can be newlines inside the description field, we don't want + # them... + abssedprog='//,/<\/abstract>/{ +s!.*!! +s!.*!! 
+p +} +' + abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \ + sed -e '1s///'` + subject=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ + < $metafile` + title=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ + < $metafile | tr '\n' ' '` + keywords=`sed -e "s/\"/'/" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \ + < $metafile` +fi + +# Note: next expr inserts a newline at each end of paragraph (for preview) +content="`sed -e 's!</TEXT>!\\ +!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`" + +#echo abstract "$abstract" +#echo subject "$subject" +#echo title "$title" +#echo keywords "$keywords" +#echo content "$content" + +# output the result +echo '<html><head>' +echo '<title>' "$title" '' +echo '' +echo '' +echo '' +echo '

' + +# The strange 'BEGIN' setup is to prevent 'file' from thinking this file +# is an awk program +echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ +awk 'BEGIN'\ +' { + cont = "" +} +{ + $0 = cont $0 + cont = "" + + if ($0 ~ /[­-]$/) { + # Note : soft-hyphen is iso8859 0xad + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH-1) + $0 = line + } + + if($0 == "\f") { + print "

\n
\n

" + next + } + + print $0 "
" +} +END { + printf("

\n"); +}' | iconv -f UTF-8 -t UTF-8 -c -s + +cd / +# exit normally +exit 0 diff --git a/website/filters/rcllyx b/website/filters/rcllyx new file mode 100755 index 00000000..047f8e6e --- /dev/null +++ b/website/filters/rcllyx @@ -0,0 +1,195 @@ +#!/bin/sh +# @(#$Id: rcllyx,v 1.4 2007/01/23 07:23:12 dockes Exp $ (C) 2004 J.F.Dockes +# There may still be code from Estraier in here: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rcllyx +# Convert a lyx file to recoll HTML. +# +# We use lyx --export. It was suggested to use untex, but it doesn't give +# good results on raw lyx (of course, this is not TeX), and exporting to +# LaTex then using untex doesn't look nice when we can use the native lyx +# text export. +# The character encoding of the exported text is defined by the +# \inputencoding directive in the lyx file header and, in quite an obscure +# way, by the \language parameter. We use a heuristic to divine the output +# text encoding and it is guaranteed not to work in all cases. Trials using +# an intermediary dvi, pdf or ps file gave worse results. This needs +# improvement. It doesn't even take into account the fact that the language +# can change inside the doc (does this change the encoding or not ?). To be +# frank, this is not entirely my fault, the lyx format is a joke. +# +# As there is unfortunately no way to define the output file name, we have +# to use a temporary directory and link the input file in there. 
+ +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rcllyx" + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Extract lyx text as basic HTML.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} + +checkcmds lyx iconv + +# We need a temporary directory +if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi + +tmpdir=$ttdir/rcllyx_tmp$$ +mkdir $tmpdir || exit 1 +mkdir $tmpdir/rcllyxtmp || exit 1 + +cleanup() +{ + # Note that we're using a constant part (rcllyxtmp), that hopefully + # guarantees that we can't do big mistakes here. 
+ rm -rf $tmpdir/rcllyxtmp + rmdir $tmpdir +} + +trap cleanup EXIT HUP QUIT INT TERM + +workdir=$tmpdir/rcllyxtmp +case "$infile" in + */*) ;; + *) infile=`pwd`/$infile;; +esac + +binfile=`basename $infile` +ln -s "$infile" "$workdir/$binfile" || exit 1 +lyxfile=$workdir/$binfile +textfile=$workdir/`basename $binfile .lyx`.txt + +#echo binfile: $binfile;echo lyxfile: $lyxfile ; ls -l $lyxfile; echo textfile: $textfile + +# Run lyx --export +lyx --export text $lyxfile + +# Charset and language +formatline=`egrep '^\\\lyxformat ' $lyxfile` +if test -n "$formatline" ; then + set $formatline + format=$2 +fi +charsetline=`egrep '^\\\inputencoding ' $lyxfile` +if test -n "$charsetline" ; then + set $charsetline + charset=$2 +fi +langline=`egrep '^\\\language ' $lyxfile` +if test -n "$langline" ; then + set $langline + lang=$2 +fi +#echo format: [$format] charset: [$charset] lang [$lang] + +if test "$format" -ge 249 ; then + charset=utf-8 +else + # try to guess the charset from the language: this is in no way guaranteed + # to work, the logic has built-in inconsistencies even beyond the numerous + # external ones (what if the ukrainian writer prefers koi8-r ?). This is a + # joke. + if test -z "$charset" -o "$charset" = default -o "$charset" = auto ; then + case "$lang" in + american|afrikaans|basque|catalan|danish|dutch|english|faeroese|finnish|french|galician|german|icelandic|irish|italian|norwegian|portuguese|spanish|swedish) + charset=iso-8859-1;; + czech|german|hungarian|polish|romanian|croatian|slovak|slovene) + charset=iso-8859-2;; + esperanto|galician|maltese|Turkish) + charset=iso-8859-3;; + estonian|latvian|lithuanian) + charset=iso-8859-4;; + bulgarian|byelorussian|macedonian|russian|serbian|ukrainian) + charset=iso-8859-5;; + arabic) + charset=iso-8859-6;; + greek) + charset=iso-8859-7;; + hebrew) + charset=iso-8859-8;; + #ISO-8859-9 - Latin 5 Same as 8859-1 except for Turkish instead of + #Icelandic. ? 
What is one to do :) + #ISO-8859-10 - Latin 6 + lappish|nordic|eskimo|inuit|sami) + charset=iso-8859-10;; + albanian|german|english|basque|breton|catalan|danish|spanish|estonian|esthonian|faeroese|faroese|finnish|french|frisian|friesian|scottish|goidelic|irish|gaelic|galician|welsh|greenlandic|inuit|icelandic|italian|latin|dutch|norvegian|portuguese|romansch|romansh|friulian|ladin|swedish) + charset=iso-8859-15;; + *) + charset=iso-8859-1;; + esac + fi +fi + +if test -n "$charset" ; then + inputcmd="iconv -f $charset -t UTF-8 -c -s" +else + inputcmd=cat +fi +#echo inputcmd: [$inputcmd] + +cat < + + $title + + + +
+EOF
+
+$inputcmd < $textfile
+
+cat <
+
+
+EOF
diff --git a/website/filters/rclopxml b/website/filters/rclopxml
new file mode 100755
index 00000000..e05735f9
--- /dev/null
+++ b/website/filters/rclopxml
@@ -0,0 +1,245 @@
+#!/bin/sh
+# @(#$Id: rclopxml,v 1.2 2008/09/01 17:31:47 dockes Exp $  (C) 2004 J.F.Dockes
+#================================================================
+# rclopxml
+# Extract text from an openxml file (msword, spreadsheet or presentation)
+# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
+#================================================================
+
+# C locale for everything we run; our name and file type are for the messages
+LANG=C LC_ALL=C
+export LANG LC_ALL
+progname=rclopxml
+filetype=openxml
+
+#RECFILTCOMMONCODE
+##############################################################################
+# !! Leave the previous line unmodified!! Code imported from the
+# recfiltcommon file
+
+# Utility code common to all shell filters. This could be sourced at run
+# time, but it's slightly more efficient to include the code in the
+# filters at build time (with a sed script).
+
+# senderror CODE [DETAIL...]: print "RECFILTERROR CODE DETAIL" on stdout for our
+# caller, alert on stderr too, exit 1. "$*" is quoted: no globbing/re-splitting.
+senderror()
+{
+    printf '%s\n' "RECFILTERROR $*"
+    printf '%s\n' ":2:$progname::: $*" 1>&2
+    exit 1
+}
+
+iscmd()
+{
+    cmd=$1
+    case $cmd in
+    */*)
+	if test -x $cmd ; then return 0; else return 1; fi ;;
+    *)
+      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
+      for d in $*;do test -x $d/$cmd && return 0;done
+      return 1 ;;
+    esac
+}
+
+# checkcmds NAME...: make sure every helper program named as argument is
+# available. The first missing one is reported to the caller through
+# "senderror HELPERNOTFOUND NAME", which terminates the filter.
+checkcmds()
+{
+    # "$@" rather than $*: each argument is checked exactly as given
+    for cmd in "$@"
+    do
+      iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
+    done
+}
+
+# show help message
+if test $# -ne 1 -o "$1" = "--help" 
+then
+  echo "Convert a $filetype file to HTML text for Recoll indexing."
+  echo "Usage: $progname [infile]"
+  exit 1
+fi
+
+infile="$1"
+
+# Check the input file existence (may be '-' for stdin). Two separate tests
+# joined by && rather than the obsolescent, ambiguity-prone "test ... -a ...".
+if test "X$infile" != X- && test ! -f "$infile" ; then
+  senderror INPUTNOSUCHFILE "$infile"
+fi
+
+# protect access to our temp files and directories
+umask 77
+
+##############################################################################
+# !! Leave the following line unmodified !
+#ENDRECFILTCOMMONCODE
+
+checkcmds xsltproc unzip
+
+# check the input file existence
+if test ! -f "$infile"
+then
+  printf '%s: %s: no such file\n' "$progname" "$infile"
+  exit 1
+fi
+
+# We need a temporary directory: under RECOLL_TMPDIR if set, else TMPDIR,
+# else /tmp. All later uses are quoted, so the path may contain blanks.
+if test z"$RECOLL_TMPDIR" != z; then
+   ttdir=$RECOLL_TMPDIR
+elif test z"$TMPDIR" != z ; then
+   ttdir=$TMPDIR
+else
+   ttdir=/tmp
+fi
+tmpdir=$ttdir/rclopxml_tmp$$
+mkdir "$tmpdir" || exit 1
+mkdir "$tmpdir/rclopxmltmp" || exit 1
+
+# Remove our work area. Note that we're using a constant part (rclopxmltmp),
+# that hopefully guarantees that we can't do big mistakes here.
+cleanup() { rm -rf "$tmpdir/rclopxmltmp"; rmdir "$tmpdir"; }
+
+# cleanup runs at exit. A signal must end the script (which then cleans up
+# through EXIT): a handler that just returned would let the script run on.
+trap cleanup EXIT
+trap 'exit 1' HUP QUIT INT TERM
+
+# Unzip the input file and change to the unzipped directory
+unzip -q -d "$tmpdir/rclopxmltmp" "$infile"
+cd "$tmpdir/rclopxmltmp" || exit 1
+
+echo '
+'
+
+xsltproc - docProps/core.xml <
+
+
+
+  
+
+  
+    

+    
+    

+    
+  
+
+  
+    
+    
+      
+      author 
+    
+    
+       
+    
+    
+    

+  
+
+  
+    
+    
+      date 
+    
+    
+       
+    
+    
+    

+  
+
+  
+  
+
+
+EOF
+
+echo '
+'
+
+filename=''
+if test -f word/document.xml ; then
+ filenames=word/document.xml 
+ tagmatch="w:p"
+ xmlns_decls='
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+ xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
+ xmlns:o="urn:schemas-microsoft-com:office:office"
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+ xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
+ xmlns:v="urn:schemas-microsoft-com:vml"
+ xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
+ xmlns:w10="urn:schemas-microsoft-com:office:word"
+ xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
+ '
+
+elif test -f xl/sharedStrings.xml ; then
+ filenames=xl/sharedStrings.xml 
+ tagmatch='x:t'
+ xmlns_decls='
+   xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+   xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
+  '
+
+elif test -f ppt/slides/slide1.xml ; then
+ filenames=`echo ppt/slides/slide*.xml`
+ tagmatch='a:t'
+ xmlns_decls='
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+  xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" 
+  xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+ '
+# I want to suppress text output for all except a:t, don't know how to do it
+# help ! At least get rid of these:
+ moretemplates='
+  
+  
+'
+else
+    # ??
+    exit 1
+fi
+
+
+for filename in $filenames;do
+xsltproc - $filename <
+
+
+ 
+
+ 
+  
+ +
+
+ + +

+ +

+
+ + $moretemplates + +
+EOF +done + +echo '' diff --git a/website/filters/rclscribus b/website/filters/rclscribus new file mode 100755 index 00000000..045c022d --- /dev/null +++ b/website/filters/rclscribus @@ -0,0 +1,151 @@ +#!/bin/sh +# @(#$Id: rclscribus,v 1.1 2007/01/22 16:32:55 dockes Exp $ (C) 2004 J.F.Dockes +# There may still be code from Estraier in here: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rclscribus +# Convert a scribus file to recoll HTML. This only handles the newer .sla +# files until I can have a look at an older .scd. +# +# We just hack into the scribus XML, taking advantage that the tag of +# interest is apparently always output on a single line. +# The text seems to be found in attribute CH of tag ITEXT, it is utf-8 + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclscribus" + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Extract scribus text as basic HTML.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds grep awk sed + +# A small sed program to join lines where they are broken inside an +# attribute value. The idea is that all scribus tag are apparently on one +# line except when there are embedded new lines in an attribute lie +# 'comments'. 
The first version of the sed script joins line which does not +# end with > with the next. It doesn't guard against an embedded '>'. The +# seconf joins line not beginning with '<' with the previous. It is much +# slower for some reason. +sedjoinprog=':a +/[^>] *$/N; s/\n/ /; ta' +#sedjoinprog1=':a +#$!N;/^ *[^<]/s/\n/ /;ta +#P;D' + +# Extract description title author and keywords +description=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +title=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +author=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +keywords=`sed -e "$sedjoinprog" < $infile | \ +awk ' +/" + } +} +'` + +#echo description: [$description];echo title: [$title]; +#echo author: [$author];echo keywords: [$keywords] + +cat < +$title + + + + + +

+EOF + + +sed -e ':a' -e '/[^>] *$/N; s/\n/ /; ta' < $infile | \ +awk ' +/" + } +} +END { + print "

" +} +' | \ +sed -e 's//
/g' -e 's//
/g' diff --git a/website/filters/rclsoff b/website/filters/rclsoff new file mode 100755 index 00000000..8508e430 --- /dev/null +++ b/website/filters/rclsoff @@ -0,0 +1,156 @@ +#!/bin/sh +# @(#$Id: rclsoff,v 1.6.6.1 2007/01/21 16:41:49 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rclsoff +# Extract text from an openoffice/soffice file +# +#================================================================ + + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclsoff" + + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Convert an openoffice file to unformatted HTML text.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + echo $cmd not found 1>&2 + exit 1 + fi + done +} +checkcmds awk iconv unzip + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# We need a temporary directory +if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi +tmpdir=$ttdir/rclsoff_tmp$$ +mkdir $tmpdir || exit 1 +mkdir $tmpdir/rclsofftmp || exit 1 + +cleanup() +{ + # Note that we're using a constant part (rclsofftmp), that hopefully + # guarantees that we can't do big mistakes here. 
+ rm -rf $tmpdir/rclsofftmp + rmdir $tmpdir +} + +trap cleanup EXIT HUP QUIT INT TERM + +# Unzip the input file and change to the unzipped directory +unzip -q -d $tmpdir/rclsofftmp "$infile" +cd $tmpdir/rclsofftmp + +# Note: there can be newlines inside the description field, we don't want +# them... +descsedprog='//,/<\/dc:description>/{ +s!.*!! +s!.*!! +p +} +' +description=`sed -n -e "$descsedprog" < meta.xml | tr '\n' ' '` + +subject=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' < meta.xml` + +title=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' < meta.xml` + +keywords=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ + < meta.xml` + +# Note: next expr inserts a newline at each end of paragraph (for preview) +content="`sed -e 's!!\\ +!g' -e 's/<[^>]*>/ /g' < content.xml`" + +#echo description "$description" +#echo subject "$subject" +#echo title "$title" +#echo keywords "$keywords" +#echo content "$content" + +# output the result +echo '' +echo '' "$title" '' +echo '' +echo '' +echo '' +echo '

' + +echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ +awk ' +BEGIN { + cont = "" +} +{ + $0 = cont $0 + cont = "" + + if ($0 ~ /[­-]$/) { + # Note : soft-hyphen is iso8859 0xad + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH-1) + $0 = line + } + + if($0 == "\f") { + print "

\n
\n

" + next + } + + print $0 "
" +} +END { + printf("

\n"); +}' | iconv -f UTF-8 -t UTF-8 -c -s + +cd / +# exit normally +exit 0 diff --git a/website/filters/rclsvg b/website/filters/rclsvg new file mode 100755 index 00000000..e114756c --- /dev/null +++ b/website/filters/rclsvg @@ -0,0 +1,143 @@ +#!/bin/sh +# @(#$Id: rclsvg,v 1.2 2008/02/03 16:05:57 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# Extract text from a Scalable Vector Graphics file +#================================================================ + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclsvg" +filetype=svg + +#RECFILTCOMMONCODE +############################################################################## +# !! Leave the previous line unmodified!! Code imported from the +# recfiltcommon file + +# Utility code common to all shell filters. This could be sourced at run +# time, but it's slightly more efficient to include the code in the +# filters at build time (with a sed script). + +# Describe error in a way that can be interpreted by our caller +senderror() +{ + echo RECFILTERROR $* + # Also alert on stderr just in case + echo ":2:$progname::: $*" 1>&2 + exit 1 +} + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + senderror HELPERNOTFOUND $cmd + fi + done +} + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + echo "Convert a $filetype file to HTML text for Recoll indexing." 
+ echo "Usage: $progname [infile]" + exit 1 +fi + +infile="$1" + +# check the input file existence (may be '-' for stdin) +if test "X$infile" != X- -a ! -f "$infile" +then + senderror INPUTNOSUCHFILE "$infile" +fi + +# protect access to our temp files and directories +umask 77 + +############################################################################## +# !! Leave the following line unmodified ! +#ENDRECFILTCOMMONCODE + +checkcmds iconv sed + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +encoding=`sed -ne '//p +//p +//p +//p' + +# Strip tags +spstriptags='#n +//!{ + N + b c + } + />/s/<.*>//g +} +/^[ ]*$/!p' + +content=`sed -e "$sptagonline" < $infile | sed -ne "$spselecttags" | \ + sed -ne "$spstriptags"` + +(echo '' +echo '' +echo '
'
+echo "$content" 
+echo '
') \ +| iconv -f $encoding -t UTF-8 -c -s + + +# exit normally +exit 0 diff --git a/website/filters/rcltex b/website/filters/rcltex new file mode 100755 index 00000000..aa282a53 --- /dev/null +++ b/website/filters/rcltex @@ -0,0 +1,106 @@ +#!/bin/sh +# @(#$Id: rcltex,v 1.2 2007/11/09 15:56:14 dockes Exp $ (C) 2004 J.F.Dockes +#================================================================ +# rcltex +# Translate TeX files for recoll. Uses either untex or detex to translate to html +#================================================================ +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rcltex" +filetype=TeX + + +#RECFILTCOMMONCODE +############################################################################## +# !! Leave the previous line unmodified!! Code imported from the +# recfiltcommon file + +# Utility code common to all shell filters. This could be sourced at run +# time, but it's slightly more efficient to include the code in the +# filters at build time (with a sed script). + +# Describe error in a way that can be interpreted by our caller +senderror() +{ + echo RECFILTERROR $* + # Also alert on stderr just in case + echo ":2:$progname::: $*" 1>&2 + exit 1 +} + +iscmd() +{ + cmd=$1 + case $cmd in + */*) + if test -x $cmd ; then return 0; else return 1; fi ;; + *) + oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs + for d in $*;do test -x $d/$cmd && return 0;done + return 1 ;; + esac +} + +checkcmds() +{ + for cmd in $*;do + if iscmd $cmd + then + a=1 + else + senderror HELPERNOTFOUND $cmd + fi + done +} + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + echo "Convert a $filetype file to HTML text for Recoll indexing." + echo "Usage: $progname [infile]" + exit 1 +fi + +infile="$1" + +# check the input file existence (may be '-' for stdin) +if test "X$infile" != X- -a ! 
-f "$infile" +then + senderror INPUTNOSUCHFILE "$infile" +fi + +# protect access to our temp files and directories +umask 77 + +############################################################################## +# !! Leave the following line unmodified ! +#ENDRECFILTCOMMONCODE + +if iscmd detex ; then + checkcmds iconv + CMD="detex -n -e ''" +else + checkcmds untex iconv + CMD="untex -giso -a" +fi + +# output the result +echo '' +#echo '' "$title" '' +echo '' +echo '' +echo '
'
+
+#untex -giso -a "$infile" | \
+
+$CMD "$infile" | \
+   iconv -c -f iso-8859-1 -t utf-8 | \
+   sed \
+       -e 's/'
+echo ''
+
+# exit normally
+exit 0
diff --git a/website/filters/rclwpd b/website/filters/rclwpd
new file mode 100755
index 00000000..5459d9bf
--- /dev/null
+++ b/website/filters/rclwpd
@@ -0,0 +1,87 @@
+#!/bin/sh
+# @(#$Id: rclwpd,v 1.1 2007/08/26 13:34:59 dockes Exp $  (C) 2004 J.F.Dockes
+# Some inspiration from estraier
+#================================================================
+# rclwpd
+# convert wordperfect documents to html, by  executing the wpd2html program:
+#    http://libwpd.sourceforge.net/download.html
+#================================================================
+
+# C locale for everything we run; our name and file type are for the messages
+LANG=C LC_ALL=C
+export LANG LC_ALL
+progname=rclwpd
+filetype=wpd
+
+
+#RECFILTCOMMONCODE
+##############################################################################
+# !! Leave the previous line unmodified!! Code imported from the
+# recfiltcommon file
+
+# Utility code common to all shell filters. This could be sourced at run
+# time, but it's slightly more efficient to include the code in the
+# filters at build time (with a sed script).
+
+# senderror CODE [DETAIL...]: print "RECFILTERROR CODE DETAIL" on stdout for our
+# caller, alert on stderr too, exit 1. "$*" is quoted: no globbing/re-splitting.
+senderror()
+{
+    printf '%s\n' "RECFILTERROR $*"
+    printf '%s\n' ":2:$progname::: $*" 1>&2
+    exit 1
+}
+
+iscmd()
+{
+    cmd=$1
+    case $cmd in
+    */*)
+	if test -x $cmd ; then return 0; else return 1; fi ;;
+    *)
+      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
+      for d in $*;do test -x $d/$cmd && return 0;done
+      return 1 ;;
+    esac
+}
+
+# checkcmds NAME...: make sure every helper program named as argument is
+# available. The first missing one is reported to the caller through
+# "senderror HELPERNOTFOUND NAME", which terminates the filter.
+checkcmds()
+{
+    # "$@" rather than $*: each argument is checked exactly as given
+    for cmd in "$@"
+    do
+      iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
+    done
+}
+
+# show help message
+if test $# -ne 1 -o "$1" = "--help" 
+then
+  echo "Convert a $filetype file to HTML text for Recoll indexing."
+  echo "Usage: $progname [infile]"
+  exit 1
+fi
+
+infile="$1"
+
+# Check the input file existence (may be '-' for stdin). Two separate tests
+# joined by && rather than the obsolescent, ambiguity-prone "test ... -a ...".
+if test "X$infile" != X- && test ! -f "$infile" ; then
+  senderror INPUTNOSUCHFILE "$infile"
+fi
+
+# protect access to our temp files and directories
+umask 77
+
+##############################################################################
+# !! Leave the following line unmodified !
+#ENDRECFILTCOMMONCODE
+
+checkcmds wpd2html
+
+# output the result. wpd2html output doesn't seem to need any adjustment?
+
+wpd2html  "$infile" 2> /dev/null