From f385ff4f1aaf61b7e781d108ba8b9cc5e52d9cd7 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 4 May 2010 09:11:56 +0200 Subject: [PATCH] updated filters page for current status --- website/filters/filters.html | 233 ++------------------------------- website/filters/mimeconf | 91 +++++++------ website/filters/mimemap | 35 +++-- website/filters/mimeview | 6 +- website/filters/rclabw | 175 ------------------------- website/filters/rclics | 180 +++++++++++++++++++++++++ website/filters/rclimg | 95 -------------- website/filters/rclkwd | 204 ----------------------------- website/filters/rcllyx | 195 ---------------------------- website/filters/rclopxml | 245 ----------------------------------- website/filters/rclscribus | 151 --------------------- website/filters/rclsoff | 156 ---------------------- website/filters/rclsvg | 143 -------------------- website/filters/rcltex | 106 --------------- website/filters/rclwpd | 87 ------------- 15 files changed, 274 insertions(+), 1828 deletions(-) delete mode 100755 website/filters/rclabw create mode 100755 website/filters/rclics delete mode 100755 website/filters/rclimg delete mode 100755 website/filters/rclkwd delete mode 100755 website/filters/rcllyx delete mode 100755 website/filters/rclopxml delete mode 100755 website/filters/rclscribus delete mode 100755 website/filters/rclsoff delete mode 100755 website/filters/rclsvg delete mode 100755 website/filters/rcltex delete mode 100755 website/filters/rclwpd diff --git a/website/filters/filters.html b/website/filters/filters.html index ebdba757..225fce1f 100644 --- a/website/filters/filters.html +++ b/website/filters/filters.html @@ -56,7 +56,7 @@ ($HOME/.recoll or $RECOLL_CONFDIR).

-

Alternatively, you can replace your 1.[8,9,10] system files with +

Alternatively, you can replace your system files with these updated and complete versions: mimemap mimeconf @@ -64,230 +64,19 @@

Notes:

-

All filters are up to date in Recoll 1.10.5

- -

Recoll 1.10.0: only rclsvg for - Scalable Vector Graphic files is missing.

- -

Recoll 1.9: all filters are up to date in the release, - except the rclimg image - filter and rcltexTeX filter.

- -

Recoll 1.8: The image, kword, - abiword and wordperfect can be installed in - addition.

+

All filters are up to date in Recoll 1.13.04, except rclics.

+

If you are using an older version, you should update to 1.13.04.

-

Open XML Office formats

-

Filter: rclopxml.

-

This needs xsltproc to be - installed (if you run a decently recent Linux, this is - probably on your system already).

- -

The filters are certainly not perfect, but extract a good - part of the text, which is probably better than nothing.

- -

There are quite a few added lines in the configuration - files, just fetch the new ones: - mimemap - mimeconf - mimeview

- - -

Scalable Vector Graphics filter

- -

A new filter for SVG files: - rclsvg. - You'll have to add the following lines in the configuration - files:

- -

In mimemap:

-
.svg = image/svg+xml
-
-

In mimeconf, [index] section:

-
image/svg+xml = exec rclsvg
-

mimeconf, [icons] section:

-
image/svg+xml = drawing
-

mimeconf, [categories] section, also add - image/svg+xml to the other list.

- -

The filter is based on sed, so - you don't need to install any external application.

- -

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
    image/svg+xml = inkview %f
-

(Or substitute your favorite editor).

- - - -

TeX filter

- -

A new filter for TeX files: - rcltex. - You'll have to add the following lines in the configuration - files:

- -

In mimemap:

-
.tex = application/x-tex
-
-

In mimeconf, [index] section:

-
    application/x-tex = exec rcltex
-

mimeconf, [icons] section:

-
application/x-tex = wordprocessing
-

mimeconf, [categories] section, also add - application/x-tex to the texts list.

- -

This filter uses either untex - or detex - if the command is available. . A copy of the - source code for untex is stored - here

- -

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
    application/x-tex = gnuclient -q %f
-

(Or substitute your favorite editor).

- - -

A filter for image tags

- -

A new filter for extracting tags from image and picture files: - rclimg, by Cedric Scott. It is based on - the Exiftool Perl application and library. - You'll have to add the following lines in the configuration - files:

- -

In mimemap:

-
.jpeg = image/jpeg
-.gif = image/gif
-.tiff = image/tiff
-.tif  = image/tiff
-
-

In mimeconf, [index] section:

-
image/gif = exec rclimg
-image/jpeg = exec rclimg
-image/png = exec rclimg
-image/tiff = exec rclimg
-      
-

And remove the image/jpeg = exec rcljpeg line.

- -

Exiftool supports many other image formats, just enter any - additional ones like above.

- -

Wordperfect filter

- -

A new filter for Wordperfect files: - rclwpd. - You'll have to add the following lines in the configuration - files:

- -

In mimemap:

-
.wpd = application/vnd.wordperfect
-
-

In mimeconf, [index] section:

-
    application/vnd.wordperfect = exec rclwpd
-

mimeconf, [icons] section:

-
application/vnd.wordperfect = wordprocessing
-

mimeconf, [categories] section, also add - application/vnd.wordperfect to the texts list.

- -

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
    application/vnd.wordperfect = openoffice %f
- -

Abiword filter

- -

A new filter for - abiword files: - rclabw. - You'll have to add the following lines in the configuration - files:

- -

In mimemap:

-
    .abw = application/x-abiword
- -

In mimeconf:

-
    application/x-abiword = exec rclabw
- -

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
    application/x-abiword = abiword %f
- -

Kword filter

- -

A new filter for - kword files: - rclkwd. - You'll have to add the following lines in the configuration - files:

- -

In mimemap:

-
    .kwd = application/x-kword
-

In mimeconf:

-
    application/x-kword = exec rclkwd
-

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
    application/x-kword = kword %f
- - -

Openoffice filter

-

The filter script for all releases up and including 1.7.5 had - a bug on Debian and Ubuntu systems. You can download the corrected script.

- -

Scribus filter

- -

A new filter for - Scribus files: - rclscribus. This is only for the newer - .sla files. I am willing to add support for the older - .scd format if someone sends me a sample... You'll - have to add the following lines in the configuration files:

- -

In mimemap:

-
      .sla = application/x-scribus
-

In mimeconf:

-
      application/x-scribus = exec rclscribus
-

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
       application/x-scribus = scribus %f
- -

Do *not* add entries for .sla.gz, the normal recoll - decompression process will handle them (hopefully...).

- - -

Lyx filter

- -

A new filter for - Lyx files: rcllyx. - This probably has quite a few issues with character encoding, - but it's also probably better than handling lyx documents as - text files.

- -

In mimemap:

-
      .lyx = application/x-lyx
-

In mimeconf:

-
      application/x-lyx = exec rcllyx
-

In - mimeview, or the [view] - section of - mimeconf for older recoll versions:

-
       application/x-lyx = lyx %f
+

Updated icalendar filter

+

The filter script for all releases up to and including 1.13.04 used + the icalendar Python module, which is not robust enough against some + syntax errors (found for example in Mozilla exports). The new version + uses an internal trivial parser, which will hopefully be both + sufficient for what we are doing and more robust. + You can download the new script. This will not + work for versions prior to 1.13.

diff --git a/website/filters/mimeconf b/website/filters/mimeconf index b555121c..9e7af312 100644 --- a/website/filters/mimeconf +++ b/website/filters/mimeconf @@ -1,4 +1,4 @@ -# @(#$Id: mimeconf,v 1.41 2008/09/01 20:39:40 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeconf,v 1.48 2008-11-27 13:35:24 dockes Exp $ (C) 2004 J.F.Dockes # Recoll : associations of mime types to processing filters. # There are different sections for decompression, 'interning' for indexing @@ -23,15 +23,24 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t ## ################################### # Filters for indexing and internal preview. # The "internal" filters are hardwired in the c++ code. -# The external "exec" filters are typically scripts. They output the +# The external "exec" filters are typically scripts. By default, they output the # document in simple html format, have a look at the scripts. +# A different format (ie text/plain), and a character set can be defined for +# each filter, see the exemples below (ie: msword) [index] -application/msword = exec rcldoc +# Note: rcldoc did some work to splice hyphenated words at eol. Seems +# actually not needed because antiword apparently does it too +# application/msword = exec rcldoc +application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8 +# Alternatively you can use wvWare for msword. 
It's much slower than +# antiword, but will handle documents which provoke the 'I'm afraid the +# text stream of this file is too small to handle' antiword error +# application/msword = exec wvWare --charset=utf-8 --nographics application/ogg = exec rclogg application/pdf = exec rclpdf -application/postscript = exec rclps -application/vnd.ms-excel = exec rclxls -application/vnd.ms-powerpoint = exec rclppt +application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain +application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain +application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ exec rclopxml application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ @@ -54,29 +63,40 @@ application/vnd.sun.xml.math = exec rclsoff application/vnd.sun.xml.writer = exec rclsoff application/vnd.sun.xml.writer.global = exec rclsoff application/vnd.sun.xml.writer.template = exec rclsoff -application/vnd.wordperfect = exec rclwpd +application/vnd.wordperfect = exec wpd2html;mimetype=text/html application/x-abiword = exec rclabw +application/x-awk = internal application/x-dvi = exec rcldvi application/x-flac = exec rclflac application/x-kword = exec rclkwd application/x-lyx = exec rcllyx +application/x-perl = internal application/x-scribus = exec rclscribus +application/x-shellscript = internal application/x-tex = exec rcltex +application/x-chm = execm rclchm +application/zip = execm rclzip audio/mpeg = exec rclid3 -image/gif = exec rclimg -image/jpeg = exec rclimg -image/png = exec rclimg -image/tiff = exec rclimg +image/gif = execm rclimg +image/jpeg = execm rclimg +image/png = execm rclimg +image/tiff = execm rclimg image/vnd.djvu = exec rcldjvu image/svg+xml = exec rclsvg message/rfc822 = internal +text/calendar = execm rclics;mimetype=text/plain;charset=utf-8 text/html = internal text/plain = internal -text/rtf = 
exec rclrtf +text/rtf = exec unrtf --nopict --html;charset=iso-8859-1;mimetype=text/html +text/x-c = internal +text/x-fictionbook = exec rclfb2 text/x-gaim-log = exec rclgaim text/x-html-sidux-man = exec rclsiduxman text/x-mail = internal text/x-man = exec rclman +text/x-purple-log = exec rclpurple +text/x-python = exec rclpython +text/x-shellscript = internal ## ############################################# # Icons to be used in the result list if required by gui config @@ -117,6 +137,9 @@ application/x-kword = wordprocessing application/x-lyx = wordprocessing application/x-scribus = document application/x-tex = wordprocessing +application/x-awk = source +application/x-perl = source +application/x-shellscript = source audio/mpeg = sownd image/gif = image image/jpeg = image @@ -128,9 +151,14 @@ message/rfc822 = message text/html = html text/plain = txt text/x-c = source +text/x-c++ = source +text/x-fictionbook = document text/x-html-sidux-man = sidux-book text/x-mail = message text/x-man = document +application/x-chm = document +text/x-purple-log = pidgin +text/x-python = text-x-python [categories] @@ -145,17 +173,27 @@ text = \ application/vnd.sun.xml.writer.template \ application/vnd.wordperfect \ application/x-abiword \ + application/x-awk \ + application/x-chm \ application/x-dvi \ application/x-kword \ application/x-lyx \ + application/x-perl \ application/x-scribus \ + application/x-shellscript \ application/x-tex \ image/vnd.djvu \ + text/calendar \ text/html \ text/plain \ text/rtf \ + text/x-c \ + text/x-c++ \ + text/x-fictionbook \ text/x-html-sidux-man \ - text/x-man + text/x-man \ + text/x-python \ + text/x-shellscript spreadsheet = \ application/vnd.ms-excel \ @@ -182,34 +220,11 @@ media = \ message = message/rfc822 \ text/x-gaim-log \ text/x-mail \ + text/x-purple-log \ other = application/vnd.sun.xml.draw \ application/vnd.sun.xml.draw.template \ application/vnd.sun.xml.math \ application/x-fsdirectory \ + application/zip \ image/svg+xml \ - - 
-[prefixes] - -# This allows extending the set of fields that recoll understand/searches. -# See the manual for exact usage. -# Important: -# - the field names MUST be all lowercase here. They can be anycased -# in the documents: -# - The extension field prefixes MUST begin with X and be all UPPERCASE. -title = S -caption = S -subject = S - -author = A -creator = A -from = A - -keyword = K -tag = K -keywords = K -tags = K - -# testing /example : -recollspecialfield = XRCLSF diff --git a/website/filters/mimemap b/website/filters/mimemap index bc811db5..0a1b1234 100644 --- a/website/filters/mimemap +++ b/website/filters/mimemap @@ -1,16 +1,23 @@ -# @(#$Id: mimemap,v 1.31 2008/08/25 16:12:16 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimemap,v 1.32 2008-09-15 08:03:37 dockes Exp $ (C) 2004 J.F.Dockes # Recoll: associations of file name extensions to mime types .txt = text/plain .text = text/plain .d = text/plain -# Source files. Defining them as text/x-c will enable ext viewer. If -# text/plain they will be somewhat indexed +# Source files. +# Defining them with specific types allows using a specific ext viewer (in +# mimeview). You can in general use rcltext to wrap them in html for +# indexing the contents (and rough preview). You could also just set them +# as text/plain (index as text, use text viewer) .cpp = text/x-c .h = text/x-c .c = text/x-c .cc = text/x-c +.py = text/x-python +.awk = application/x-awk +.pl = application/x-perl +.sh = application/x-shellscript .rtf = text/rtf @@ -18,6 +25,7 @@ .htm = text/html .shtml = text/html .php = text/html +.ics = text/calendar .pdf = application/pdf @@ -35,10 +43,12 @@ .Z = application/x-gzip .bz2 = application/x-bzip2 #.Z = application/x-compress +.zip = application/zip .doc = application/msword .ppt = application/vnd.ms-powerpoint .xls = application/vnd.ms-excel +.chm = application/x-chm # OpenOffice / opendocument. 
We handle opendocument as old openoffice files # for now @@ -97,21 +107,26 @@ .tiff = image/tiff .tif = image/tiff -# A list of stuff that we don't want to touch at all (for now). Having the -# suffixes listed in there speeds up things quite a lot by avoiding +.fb2 = text/x-fictionbook + +# A list of suffixes (name endings) that we don't want to touch at all. +# Having these explicitely listed speeds things up a bit by avoiding # unneeded decompression or 'file' calls. File names still get indexed if -# indexallfilenames is set +# indexallfilenames is set (so this is different from skippedNames). It's a +# bit unconsistent to have it listed among the suffix translations, but no +# problem in practice. recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \ - .m4 .tcl .js .sh .pl .awk \ .o .lib .dll .a \ - .dat .bak .rdf .log .db .ini .msf .pid \ + .dat .bak .rdf .log .db .msf .pid \ .gnm .gnumeric \ .gif .bmp .xpm \ ,v ~ # -# Special handling of .txt files inside ~/.gaim directory +# Special handling of .txt files inside ~/.gaim and ~/.purple directories [~/.gaim] .txt = text/x-gaim-log +[~/.purple] +.txt = text/x-purple-log # Special handling of sidux manual menu system [/usr/share/sidux-manual] @@ -130,3 +145,5 @@ recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \ .6 = text/x-man .7 = text/x-man .8 = text/x-man +.n = text/x-man +.3pm = text/x-man diff --git a/website/filters/mimeview b/website/filters/mimeview index ef441825..2a3ee048 100644 --- a/website/filters/mimeview +++ b/website/filters/mimeview @@ -1,4 +1,4 @@ -# @(#$Id: mimeview,v 1.15 2008/09/01 20:39:40 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeview,v 1.16 2008-09-15 08:03:37 dockes Exp $ (C) 2004 J.F.Dockes ## ########################################## # External viewers, launched by the recoll GUI when you click on a result @@ -40,8 +40,9 @@ application/vnd.sun.xml.writer = openoffice %f application/vnd.sun.xml.writer.global = openoffice %f 
application/vnd.sun.xml.writer.template = openoffice %f application/vnd.wordperfect = openoffice %f -application/x-fsdirectory = rox %f +application/x-chm = okular %f application/x-dvi = xdvi %f +application/x-fsdirectory = rox %f application/x-flac = xmms %f application/x-lyx = lyx %f application/x-scribus = scribus %f @@ -59,3 +60,4 @@ text/plain = gnuclient -q %f text/x-c = gnuclient -q %f text/x-html-sidux-man = konqueror %f #text/x-html-sidux-man = iceweasel %f +text/x-python = idle %f diff --git a/website/filters/rclabw b/website/filters/rclabw deleted file mode 100755 index 77478127..00000000 --- a/website/filters/rclabw +++ /dev/null @@ -1,175 +0,0 @@ -#!/bin/sh -# @(#$Id: rclabw,v 1.2 2007/06/15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from an abiword file -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclabw" -filetype=abiword - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). 
- -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds iconv sed - -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - -encoding=`sed -e '/$//' \ - -e '/^ for all meta -# tags) -descsedprog=' -/\([^<]*\)<\/m>/ { -s//\1/ -p -q -} -//,/<\/m>/ { -s!.*!! -s!.*!! -H -} -${ -g -s/\n/ /g -p -} -' - -description=`sed -n -e "$descsedprog" < "$infile"` -#echo description: "$description" - -# Set program for the single line meta elements. 
Takes element name as -# parameter -setmetasedprog() { -metasedprog='//{ -s/.*\([^<]*\).*/\1/ -'"s/\"/'/g"' -p -}' -} - -setmetasedprog dc.subject -subject=`sed -n -e "$metasedprog" "$infile"` -#echo subject: "$subject" - -setmetasedprog dc.title -title=`sed -n -e "$metasedprog" "$infile"` -#echo titre: "$title" - -setmetasedprog abiword.keywords -keywords=`sed -n -e "$metasedprog" "$infile"` -#echo keywords: "$keywords" - -setmetasedprog dc.creator -creator=`sed -n -e "$metasedprog" "$infile"` -#echo creator: "$creator" - -# Note: next expr supposes that paragraphs are always all by themselves on -# a single line in the xml (no multiple

per line, no embedded newlines -# in text). -contentsedprog=' -/]/{ -s/<[^>]*>/ /g -p -} -' -content=`sed -n -e "$contentsedprog" "$infile"` -#echo content: "$content" - -# output the result -(echo '' "$title" '' -echo '' -echo '' -echo '' -echo '' -echo '

'
-echo "$content" 
-echo '
') \ -| iconv -f $encoding -t UTF-8 -c -s - - -# exit normally -exit 0 diff --git a/website/filters/rclics b/website/filters/rclics new file mode 100755 index 00000000..f9f0d6c9 --- /dev/null +++ b/website/filters/rclics @@ -0,0 +1,180 @@ +#!/usr/bin/env python + +# Read an ICS file, break it into "documents" which are events, todos, +# or journal entries, and interface with recoll execm +# +# For historical reasons, this can use either the icalendar or the +# vobject Python modules, or an internal splitter. The default is now +# to use the internal splitter, the other modules are more trouble +# than they're worth (to us and until we will want to get into date +# computations etc.) + +import rclexecm +import sys + +# Decide how we'll process the file. +modules = ('internal', 'icalendar', 'vobject') +usemodule = 'internal' +forcevobject = 0 +if usemodule != 'internal': + try: + if forcevobject: + raise Exception + from icalendar import Calendar, Event + usemodule = 'icalendar' + except: + try: + import vobject + usemodule = 'vobject' + except: + print "RECFILTERROR HELPERNOTFOUND python:icalendar" + print "RECFILTERROR HELPERNOTFOUND python:vobject" + sys.exit(1); + + +class IcalExtractor: + def __init__(self, em): + self.file = "" + self.contents = [] + self.em = em + self.em.setmimetype("text/plain") + + def extractone(self, index): + if index >= len(self.contents): + return(False, "", "", True) + docdata = self.contents[index] + #self.em.rclog(docdata) + + iseof = rclexecm.RclExecM.noteof + if self.currentindex >= len(self.contents) -1: + iseof = rclexecm.RclExecM.eofnext + return (True, docdata, str(index), iseof) + + ###### File type handler api, used by rclexecm ----------> + def openfile(self, params): + self.file = params["filename:"] + + try: + calstr = open(self.file, 'rb') + except Exception, e: + self.em.rclog("Openfile: open: %s" % str(e)) + return False + + self.currentindex = 0 + + if usemodule == 'internal': + self.contents = 
ICalSimpleSplitter().splitcalendar(calstr) + elif usemodule == 'icalendar': + try: + cal = Calendar.from_string(calstr.read()) + except Exception, e: + self.em.rclog("Openfile: read or parse error: %s" % str(e)) + return False + self.contents = cal.walk() + self.contents = [item.as_string() for item in self.contents + if (item.name == 'VEVENT' or item.name == 'VTODO' + or item.name == 'VJOURNAL')] + else: + try: + cal = vobject.readOne(calstr) + except Exception, e: + self.em.rclog("Openfile: cant parse object: %s" % str(e)) + return False + for lstnm in ('vevent_list', 'vtodo_list', 'vjournal_list'): + lst = getattr(cal, lstnm, []) + for ev in lst: + self.contents.append(ev.serialize()) + + #self.em.rclog("openfile: Entry count: %d"%(len(self.contents))) + return True + + def getipath(self, params): + try: + index = int(params["ipath:"]) + except: + return False + return self.extractone(index) + + def getnext(self, params): + if self.currentindex >= len(self.contents): + self.em.rclog("getnext: EOF hit") + return (False, "", "", rclexecm.RclExecM.eofnow) + else: + ret= self.extractone(self.currentindex) + self.currentindex += 1 + return ret + +# Trivial splitter: cut objects on BEGIN/END (only for 'interesting' objects) +# ignore all other syntax +class ICalSimpleSplitter: + # Note that if an 'interesting' element is nested inside another one, + # it will not be extracted (stay as text in external event). 
This is + # not an issue and I don't think it can happen with the current list + interesting = ('VTODO', 'VEVENT', 'VJOURNAL') + + def splitcalendar(self, fin): + curblkname = '' + curblk = '' + + lo = [] + for line in fin: + line = line.rstrip() + if line == '': + continue + + if curblkname: + curblk = curblk + line + "\n" + + l = line.split(":") + if len(l) < 2: + continue + + # If not currently inside a block and we see an + # 'interesting' BEGIN, start block + if curblkname == '' and l[0].upper() == "BEGIN" : + name = l[1].upper() + if name in ICalSimpleSplitter.interesting: + curblkname = name + curblk = curblk + line + "\n" + + # If currently accumulating block lines, check for end + if curblkname and l[0].upper() == "END" and \ + l[1].upper() == curblkname: + lo.append(curblk) + curblkname = '' + curblk = '' + + if curblk: + lo.append(curblk) + curblkname = '' + curblk = '' + + return lo + + +##### Main program: either talk to the parent or execute test loop + +e = rclexecm.RclExecM() +ical = IcalExtractor(e) + +if len(sys.argv) == 1: + e.mainloop(ical) +else: + # Got a file name parameter: testing without an execm parent + # Loop on all entries + if not ical.openfile({'filename:':sys.argv[1]}): + print "Open error" + sys.exit(1) + + ecnt = 0 + while 1: + ok, data, ipath, eof = ical.getnext("") + if ok: + ecnt = ecnt + 1 + print "=========== ENTRY %d =================" % ecnt + print data + print + else: + print "Got error, eof %d"%eof + break + diff --git a/website/filters/rclimg b/website/filters/rclimg deleted file mode 100755 index ced61f98..00000000 --- a/website/filters/rclimg +++ /dev/null @@ -1,95 +0,0 @@ -#! 
/usr/bin/perl -w -# @(#$Id: rclimg,v 1.2 2007/10/02 13:56:42 dockes Exp $ (C) 2007 Cedric Scott -####################################################### -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -###################################################### - -# -# rclimg: extract image tags with exiftool and convert the data to html for -# recoll indexing. 
-# - -# -# maps image file tags to xapian tags -# -$tagMap = { - 'subject' => 'subject', - 'title' => 'title', - 'headline' => 'title', - 'caption' => 'caption', - 'caption-abstract' => 'caption', - 'author' => 'author', - 'creator' => 'creator', - 'from' => 'from', - 'keywords' => 'keywords', - 'keyword' => 'keyword', - 'tag' => 'tag', -}; - -# set to non-zero if tags which map to xapian tags are to output -# in the body as well as the header -# -$headAndBody = 1; - -# xapianTag -# returns a xapian tag to be used for this tag -# -sub xapianTag { - my $imgtag = shift; - while ( ( $tagre, $xapiantag) = each %{$tagMap} ) { - return $xapiantag if $imgtag =~ /$tagre/i; - } - return undef; -} - -# -# start here -# -use Image::ExifTool qw(:Public); - -$imageFile = shift; -$imageFile = '-' if $imageFile eq ''; -unless ( open(IMGF, $imageFile) ) { - print STDERR "$0: can't open file $imageFile\n"; - exit(1); # file doesn't exist or can't be read -} -$info = ImageInfo(\*IMGF); -die unless $info; -$fields = []; -$other = []; -$titleHtmlTag = ""; -foreach $tagname ( sort keys %{$info} ) { - $xapiantag = xapianTag($tagname); - if (defined $xapiantag ) { - push @{$fields}, [ $xapiantag, $info->{$tagname} ]; - $titleHtmlTag = "$info->{$tagname}" if $xapiantag eq 'title'; - push @{$other}, [ $tagname, $info->{$tagname} ] if $headAndBody; - } else { - push @{$other}, [ $tagname, $info->{$tagname} ]; - } -} -print "\n\n$titleHtmlTag\n"; -print "\n"; -foreach $tagpair ( @{$fields} ) { - ($tagname, $value) = @{$tagpair}; - print "\n"; -} -print "\n"; -foreach $tagpair (@{$other} ) { - ($tagname, $value) = @{$tagpair}; - printf "%30s : %s
\n", $tagname, $value; -} -print "\n\n"; diff --git a/website/filters/rclkwd b/website/filters/rclkwd deleted file mode 100755 index a4aad5ad..00000000 --- a/website/filters/rclkwd +++ /dev/null @@ -1,204 +0,0 @@ -#!/bin/sh -# @(#$Id: rclkwd,v 1.1 2007/06/08 14:01:30 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# rclkword -# Extract text from a kword file -# -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclkwd" -filetype=kword - - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." 
- echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds awk unzip gunzip tar - -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - -# We need a temporary directory -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpdir=$ttdir/rclkwd_tmp$$ -mkdir $tmpdir || exit 1 -mkdir $tmpdir/rclkwdtmp || exit 1 - -cleanup() -{ - # Note that we're using a constant part (rclkwdtmp), that hopefully - # guarantees that we can't do big mistakes here. - rm -rf $tmpdir/rclkwdtmp - rmdir $tmpdir -} - -trap cleanup EXIT HUP QUIT INT TERM - -# Old kwd files are gzip/tar archibes. Newer ones are zip archives. -if file $infile | grep -qi gzip ; then - # Unzip the input file and change to the unzipped directory - gunzip < "$infile" | (cd $tmpdir/rclkwdtmp;tar xf -) -else - echo new kwd - # Unzip the input file and change to the unzipped directory - unzip -q -d $tmpdir/rclkwdtmp "$infile" -fi -cd $tmpdir/rclkwdtmp - -metafile=documentinfo.xml -contentfile=maindoc.xml - -if test -f $metafile ; then - - # Note: there can be newlines inside the description field, we don't want - # them... - abssedprog='//,/<\/abstract>/{ -s!.*!! -s!.*!! 
-p -} -' - abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \ - sed -e '1s///'` - subject=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ - < $metafile` - title=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ - < $metafile | tr '\n' ' '` - keywords=`sed -e "s/\"/'/" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \ - < $metafile` -fi - -# Note: next expr inserts a newline at each end of paragraph (for preview) -content="`sed -e 's!</TEXT>!\\ -!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`" - -#echo abstract "$abstract" -#echo subject "$subject" -#echo title "$title" -#echo keywords "$keywords" -#echo content "$content" - -# output the result -echo '<html><head>' -echo '<title>' "$title" '' -echo '' -echo '' -echo '' -echo '

' - -# The strange 'BEGIN' setup is to prevent 'file' from thinking this file -# is an awk program -echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ -awk 'BEGIN'\ -' { - cont = "" -} -{ - $0 = cont $0 - cont = "" - - if ($0 ~ /[­-]$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH-1) - $0 = line - } - - if($0 == "\f") { - print "

\n
\n

" - next - } - - print $0 "
" -} -END { - printf("

\n"); -}' | iconv -f UTF-8 -t UTF-8 -c -s - -cd / -# exit normally -exit 0 diff --git a/website/filters/rcllyx b/website/filters/rcllyx deleted file mode 100755 index 047f8e6e..00000000 --- a/website/filters/rcllyx +++ /dev/null @@ -1,195 +0,0 @@ -#!/bin/sh -# @(#$Id: rcllyx,v 1.4 2007/01/23 07:23:12 dockes Exp $ (C) 2004 J.F.Dockes -# There may still be code from Estraier in here: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# rcllyx -# Convert a lyx file to recoll HTML. -# -# We use lyx --export. It was suggested to use untex, but it doesn't give -# good results on raw lyx (of course, this is not TeX), and exporting to -# LaTex then using untex doesn't look nice when we can use the native lyx -# text export. -# The character encoding of the exported text is defined by the -# \inputencoding directive in the lyx file header and, in quite an obscure -# way, by the \language parameter. We use a heuristic to divine the output -# text encoding and it is guaranteed not to work in all cases. Trials using -# an intermediary dvi, pdf or ps file gave worse results. This needs -# improvement. It doesn't even take into account the fact that the language -# can change inside the doc (does this change the encoding or not ?). To be -# frank, this is not entirely my fault, the lyx format is a joke. -# -# As there is unfortunately no way to define the output file name, we have -# to use a temporary directory and link the input file in there. 
- -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rcllyx" - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - printf 'Extract lyx text as basic HTML.\n' - printf 'Usage: %s [infile]\n' "$progname" - exit 1 -fi - -infile="$1" - -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - echo $cmd not found 1>&2 - exit 1 - fi - done -} - -checkcmds lyx iconv - -# We need a temporary directory -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi - -tmpdir=$ttdir/rcllyx_tmp$$ -mkdir $tmpdir || exit 1 -mkdir $tmpdir/rcllyxtmp || exit 1 - -cleanup() -{ - # Note that we're using a constant part (rcllyxtmp), that hopefully - # guarantees that we can't do big mistakes here. 
- rm -rf $tmpdir/rcllyxtmp - rmdir $tmpdir -} - -trap cleanup EXIT HUP QUIT INT TERM - -workdir=$tmpdir/rcllyxtmp -case "$infile" in - */*) ;; - *) infile=`pwd`/$infile;; -esac - -binfile=`basename $infile` -ln -s "$infile" "$workdir/$binfile" || exit 1 -lyxfile=$workdir/$binfile -textfile=$workdir/`basename $binfile .lyx`.txt - -#echo binfile: $binfile;echo lyxfile: $lyxfile ; ls -l $lyxfile; echo textfile: $textfile - -# Run lyx --export -lyx --export text $lyxfile - -# Charset and language -formatline=`egrep '^\\\lyxformat ' $lyxfile` -if test -n "$formatline" ; then - set $formatline - format=$2 -fi -charsetline=`egrep '^\\\inputencoding ' $lyxfile` -if test -n "$charsetline" ; then - set $charsetline - charset=$2 -fi -langline=`egrep '^\\\language ' $lyxfile` -if test -n "$langline" ; then - set $langline - lang=$2 -fi -#echo format: [$format] charset: [$charset] lang [$lang] - -if test "$format" -ge 249 ; then - charset=utf-8 -else - # try to guess the charset from the language: this is in no way guaranteed - # to work, the logic has built-in inconsistencies even beyond the numerous - # external ones (what if the ukrainian writer prefers koi8-r ?). This is a - # joke. - if test -z "$charset" -o "$charset" = default -o "$charset" = auto ; then - case "$lang" in - american|afrikaans|basque|catalan|danish|dutch|english|faeroese|finnish|french|galician|german|icelandic|irish|italian|norwegian|portuguese|spanish|swedish) - charset=iso-8859-1;; - czech|german|hungarian|polish|romanian|croatian|slovak|slovene) - charset=iso-8859-2;; - esperanto|galician|maltese|Turkish) - charset=iso-8859-3;; - estonian|latvian|lithuanian) - charset=iso-8859-4;; - bulgarian|byelorussian|macedonian|russian|serbian|ukrainian) - charset=iso-8859-5;; - arabic) - charset=iso-8859-6;; - greek) - charset=iso-8859-7;; - hebrew) - charset=iso-8859-8;; - #ISO-8859-9 - Latin 5 Same as 8859-1 except for Turkish instead of - #Icelandic. ? 
What is one to do :) - #ISO-8859-10 - Latin 6 - lappish|nordic|eskimo|inuit|sami) - charset=iso-8859-10;; - albanian|german|english|basque|breton|catalan|danish|spanish|estonian|esthonian|faeroese|faroese|finnish|french|frisian|friesian|scottish|goidelic|irish|gaelic|galician|welsh|greenlandic|inuit|icelandic|italian|latin|dutch|norvegian|portuguese|romansch|romansh|friulian|ladin|swedish) - charset=iso-8859-15;; - *) - charset=iso-8859-1;; - esac - fi -fi - -if test -n "$charset" ; then - inputcmd="iconv -f $charset -t UTF-8 -c -s" -else - inputcmd=cat -fi -#echo inputcmd: [$inputcmd] - -cat < - - $title - - - -
-EOF
-
-$inputcmd < $textfile
-
-cat <
-
-
-EOF
diff --git a/website/filters/rclopxml b/website/filters/rclopxml
deleted file mode 100755
index e05735f9..00000000
--- a/website/filters/rclopxml
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclopxml,v 1.2 2008/09/01 17:31:47 dockes Exp $  (C) 2004 J.F.Dockes
-#================================================================
-# rcldocx
-# Extract text from an openxml msword file (will be extended for spreadsheets)
-# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname=rclopxml
-filetype=openxml
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds xsltproc unzip
-
-# check the input file existence
-if test ! -f "$infile"
-then
-  printf '%s: %s: no such file\n' "$progname" "$infile"
-  exit 1
-fi
-
-# We need a temporary directory
-if test z"$RECOLL_TMPDIR" != z; then
-   ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
-   ttdir=$TMPDIR
-else
-   ttdir=/tmp
-fi
-tmpdir=$ttdir/rclopxml_tmp$$
-mkdir $tmpdir || exit 1
-mkdir $tmpdir/rclopxmltmp || exit 1
-
-cleanup()
-{
-    # Note that we're using a constant part (rclopxmltmp), that hopefully
-    # guarantees that we can't do big mistakes here.
-    rm -rf $tmpdir/rclopxmltmp
-    rmdir $tmpdir
-}
-    
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Unzip the input file and change to the unzipped directory
-unzip -q -d $tmpdir/rclopxmltmp "$infile"
-cd $tmpdir/rclopxmltmp
-
-echo '
-'
-
-xsltproc - docProps/core.xml <
-
-
-
-  
-
-  
-    

-    
-    

-    
-  
-
-  
-    
-    
-      
-      author 
-    
-    
-       
-    
-    
-    

-  
-
-  
-    
-    
-      date 
-    
-    
-       
-    
-    
-    

-  
-
-  
-  
-
-
-EOF
-
-echo '
-'
-
-filename=''
-if test -f word/document.xml ; then
- filenames=word/document.xml 
- tagmatch="w:p"
- xmlns_decls='
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
- xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
- xmlns:o="urn:schemas-microsoft-com:office:office"
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
- xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
- xmlns:v="urn:schemas-microsoft-com:vml"
- xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
- xmlns:w10="urn:schemas-microsoft-com:office:word"
- xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
- xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
- '
-
-elif test -f xl/sharedStrings.xml ; then
- filenames=xl/sharedStrings.xml 
- tagmatch='x:t'
- xmlns_decls='
-   xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
-   xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
-  '
-
-elif test -f ppt/slides/slide1.xml ; then
- filenames=`echo ppt/slides/slide*.xml`
- tagmatch='a:t'
- xmlns_decls='
-  xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
-  xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" 
-  xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
- '
-# I want to suppress text output for all except a:t, don't know how to do it
-# help ! At least get rid of these:
- moretemplates='
-  
-  
-'
-else
-    # ??
-    exit 1
-fi
-
-
-for filename in $filenames;do
-xsltproc - $filename <
-
-
- 
-
- 
-  
- -
-
- - -

- -

-
- - $moretemplates - -
-EOF -done - -echo '' diff --git a/website/filters/rclscribus b/website/filters/rclscribus deleted file mode 100755 index 045c022d..00000000 --- a/website/filters/rclscribus +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/sh -# @(#$Id: rclscribus,v 1.1 2007/01/22 16:32:55 dockes Exp $ (C) 2004 J.F.Dockes -# There may still be code from Estraier in here: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# rclscribus -# Convert a scribus file to recoll HTML. This only handles the newer .sla -# files until I can have a look at an older .scd. -# -# We just hack into the scribus XML, taking advantage that the tag of -# interest is apparently always output on a single line. -# The text seems to be found in attribute CH of tag ITEXT, it is utf-8 - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclscribus" - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - printf 'Extract scribus text as basic HTML.\n' - printf 'Usage: %s [infile]\n' "$progname" - exit 1 -fi - -infile="$1" - -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - echo $cmd not found 1>&2 - exit 1 - fi - done -} -checkcmds grep awk sed - -# A small sed program to join lines where they are broken inside an -# attribute value. 
The idea is that all scribus tag are apparently on one -# line except when there are embedded new lines in an attribute lie -# 'comments'. The first version of the sed script joins line which does not -# end with > with the next. It doesn't guard against an embedded '>'. The -# seconf joins line not beginning with '<' with the previous. It is much -# slower for some reason. -sedjoinprog=':a -/[^>] *$/N; s/\n/ /; ta' -#sedjoinprog1=':a -#$!N;/^ *[^<]/s/\n/ /;ta -#P;D' - -# Extract description title author and keywords -description=`sed -e "$sedjoinprog" < $infile | \ -awk ' -/" - } -} -'` - -title=`sed -e "$sedjoinprog" < $infile | \ -awk ' -/" - } -} -'` - -author=`sed -e "$sedjoinprog" < $infile | \ -awk ' -/" - } -} -'` - -keywords=`sed -e "$sedjoinprog" < $infile | \ -awk ' -/" - } -} -'` - -#echo description: [$description];echo title: [$title]; -#echo author: [$author];echo keywords: [$keywords] - -cat < -$title - - - - - -

-EOF - - -sed -e ':a' -e '/[^>] *$/N; s/\n/ /; ta' < $infile | \ -awk ' -/" - } -} -END { - print "

" -} -' | \ -sed -e 's//
/g' -e 's//
/g' diff --git a/website/filters/rclsoff b/website/filters/rclsoff deleted file mode 100755 index 8508e430..00000000 --- a/website/filters/rclsoff +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/sh -# @(#$Id: rclsoff,v 1.6.6.1 2007/01/21 16:41:49 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# rclsoff -# Extract text from an openoffice/soffice file -# -#================================================================ - - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclsoff" - - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - printf 'Convert an openoffice file to unformatted HTML text.\n' - printf 'Usage: %s [infile]\n' "$progname" - exit 1 -fi - -infile="$1" - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - echo $cmd not found 1>&2 - exit 1 - fi - done -} -checkcmds awk iconv unzip - -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - -# We need a temporary directory -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmpdir=$ttdir/rclsoff_tmp$$ -mkdir $tmpdir || exit 1 -mkdir $tmpdir/rclsofftmp || exit 1 - -cleanup() -{ - # Note that we're using a constant part (rclsofftmp), that hopefully - # guarantees that we can't do big mistakes here. 
- rm -rf $tmpdir/rclsofftmp - rmdir $tmpdir -} - -trap cleanup EXIT HUP QUIT INT TERM - -# Unzip the input file and change to the unzipped directory -unzip -q -d $tmpdir/rclsofftmp "$infile" -cd $tmpdir/rclsofftmp - -# Note: there can be newlines inside the description field, we don't want -# them... -descsedprog='//,/<\/dc:description>/{ -s!.*!! -s!.*!! -p -} -' -description=`sed -n -e "$descsedprog" < meta.xml | tr '\n' ' '` - -subject=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' < meta.xml` - -title=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' < meta.xml` - -keywords=`sed -e "s/\"/'/" -e 's/.*\([^<]*\).*/\1/p;d' \ - < meta.xml` - -# Note: next expr inserts a newline at each end of paragraph (for preview) -content="`sed -e 's!!\\ -!g' -e 's/<[^>]*>/ /g' < content.xml`" - -#echo description "$description" -#echo subject "$subject" -#echo title "$title" -#echo keywords "$keywords" -#echo content "$content" - -# output the result -echo '' -echo '' "$title" '' -echo '' -echo '' -echo '' -echo '

' - -echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ -awk ' -BEGIN { - cont = "" -} -{ - $0 = cont $0 - cont = "" - - if ($0 ~ /[­-]$/) { - # Note : soft-hyphen is iso8859 0xad - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH-1) - $0 = line - } - - if($0 == "\f") { - print "

\n
\n

" - next - } - - print $0 "
" -} -END { - printf("

\n"); -}' | iconv -f UTF-8 -t UTF-8 -c -s - -cd / -# exit normally -exit 0 diff --git a/website/filters/rclsvg b/website/filters/rclsvg deleted file mode 100755 index e114756c..00000000 --- a/website/filters/rclsvg +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/sh -# @(#$Id: rclsvg,v 1.2 2008/02/03 16:05:57 dockes Exp $ (C) 2004 J.F.Dockes -# Parts taken from Estraier: -#================================================================ -# Estraier: a personal full-text search system -# Copyright (C) 2003-2004 Mikio Hirabayashi -#================================================================ -#================================================================ -# Extract text from a Scalable Vector Graphics file -#================================================================ - -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rclsvg" -filetype=svg - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." 
- echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! -f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds iconv sed - -# check the input file existence -if test ! -f "$infile" -then - printf '%s: %s: no such file\n' "$progname" "$infile" - exit 1 -fi - -encoding=`sed -ne '//p -//p -//p -//p' - -# Strip tags -spstriptags='#n -//!{ - N - b c - } - />/s/<.*>//g -} -/^[ ]*$/!p' - -content=`sed -e "$sptagonline" < $infile | sed -ne "$spselecttags" | \ - sed -ne "$spstriptags"` - -(echo '' -echo '' -echo '
'
-echo "$content" 
-echo '
') \ -| iconv -f $encoding -t UTF-8 -c -s - - -# exit normally -exit 0 diff --git a/website/filters/rcltex b/website/filters/rcltex deleted file mode 100755 index aa282a53..00000000 --- a/website/filters/rcltex +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/sh -# @(#$Id: rcltex,v 1.2 2007/11/09 15:56:14 dockes Exp $ (C) 2004 J.F.Dockes -#================================================================ -# rcltex -# Translate TeX files for recoll. Uses either untex or detex to translate to html -#================================================================ -# set variables -LANG=C ; export LANG -LC_ALL=C ; export LC_ALL -progname="rcltex" -filetype=TeX - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! 
-f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -if iscmd detex ; then - checkcmds iconv - CMD="detex -n -e ''" -else - checkcmds untex iconv - CMD="untex -giso -a" -fi - -# output the result -echo '' -#echo '' "$title" '' -echo '' -echo '' -echo '
'
-
-#untex -giso -a "$infile" | \
-
-$CMD "$infile" | \
-   iconv -c -f iso-8859-1 -t utf-8 | \
-   sed \
-       -e 's/'
-echo ''
-
-# exit normally
-exit 0
diff --git a/website/filters/rclwpd b/website/filters/rclwpd
deleted file mode 100755
index 5459d9bf..00000000
--- a/website/filters/rclwpd
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/sh
-# @(#$Id: rclwpd,v 1.1 2007/08/26 13:34:59 dockes Exp $  (C) 2004 J.F.Dockes
-# Some inspiration from estraier
-#================================================================
-# rclwpd
-# convert wordperfect documents to html, by  executing the wpd2html program:
-#    http://libwpd.sourceforge.net/download.html
-#================================================================
-
-# set variables
-LANG=C ; export LANG
-LC_ALL=C ; export LC_ALL
-progname="rclwpd"
-filetype=wpd
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds wpd2html
-
-# output the result. wpd2html output doesn't seem to need any adjustment?
-
-wpd2html  "$infile" 2> /dev/null