added the old filters page which had been forgotten
This commit is contained in:
parent
b33ff20b54
commit
538264db95
294
website/filters/filters.html
Normal file
294
website/filters/filters.html
Normal file
@ -0,0 +1,294 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll updated filters</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="../index.html">Home</a></li>
|
||||
<li><a href="../download.html">Downloads</a></li>
|
||||
<li><a href="../usermanual/index.html">User manual</a></li>
|
||||
<li><a href="../usermanual/rcl.install.html">Installation</a></li>
|
||||
<li><a href="../index.html#support">Support</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Updated filters for Recoll</h1>
|
||||
|
||||
<p>This page describes new and updated filters, which will be
|
||||
part of the next release, but can be installed on the current
|
||||
release if you need them.</p>
|
||||
|
||||
<p>For updated filters, you just need to copy the script to the
|
||||
filters directory, which is typically either <span
|
||||
class="filename">/usr/share/recoll/filters</span>, or <span
|
||||
class="filename">/usr/local/share/recoll/filters</span>.</p>
|
||||
|
||||
<p>For new filters, you'll need to copy the script file as
|
||||
above, possibly install the supporting application, and usually
|
||||
edit the
|
||||
<span class="filename">mimemap</span>,
|
||||
<span class="filename">mimeview</span> and
|
||||
<span class="filename">mimeconf</span> files, either in the
|
||||
shared directory
|
||||
(<span class="filename">
|
||||
/usr[/local]/share/recoll/examples</span>), or
|
||||
in your personal configuration directory
|
||||
(<span class="filename">$HOME/.recoll</span> or
|
||||
<span class="filename">$RECOLL_CONFDIR</span>).</p>
|
||||
|
||||
<p>Alternatively, you can replace your 1.[8,9,10] system files with
|
||||
these updated and complete versions:
|
||||
<a href="mimemap">mimemap</a>
|
||||
<a href="mimeconf">mimeconf</a>
|
||||
<a href="mimeview">mimeview</a> </p>
|
||||
|
||||
<p>Notes:</p>
|
||||
<blockquote>
|
||||
<p>All filters are up to date in Recoll 1.10.5</p>
|
||||
|
||||
<p>Recoll 1.10.0: only <span class="filename">rclsvg</span> for
|
||||
Scalable Vector Graphic files is missing.</p>
|
||||
|
||||
<p>Recoll 1.9: all filters are up to date in the release,
|
||||
except the <span class="filename">rclimg</span> image
|
||||
filter and <span class="filename">rcltex</span> TeX filter.</p>
|
||||
|
||||
<p>Recoll 1.8: The image, <b>kword</b>,
|
||||
<b>abiword</b> and <b>wordperfect</b> filters can be installed in
|
||||
addition.</p>
|
||||
</blockquote>
|
||||
|
||||
<h2>Open XML Office formats</h2>
|
||||
|
||||
<p>Filter: <a href="rclopxml">rclopxml</a>. </p>
|
||||
<p>This needs <span class="command">xsltproc</span> to be
|
||||
installed (if you run a decently recent Linux, this is
|
||||
probably on your system already). </p>
|
||||
|
||||
<p>The filters are certainly not perfect, but extract a good
|
||||
part of the text, which is probably better than nothing.</p>
|
||||
|
||||
<p>There are quite a few added lines in the configuration
|
||||
files, just fetch the new ones:
|
||||
<a href="mimemap">mimemap</a>
|
||||
<a href="mimeconf">mimeconf</a>
|
||||
<a href="mimeview">mimeview</a> </p>
|
||||
|
||||
|
||||
<h2>Scalable Vector Graphics filter</h2>
|
||||
|
||||
<p>A new filter for <b>SVG</b> files:
|
||||
<a href="rclsvg">rclsvg</a>.
|
||||
You'll have to add the following lines in the configuration
|
||||
files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre>.svg = image/svg+xml
|
||||
</pre>
|
||||
<p>In <span class="filename">mimeconf</span>, [index] section: </p>
|
||||
<pre>image/svg+xml = exec rclsvg</pre>
|
||||
<p><span class="filename">mimeconf</span>, [icons] section:</p>
|
||||
<pre>image/svg+xml = drawing</pre>
|
||||
<p><span class="filename">mimeconf</span>, [categories] section, also add
|
||||
<tt>image/svg+xml</tt> to the <tt>other</tt> list.</p>
|
||||
|
||||
<p>The filter is based on <span class="command">sed</span>, so
|
||||
you don't need to install any external application.</p>
|
||||
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> image/svg+xml = inkview %f</pre>
|
||||
<p>(Or substitute your favorite editor).</p>
|
||||
|
||||
|
||||
|
||||
<h2>TeX filter</h2>
|
||||
|
||||
<p>A new filter for <b>TeX</b> files:
|
||||
<a href="rcltex">rcltex</a>.
|
||||
You'll have to add the following lines in the configuration
|
||||
files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre>.tex = application/x-tex
|
||||
</pre>
|
||||
<p>In <span class="filename">mimeconf</span>, [index] section: </p>
|
||||
<pre> application/x-tex = exec rcltex</pre>
|
||||
<p>mimeconf, [icons] section:</p>
|
||||
<pre>application/x-tex = wordprocessing</pre>
|
||||
<p>mimeconf, [categories] section, also add
|
||||
application/x-tex to the <tt>text</tt> list.</p>
|
||||
|
||||
<p>This filter uses either <span class="command">untex</span>
|
||||
or <a
href="http://www.cs.purdue.edu/homes/trinkle/detex/">detex</a>
if the command is available. A copy of the
source code for untex is stored <a href="../untex/untex-1.3.jf.tar.gz">
here</a>.</p>
|
||||
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> application/x-tex = gnuclient -q %f</pre>
|
||||
<p>(Or substitute your favorite editor).</p>
|
||||
|
||||
|
||||
<h2>A filter for image tags</h2>
|
||||
|
||||
<p>A new filter for extracting tags from image and picture files:
|
||||
<a href="rclimg">rclimg</a>, by Cedric Scott. It is based on
|
||||
the <b>Exiftool</b> Perl application and library.
|
||||
You'll have to add the following lines in the configuration
|
||||
files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre>.jpeg = image/jpeg
|
||||
.gif = image/gif
|
||||
.tiff = image/tiff
|
||||
.tif = image/tiff
|
||||
</pre>
|
||||
<p>In <span class="filename">mimeconf</span>, [index] section: </p>
|
||||
<pre>image/gif = exec rclimg
|
||||
image/jpeg = exec rclimg
|
||||
image/png = exec rclimg
|
||||
image/tiff = exec rclimg
|
||||
</pre>
|
||||
<p>And remove the <tt>image/jpeg = exec rcljpeg</tt> line.</p>
|
||||
|
||||
<p>Exiftool supports many other image formats, just enter any
|
||||
additional ones like above.</p>
|
||||
|
||||
<h2>Wordperfect filter</h2>
|
||||
|
||||
<p>A new filter for <b>Wordperfect</b> files:
|
||||
<a href="rclwpd">rclwpd</a>.
|
||||
You'll have to add the following lines in the configuration
|
||||
files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre>.wpd = application/vnd.wordperfect
|
||||
</pre>
|
||||
<p>In <span class="filename">mimeconf</span>, [index] section: </p>
|
||||
<pre> application/vnd.wordperfect = exec rclwpd</pre>
|
||||
<p>mimeconf, [icons] section:</p>
|
||||
<pre>application/vnd.wordperfect = wordprocessing</pre>
|
||||
<p>mimeconf, [categories] section, also add
|
||||
application/vnd.wordperfect to the <tt>text</tt> list.</p>
|
||||
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> application/vnd.wordperfect = openoffice %f</pre>
|
||||
|
||||
<h2>Abiword filter</h2>
|
||||
|
||||
<p>A new filter for <a href="http://www.abisource.com/">
|
||||
abiword</a> files: <a href="rclabw">
|
||||
rclabw</a>.
|
||||
You'll have to add the following lines in the configuration
|
||||
files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre> .abw = application/x-abiword</pre>
|
||||
|
||||
<p>In <span class="filename">mimeconf</span>: </p>
|
||||
<pre> application/x-abiword = exec rclabw</pre>
|
||||
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> application/x-abiword = abiword %f</pre>
|
||||
|
||||
<h2>Kword filter</h2>
|
||||
|
||||
<p>A new filter for <a href="http://www.kde.org/whatiskde/koffice.php/">
|
||||
kword</a> files: <a href="rclkwd">
|
||||
rclkwd</a>.
|
||||
You'll have to add the following lines in the configuration
|
||||
files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre> .kwd = application/x-kword</pre>
|
||||
<p>In <span class="filename">mimeconf</span>: </p>
|
||||
<pre> application/x-kword = exec rclkwd</pre>
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> application/x-kword = kword %f</pre>
|
||||
|
||||
|
||||
<h2>Openoffice filter</h2>
|
||||
<p>The filter script for all releases up to and including 1.7.5 had
|
||||
a bug on Debian and Ubuntu systems. You can download the <a
|
||||
href="rclsoff">corrected script</a>.</p>
|
||||
|
||||
<h2>Scribus filter</h2>
|
||||
|
||||
<p>A new filter for <a href="http://www.scribus.net/">
|
||||
Scribus</a> files: <a href="rclscribus">
|
||||
rclscribus</a>. This is only for the newer
|
||||
<em>.sla</em> files. I am willing to add support for the older
|
||||
<em>.scd</em> format if someone sends me a sample... You'll
|
||||
have to add the following lines in the configuration files:</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre> .sla = application/x-scribus</pre>
|
||||
<p>In <span class="filename">mimeconf</span>: </p>
|
||||
<pre> application/x-scribus = exec rclscribus</pre>
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> application/x-scribus = scribus %f</pre>
|
||||
|
||||
<p>Do *not* add entries for <em>.sla.gz</em>, the normal recoll
|
||||
decompression process will handle them (hopefully...).</p>
|
||||
|
||||
|
||||
<h2>Lyx filter</h2>
|
||||
|
||||
<p>A new filter for <a href="http://www.lyx.org/">
|
||||
Lyx</a> files: <a href="rcllyx">rcllyx</a>.
|
||||
This probably has quite a few issues with character encoding,
|
||||
but it's also probably better than handling lyx documents as
|
||||
text files.</p>
|
||||
|
||||
<p>In <span class="filename">mimemap</span>: </p>
|
||||
<pre> .lyx = application/x-lyx</pre>
|
||||
<p>In <span class="filename">mimeconf</span>: </p>
|
||||
<pre> application/x-lyx = exec rcllyx</pre>
|
||||
<p>In
|
||||
<span class="filename">mimeview</span>, or the <em>[view]</em>
|
||||
section of
|
||||
<span class="filename">mimeconf</span> for older recoll versions: </p>
|
||||
<pre> application/x-lyx = lyx %f</pre>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
215
website/filters/mimeconf
Normal file
215
website/filters/mimeconf
Normal file
@ -0,0 +1,215 @@
|
||||
# @(#$Id: mimeconf,v 1.41 2008/09/01 20:39:40 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
|
||||
# Recoll : associations of mime types to processing filters.
|
||||
# There are different sections for decompression, 'interning' for indexing
|
||||
# and preview, and external viewers
|
||||
|
||||
## #######################################
|
||||
# Decompression: these types need a first pass to create a temp file to
|
||||
# work with. We use a script because uncompress utilities usually work in
|
||||
# place, which is not suitable.
|
||||
#
|
||||
# The %t parameter will be substituted with the name of a temporary directory
|
||||
# by recoll. This directory is guaranteed empty when calling the filter
|
||||
#
|
||||
# The %f parameter will be substituted with the input file.
|
||||
#
|
||||
# The script (ie: rcluncomp) must output the uncompressed file name on
|
||||
# stdout.
|
||||
application/x-gzip = uncompress rcluncomp gunzip %f %t
|
||||
application/x-compress = uncompress rcluncomp gunzip %f %t
|
||||
application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
|
||||
|
||||
## ###################################
|
||||
# Filters for indexing and internal preview.
|
||||
# The "internal" filters are hardwired in the c++ code.
|
||||
# The external "exec" filters are typically scripts. They output the
|
||||
# document in simple html format, have a look at the scripts.
|
||||
[index]
|
||||
application/msword = exec rcldoc
|
||||
application/ogg = exec rclogg
|
||||
application/pdf = exec rclpdf
|
||||
application/postscript = exec rclps
|
||||
application/vnd.ms-excel = exec rclxls
|
||||
application/vnd.ms-powerpoint = exec rclppt
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||
exec rclopxml
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
||||
exec rclopxml
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||
exec rclopxml
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||
exec rclopxml
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||
exec rclopxml
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
|
||||
exec rclopxml
|
||||
application/vnd.sun.xml.calc = exec rclsoff
|
||||
application/vnd.sun.xml.calc.template = exec rclsoff
|
||||
application/vnd.sun.xml.draw = exec rclsoff
|
||||
application/vnd.sun.xml.draw.template = exec rclsoff
|
||||
application/vnd.sun.xml.impress = exec rclsoff
|
||||
application/vnd.sun.xml.impress.template = exec rclsoff
|
||||
application/vnd.sun.xml.math = exec rclsoff
|
||||
application/vnd.sun.xml.writer = exec rclsoff
|
||||
application/vnd.sun.xml.writer.global = exec rclsoff
|
||||
application/vnd.sun.xml.writer.template = exec rclsoff
|
||||
application/vnd.wordperfect = exec rclwpd
|
||||
application/x-abiword = exec rclabw
|
||||
application/x-dvi = exec rcldvi
|
||||
application/x-flac = exec rclflac
|
||||
application/x-kword = exec rclkwd
|
||||
application/x-lyx = exec rcllyx
|
||||
application/x-scribus = exec rclscribus
|
||||
application/x-tex = exec rcltex
|
||||
audio/mpeg = exec rclid3
|
||||
image/gif = exec rclimg
|
||||
image/jpeg = exec rclimg
|
||||
image/png = exec rclimg
|
||||
image/tiff = exec rclimg
|
||||
image/vnd.djvu = exec rcldjvu
|
||||
image/svg+xml = exec rclsvg
|
||||
message/rfc822 = internal
|
||||
text/html = internal
|
||||
text/plain = internal
|
||||
text/rtf = exec rclrtf
|
||||
text/x-gaim-log = exec rclgaim
|
||||
text/x-html-sidux-man = exec rclsiduxman
|
||||
text/x-mail = internal
|
||||
text/x-man = exec rclman
|
||||
|
||||
## #############################################
|
||||
# Icons to be used in the result list if required by gui config
|
||||
[icons]
|
||||
application/msword = wordprocessing
|
||||
application/ogg = sownd
|
||||
application/pdf = pdf
|
||||
application/postscript = postscript
|
||||
application/vnd.ms-excel = spreadsheet
|
||||
application/vnd.ms-powerpoint = presentation
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||
wordprocessing
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
||||
wordprocessing
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||
presentation
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||
presentation
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||
spreadsheet
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
|
||||
spreadsheet
|
||||
application/vnd.sun.xml.calc = spreadsheet
|
||||
application/vnd.sun.xml.calc.template = spreadsheet
|
||||
application/vnd.sun.xml.draw = drawing
|
||||
application/vnd.sun.xml.draw.template = drawing
|
||||
application/vnd.sun.xml.impress = presentation
|
||||
application/vnd.sun.xml.impress.template = presentation
|
||||
application/vnd.sun.xml.writer = wordprocessing
|
||||
application/vnd.sun.xml.writer.global = wordprocessing
|
||||
application/vnd.sun.xml.writer.template = wordprocessing
|
||||
application/vnd.wordperfect = wordprocessing
|
||||
application/x-abiword = wordprocessing
|
||||
application/x-dvi = document
|
||||
application/x-flac = sownd
|
||||
application/x-fsdirectory = folder
|
||||
application/x-kword = wordprocessing
|
||||
application/x-lyx = wordprocessing
|
||||
application/x-scribus = document
|
||||
application/x-tex = wordprocessing
|
||||
audio/mpeg = sownd
|
||||
image/gif = image
|
||||
image/jpeg = image
|
||||
image/png = image
|
||||
image/tiff = image
|
||||
image/vnd.djvu = document
|
||||
image/svg+xml = drawing
|
||||
message/rfc822 = message
|
||||
text/html = html
|
||||
text/plain = txt
|
||||
text/x-c = source
|
||||
text/x-html-sidux-man = sidux-book
|
||||
text/x-mail = message
|
||||
text/x-man = document
|
||||
|
||||
[categories]
|
||||
|
||||
text = \
|
||||
application/msword \
|
||||
application/pdf \
|
||||
application/postscript \
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document \
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template \
|
||||
application/vnd.sun.xml.writer \
|
||||
application/vnd.sun.xml.writer.global \
|
||||
application/vnd.sun.xml.writer.template \
|
||||
application/vnd.wordperfect \
|
||||
application/x-abiword \
|
||||
application/x-dvi \
|
||||
application/x-kword \
|
||||
application/x-lyx \
|
||||
application/x-scribus \
|
||||
application/x-tex \
|
||||
image/vnd.djvu \
|
||||
text/html \
|
||||
text/plain \
|
||||
text/rtf \
|
||||
text/x-html-sidux-man \
|
||||
text/x-man
|
||||
|
||||
spreadsheet = \
|
||||
application/vnd.ms-excel \
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet \
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template \
|
||||
application/vnd.sun.xml.calc \
|
||||
application/vnd.sun.xml.calc.template
|
||||
|
||||
presentation = application/vnd.ms-powerpoint \
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template \
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation \
|
||||
application/vnd.sun.xml.impress \
|
||||
application/vnd.sun.xml.impress.template
|
||||
|
||||
media = \
|
||||
audio/mpeg \
|
||||
application/ogg \
|
||||
application/x-flac \
|
||||
image/jpeg \
|
||||
image/png \
|
||||
image/tiff \
|
||||
image/gif \
|
||||
|
||||
message = message/rfc822 \
|
||||
text/x-gaim-log \
|
||||
text/x-mail \
|
||||
|
||||
other = application/vnd.sun.xml.draw \
|
||||
application/vnd.sun.xml.draw.template \
|
||||
application/vnd.sun.xml.math \
|
||||
application/x-fsdirectory \
|
||||
image/svg+xml \
|
||||
|
||||
|
||||
[prefixes]
|
||||
|
||||
# This allows extending the set of fields that recoll understands/searches.
|
||||
# See the manual for exact usage.
|
||||
# Important:
|
||||
# - the field names MUST be all lowercase here. They can be anycased
|
||||
# in the documents:
|
||||
# - The extension field prefixes MUST begin with X and be all UPPERCASE.
|
||||
title = S
|
||||
caption = S
|
||||
subject = S
|
||||
|
||||
author = A
|
||||
creator = A
|
||||
from = A
|
||||
|
||||
keyword = K
|
||||
tag = K
|
||||
keywords = K
|
||||
tags = K
|
||||
|
||||
# testing /example :
|
||||
recollspecialfield = XRCLSF
|
||||
132
website/filters/mimemap
Normal file
132
website/filters/mimemap
Normal file
@ -0,0 +1,132 @@
|
||||
# @(#$Id: mimemap,v 1.31 2008/08/25 16:12:16 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Recoll: associations of file name extensions to mime types
|
||||
|
||||
.txt = text/plain
|
||||
.text = text/plain
|
||||
.d = text/plain
|
||||
|
||||
# Source files. Defining them as text/x-c will enable ext viewer. If
|
||||
# text/plain they will be somewhat indexed
|
||||
.cpp = text/x-c
|
||||
.h = text/x-c
|
||||
.c = text/x-c
|
||||
.cc = text/x-c
|
||||
|
||||
.rtf = text/rtf
|
||||
|
||||
.html = text/html
|
||||
.htm = text/html
|
||||
.shtml = text/html
|
||||
.php = text/html
|
||||
|
||||
.pdf = application/pdf
|
||||
|
||||
.ps = application/postscript
|
||||
.eps = application/postscript
|
||||
.ai = application/postscript
|
||||
|
||||
.tex = application/x-tex
|
||||
.dvi = application/x-dvi
|
||||
|
||||
.djvu = image/vnd.djvu
|
||||
.svg = image/svg+xml
|
||||
|
||||
.gz = application/x-gzip
|
||||
.Z = application/x-gzip
|
||||
.bz2 = application/x-bzip2
|
||||
#.Z = application/x-compress
|
||||
|
||||
.doc = application/msword
|
||||
.ppt = application/vnd.ms-powerpoint
|
||||
.xls = application/vnd.ms-excel
|
||||
|
||||
# OpenOffice / opendocument. We handle opendocument as old openoffice files
|
||||
# for now
|
||||
.sxc = application/vnd.sun.xml.calc
|
||||
.ods = application/vnd.sun.xml.calc
|
||||
.stc = application/vnd.sun.xml.calc.template
|
||||
.sxd = application/vnd.sun.xml.draw
|
||||
.std = application/vnd.sun.xml.draw.template
|
||||
.sxi = application/vnd.sun.xml.impress
|
||||
.odp = application/vnd.sun.xml.impress
|
||||
.sti = application/vnd.sun.xml.impress.template
|
||||
.sxm = application/vnd.sun.xml.math
|
||||
.sxw = application/vnd.sun.xml.writer
|
||||
.odt = application/vnd.sun.xml.writer
|
||||
.sxg = application/vnd.sun.xml.writer.global
|
||||
.stw = application/vnd.sun.xml.writer.template
|
||||
|
||||
# ms openxml
|
||||
.docm = application/vnd.ms-word.document.macroEnabled.12
|
||||
.docx = application/vnd.openxmlformats-officedocument.wordprocessingml.document
|
||||
.dotm = application/vnd.ms-word.template.macroEnabled.12
|
||||
.dotx = application/vnd.openxmlformats-officedocument.wordprocessingml.template
|
||||
.potm = application/vnd.ms-powerpoint.template.macroEnabled.12
|
||||
.potx = application/vnd.openxmlformats-officedocument.presentationml.template
|
||||
.ppam = application/vnd.ms-powerpoint.addin.macroEnabled.12
|
||||
.ppsm = application/vnd.ms-powerpoint.slideshow.macroEnabled.12
|
||||
.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow
|
||||
.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12
|
||||
.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation
|
||||
.xlam = application/vnd.ms-excel.addin.macroEnabled.12
|
||||
.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12
|
||||
.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12
|
||||
.xlsx = application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
|
||||
.xltm = application/vnd.ms-excel.template.macroEnabled.12
|
||||
.xltx = application/vnd.openxmlformats-officedocument.spreadsheetml.template
|
||||
|
||||
.abw = application/x-abiword
|
||||
.lyx = application/x-lyx
|
||||
.sla = application/x-scribus
|
||||
.scd = application/x-scribus
|
||||
|
||||
.kwd = application/x-kword
|
||||
|
||||
.wpd = application/vnd.wordperfect
|
||||
|
||||
.rtf = text/rtf
|
||||
|
||||
.mp3 = audio/mpeg
|
||||
.flac = application/x-flac
|
||||
.ogg = application/ogg
|
||||
|
||||
.png = image/png
|
||||
.jpg = image/jpeg
|
||||
.jpeg = image/jpeg
|
||||
.gif = image/gif
|
||||
.tiff = image/tiff
|
||||
.tif = image/tiff
|
||||
|
||||
# A list of stuff that we don't want to touch at all (for now). Having the
|
||||
# suffixes listed in there speeds up things quite a lot by avoiding
|
||||
# unneeded decompression or 'file' calls. File names still get indexed if
|
||||
# indexallfilenames is set
|
||||
recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \
|
||||
.m4 .tcl .js .sh .pl .awk \
|
||||
.o .lib .dll .a \
|
||||
.dat .bak .rdf .log .db .ini .msf .pid \
|
||||
.gnm .gnumeric \
|
||||
.gif .bmp .xpm \
|
||||
,v ~ #
|
||||
|
||||
# Special handling of .txt files inside ~/.gaim directory
|
||||
[~/.gaim]
|
||||
.txt = text/x-gaim-log
|
||||
|
||||
# Special handling of sidux manual menu system
|
||||
[/usr/share/sidux-manual]
|
||||
.htm = text/x-html-sidux-man
|
||||
.html = text/x-html-sidux-man
|
||||
|
||||
# Manual files. You may want to adjust the location for your system
|
||||
# We can't use the default text/troff type because this doesn't say
|
||||
# what macro set to use (groff -man)
|
||||
[/usr/share/man]
|
||||
.1 = text/x-man
|
||||
.2 = text/x-man
|
||||
.3 = text/x-man
|
||||
.4 = text/x-man
|
||||
.5 = text/x-man
|
||||
.6 = text/x-man
|
||||
.7 = text/x-man
|
||||
.8 = text/x-man
|
||||
61
website/filters/mimeview
Normal file
61
website/filters/mimeview
Normal file
@ -0,0 +1,61 @@
|
||||
# @(#$Id: mimeview,v 1.15 2008/09/01 20:39:40 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
|
||||
## ##########################################
|
||||
# External viewers, launched by the recoll GUI when you click on a result
|
||||
# 'edit' link
|
||||
|
||||
[view]
|
||||
# Pseudo entry used if the 'use desktop' preference is set in the GUI
|
||||
application/x-all = xdg-open %f
|
||||
|
||||
application/x-kword = kword %f
|
||||
application/x-abiword = abiword %f
|
||||
|
||||
application/msword = openoffice %f
|
||||
application/ogg = xmms %f
|
||||
application/pdf = xpdf %f
|
||||
application/postscript = gv %f
|
||||
application/vnd.ms-excel = openoffice %f
|
||||
application/vnd.ms-powerpoint = openoffice %f
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||
openoffice %f
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
||||
openoffice %f
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||
openoffice %f
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||
openoffice %f
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||
openoffice %f
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
|
||||
openoffice %f
|
||||
application/vnd.sun.xml.calc = openoffice %f
|
||||
application/vnd.sun.xml.calc.template = openoffice %f
|
||||
application/vnd.sun.xml.draw = openoffice %f
|
||||
application/vnd.sun.xml.draw.template = openoffice %f
|
||||
application/vnd.sun.xml.impress = openoffice %f
|
||||
application/vnd.sun.xml.impress.template = openoffice %f
|
||||
application/vnd.sun.xml.math = openoffice %f
|
||||
application/vnd.sun.xml.writer = openoffice %f
|
||||
application/vnd.sun.xml.writer.global = openoffice %f
|
||||
application/vnd.sun.xml.writer.template = openoffice %f
|
||||
application/vnd.wordperfect = openoffice %f
|
||||
application/x-fsdirectory = rox %f
|
||||
application/x-dvi = xdvi %f
|
||||
application/x-flac = xmms %f
|
||||
application/x-lyx = lyx %f
|
||||
application/x-scribus = scribus %f
|
||||
application/x-tex = gnuclient -q %f
|
||||
audio/mpeg = xmms %f
|
||||
image/jpeg = xv %f
|
||||
image/png = xv %f
|
||||
image/tiff = xv %f
|
||||
image/gif = xv %f
|
||||
image/svg+xml = inkview %f
|
||||
image/vnd.djvu = djview %f
|
||||
# Or firefox -remote "openFile(%u)"
|
||||
text/html = firefox %u
|
||||
text/plain = gnuclient -q %f
|
||||
text/x-c = gnuclient -q %f
|
||||
text/x-html-sidux-man = konqueror %f
|
||||
#text/x-html-sidux-man = iceweasel %f
|
||||
175
website/filters/rclabw
Executable file
175
website/filters/rclabw
Executable file
@ -0,0 +1,175 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclabw,v 1.2 2007/06/15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from an abiword file
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclabw"
|
||||
filetype=abiword
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
# Report a fatal filter error and terminate.
# The arguments are written to stdout after the RECFILTERROR keyword, which
# is the form the calling program interprets, and are repeated on stderr
# prefixed with the filter name ($progname). The script then exits with
# status 1, so this function never returns.
senderror()
{
    # Machine-readable report for the caller (stdout)
    echo RECFILTERROR $*
    # Human-readable copy on stderr, just in case
    { echo ":2:${progname}::: $*"; } >&2
    exit 1
}
|
||||
|
||||
# Tell whether a helper command can be executed.
#   $1: command name. If it contains a '/', it is tested directly as a
#       path; otherwise every directory listed in $PATH is searched.
# Returns 0 when an executable entry is found, 1 otherwise.
# Side effects: sets the global variables 'cmd' (to $1), 'oldifs' and 'd'.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        # Quoted so that a path containing whitespace is tested as one word
        if test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on ':' into the positional parameters. IFS is
        # restored immediately so that later expansions are not affected.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps each directory intact even if it contains whitespace
        for d in "$@"; do
            # Empty PATH components are ignored
            test -n "$d" || continue
            test -x "$d/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Verify that every helper command named in the arguments is available.
# Each word is checked with iscmd; for the first one that is not found,
# senderror is called with HELPERNOTFOUND and the command name.
checkcmds()
{
    for cmd in $*; do
        # No placeholder assignment is needed for the success case
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# show help message
# Exactly one argument is expected. The two conditions are separate tests
# because the -o/-a operators of test are ambiguous when an operand looks
# like an operator.
if [ $# -ne 1 ] || [ "$1" = "--help" ]
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# check the input file existence (may be '-' for stdin)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds iconv sed
|
||||
|
||||
# check the input file existence
if test ! -f "$infile"
then
    printf '%s: %s: no such file\n' "$progname" "$infile"
    exit 1
fi

# Extract the value of the encoding attribute from the XML declaration line.
# The input file name is quoted so that paths containing whitespace work,
# as in the other sed invocations of this script.
encoding=`sed -e '/<?xml version=/s/"?>$//' \
    -e '/^<?xml version=/s/.*encoding="//p;D;q' \
    -e D \
    < "$infile"`
# Fall back to UTF-8 when nothing was extracted
if test "X$encoding" = X ; then encoding=UTF-8;fi
|
||||
|
||||
# Note: there can be newlines inside the description field, we don't want
|
||||
# them... Have to use 2 different selectors for the single-line and
|
||||
# multiple-line cases because of the generic tag end (</m> for all meta
|
||||
# tags)
|
||||
descsedprog='
|
||||
/<m key="dc.description">\([^<]*\)<\/m>/ {
|
||||
s//\1/
|
||||
p
|
||||
q
|
||||
}
|
||||
/<m key="dc.description">/,/<\/m>/ {
|
||||
s!.*<m key="dc.description">!!
|
||||
s!</m>.*!!
|
||||
H
|
||||
}
|
||||
${
|
||||
g
|
||||
s/\n/ /g
|
||||
p
|
||||
}
|
||||
'
|
||||
|
||||
description=`sed -n -e "$descsedprog" < "$infile"`
|
||||
#echo description: "$description"
|
||||
|
||||
# Set program for the single line meta elements.
# $1: value of the key attribute of the <m> element to extract.
# Sets the global metasedprog to a sed program (to be run with sed -n)
# which, for each line holding <m key="$1">, prints the text following the
# tag up to the next '<', with double quotes changed to single quotes.
setmetasedprog() {
    metasedprog='/<m key="'"$1"'">/{
s/.*<m key="'"$1"'">\([^<]*\).*/\1/
'"s/\"/'/g"'
p
}'
}
|
||||
|
||||
# Extract the single-line meta fields
setmetasedprog dc.subject
subject=`sed -n -e "$metasedprog" "$infile"`
#echo subject: "$subject"

setmetasedprog dc.title
title=`sed -n -e "$metasedprog" "$infile"`
#echo title: "$title"

setmetasedprog abiword.keywords
keywords=`sed -n -e "$metasedprog" "$infile"`
#echo keywords: "$keywords"

setmetasedprog dc.creator
creator=`sed -n -e "$metasedprog" "$infile"`
#echo creator: "$creator"

# Note: next expr supposes that paragraphs are always all by themselves on
# a single line in the xml (no multiple <p> per line, no embedded newlines
# in text). Every tag on a paragraph line is replaced by a space.
contentsedprog='
/<p[ >]/{
s/<[^>]*>/ /g
p
}
'
content=`sed -n -e "$contentsedprog" "$infile"`
#echo content: "$content"

# Output the result, converted from the document encoding to UTF-8.
# printf is used rather than echo because some shells' echo interprets
# backslash sequences, which would alter document text containing them.
(printf '%s %s %s\n' '<html><head><title>' "$title" '</title>'
 printf '%s\n' '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
 printf '%s %s %s\n' '<meta name="description" content="' "$description $subject" '">'
 printf '%s %s %s\n' '<meta name="keywords" content="' "$keywords" '">'
 printf '%s %s %s\n' '<meta name="author" content="' "$creator" '">'
 printf '%s\n' '</head><body><pre>'
 printf '%s\n' "$content"
 printf '%s\n' '</pre></body></html>') \
    | iconv -f "$encoding" -t UTF-8 -c -s

# exit normally
exit 0
|
||||
95
website/filters/rclimg
Executable file
95
website/filters/rclimg
Executable file
@ -0,0 +1,95 @@
|
||||
#! /usr/bin/perl -w
|
||||
# @(#$Id: rclimg,v 1.2 2007/10/02 13:56:42 dockes Exp $ (C) 2007 Cedric Scott
|
||||
#######################################################
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################################
|
||||
|
||||
#
|
||||
# rclimg: extract image tags with exiftool and convert the data to html for
|
||||
# recoll indexing.
|
||||
#
|
||||
|
||||
#
|
||||
# maps image file tags to xapian tags
# The keys are used as case-insensitive regular expressions which are
# matched against the image tag names (see xapianTag below).
#
$tagMap = {
    'subject'          => 'subject',
    'title'            => 'title',
    'headline'         => 'title',
    'caption'          => 'caption',
    'caption-abstract' => 'caption',
    'author'           => 'author',
    'creator'          => 'creator',
    'from'             => 'from',
    'keywords'         => 'keywords',
    'keyword'          => 'keyword',
    'tag'              => 'tag',
};

# set to non-zero if tags which map to xapian tags are to output
# in the body as well as the header
#
$headAndBody = 1;

# xapianTag
# returns a xapian tag to be used for this tag, or undef if no entry of
# $tagMap matches the image tag name.
#
# The patterns are tried longest first (alphabetically among patterns of
# the same length), so that the result does not depend on the hash order
# when several of them match ('keywords' wins over 'keyword').
# This iterates over a key list instead of using each(): the each()
# iterator is not reset when a loop is left early, so the following call
# would resume in the middle of the hash and could miss a matching entry.
#
sub xapianTag {
    my $imgtag = shift;
    my @patterns =
        sort { length($b) <=> length($a) || $a cmp $b } keys %{$tagMap};
    foreach my $tagre (@patterns) {
        return $tagMap->{$tagre} if $imgtag =~ /$tagre/i;
    }
    return undef;
}
|
||||
|
||||
#
|
||||
# start here
|
||||
#
|
||||
use Image::ExifTool qw(:Public);

# htmlEscape
# returns its argument with the characters which are special in HTML
# text and attribute values (& < > ") replaced by entities, so that a tag
# value cannot break the markup of the generated document. An undefined
# argument gives an empty string.
#
sub htmlEscape {
    my $text = shift;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# The image file is the first argument. No argument or an empty one means
# standard input.
$imageFile = shift;
$imageFile = '-' if !defined($imageFile) || $imageFile eq '';

# 3-argument open for real files: the name is never interpreted as an open
# mode or as a command to run. '-' still reads standard input, as it did
# with the 2-argument form.
$opened = ( $imageFile eq '-' )
    ? open( IMGF, '<&STDIN' )
    : open( IMGF, '<', $imageFile );
unless ($opened) {
    print STDERR "$0: can't open file $imageFile\n";
    exit(1);    # file doesn't exist or can't be read
}
$info = ImageInfo(\*IMGF);
die unless $info;

# $fields: [xapian tag, value] pairs for the header meta elements
# $other:  [image tag, value] pairs listed in the body
$fields       = [];
$other        = [];
$titleHtmlTag = "";
foreach my $tagname ( sort keys %{$info} ) {
    my $xapiantag = xapianTag($tagname);
    my $value     = htmlEscape( $info->{$tagname} );
    if ( defined $xapiantag ) {
        push @{$fields}, [ $xapiantag, $value ];
        $titleHtmlTag = "<title>$value</title>" if $xapiantag eq 'title';
        push @{$other}, [ htmlEscape($tagname), $value ] if $headAndBody;
    }
    else {
        push @{$other}, [ htmlEscape($tagname), $value ];
    }
}

# Header: title (if any) and one meta element per mapped tag
print "<html>\n<head>\n$titleHtmlTag\n";
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n";
foreach my $tagpair ( @{$fields} ) {
    my ( $name, $value ) = @{$tagpair};
    print "<meta name=\"$name\" content=\"$value\">\n";
}

# Body: one "name : value" line per listed tag
print "</head><body>\n";
foreach my $tagpair ( @{$other} ) {
    my ( $name, $value ) = @{$tagpair};
    printf "%30s : %s<br>\n", $name, $value;
}
print "</body>\n</html>\n";
|
||||
204
website/filters/rclkwd
Executable file
204
website/filters/rclkwd
Executable file
@ -0,0 +1,204 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclkwd,v 1.1 2007/06/08 14:01:30 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# rclkword
|
||||
# Extract text from a kword file
|
||||
#
|
||||
#================================================================
|
||||
|
||||
# Run every tool in the C locale; progname and filetype are used in the
# messages below
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rclkwd"
filetype=kword
|
||||
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller:
# "RECFILTERROR <arguments>" on stdout, and a copy prefixed with the
# program name on stderr just in case. Then exit with status 1.
# The arguments are printed as given: "$*" is quoted so that a file name
# is neither glob-expanded nor has its blanks collapsed, and printf is
# used because some shells' echo interprets backslash sequences.
senderror()
{
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a command is available.
# $1: command name. A name containing a slash is tested as given, any
#     other name is looked up in every directory listed in $PATH.
# Returns 0 if an executable regular file was found, 1 otherwise.
# Sets the globals cmd, d and oldifs.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -f "$cmd" && test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on colons only, then put IFS back
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps directories containing blanks in one piece; an empty
        # PATH element stands for the current directory
        for d in "$@"
        do
            test -f "${d:-.}/$cmd" && test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program named in the arguments can be found by
# iscmd. The first one that cannot is reported with
# "senderror HELPERNOTFOUND <name>", which ends the filter.
checkcmds()
{
    # "$@" keeps each argument in one piece (no re-splitting, no globbing)
    for cmd in "$@"
    do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# Exactly one argument is expected; anything else, or --help, gets the
# usage message
if [ $# -ne 1 ] || [ "$1" = "--help" ]
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# The input must be a regular file; '-' (stdin) is let through here
[ "X$infile" = X- ] || [ -f "$infile" ] || senderror INPUTNOSUCHFILE "$infile"

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds awk unzip gunzip tar

# check the input file existence (unlike the common code above, '-' is
# not accepted here)
if test ! -f "$infile"
then
    printf '%s: %s: no such file\n' "$progname" "$infile"
    exit 1
fi

# We need a temporary directory: under $RECOLL_TMPDIR if set, else under
# $TMPDIR if set, else under /tmp
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
tmpdir=$ttdir/rclkwd_tmp$$
# Quoted: the temporary location may contain blanks
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclkwdtmp" || exit 1
|
||||
|
||||
# Remove the temporary directory tree created above.
cleanup()
{
    # Nothing to do if the directory was never created or is already gone
    test -n "$tmpdir" && test -d "$tmpdir" || return 0
    # Note that we're using a constant part (rclkwdtmp), that hopefully
    # guarantees that we can't do big mistakes here.
    rm -rf "$tmpdir/rclkwdtmp"
    rmdir "$tmpdir"
}

# Clean up whenever the script exits. A signal ends the script (which then
# runs the EXIT trap) instead of cleaning up and carrying on without the
# work directory.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM
|
||||
|
||||
# Old kwd files are gzip/tar archives. Newer ones are zip archives.
# Nothing may be printed on stdout here: it would end up in front of the
# HTML output.
if file "$infile" | grep -qi gzip ; then
    # Uncompress and untar the input file into the work directory
    gunzip < "$infile" | (cd "$tmpdir/rclkwdtmp" && tar xf -)
else
    # Unzip the input file into the work directory
    unzip -q -d "$tmpdir/rclkwdtmp" "$infile"
fi
# The rest of the script uses paths relative to the unpacked document
cd "$tmpdir/rclkwdtmp" || exit 1
|
||||
|
||||
metafile=documentinfo.xml
contentfile=maindoc.xml

# Start empty, so that nothing is inherited from the environment when the
# document has no metadata file
abstract=
subject=
title=
keywords=

if test -f $metafile ; then

    # Note: there can be newlines inside the description field, we don't
    # want them...
    abssedprog='/<abstract>/,/<\/abstract>/{
s!.*<abstract>!!
s!</abstract>.*!!
p
}
'
    # All these values end up inside content="..." attributes: every
    # double quote is changed to a single one (g flag), not just the first
    # one of each line.
    abstract=`sed -n -e "$abssedprog" < $metafile | tr '\n' ' ' | \
        sed -e '1s/<!\[CDATA\[//' -e 's/\]\]>//' -e "s/\"/'/g"`
    subject=`sed -e "s/\"/'/g" -e 's/.*<subject>\([^<]*\).*/\1/p;d' \
        < $metafile`
    title=`sed -e "s/\"/'/g" -e 's/.*<title>\([^<]*\).*/\1/p;d' \
        < $metafile | tr '\n' ' '`
    keywords=`sed -e "s/\"/'/g" -e 's/.*<keyword>\([^<]*\).*/\1/p;d' \
        < $metafile`
fi

# Note: next expr inserts a newline at each end of paragraph (for preview),
# replaces every tag by a space, then drops the lines left with nothing
# but spaces
content="`sed -e 's!</TEXT>!\\
!g' -e 's/<[^>]*>/ /g' < $contentfile | sed -e '/^[ ]*$/d'`"
|
||||
|
||||
#echo abstract "$abstract"
|
||||
#echo subject "$subject"
|
||||
#echo title "$title"
|
||||
#echo keywords "$keywords"
|
||||
#echo content "$content"
|
||||
|
||||
# output the result
# printf is used rather than echo because some shells' echo interprets
# backslash sequences, which would alter document text containing them.
printf '%s\n' '<html><head>'
printf '%s %s %s\n' '<title>' "$title" '</title>'
printf '%s\n' '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
printf '%s %s %s\n' '<meta name="abstract" content="' "$abstract $subject" '">'
printf '%s %s %s\n' '<meta name="keywords" content="' "$keywords" '">'
printf '%s\n' '</head><body><p>'

# NOTE(review): the two sed substitutions below replace a character by
# itself. Presumably they originally decoded the &apos; and &quot;
# entities - confirm against the upstream script.
#
# The awk program joins words which were hyphenated at an end of line: the
# last word of a line ending with a hyphen is carried over, without the
# hyphen, to the beginning of the next line. A line made of a single form
# feed becomes a horizontal rule.
#
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
# is an awk program
printf '%s\n' "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\
awk 'BEGIN'\
' {
    cont = ""
}
{
    $0 = cont $0
    cont = ""

    if ($0 ~ /[-]$/) {
        # Note : soft-hyphen is iso8859 0xad
        # Break at last whitespace. A line without any whitespace is
        # left alone instead of being dropped.
        if (match($0, "[ \t][^ \t]+$")) {
            line = substr($0, 0, RSTART)
            cont = substr($0, RSTART, RLENGTH-1)
            $0 = line
        }
    }

    if($0 == "\f") {
        print "</p>\n<hr>\n<p>"
        next
    }

    print $0 "<br>"
}
END {
    # Text carried over from the last line has nothing to be joined
    # with: print it with its hyphen back
    if (cont != "")
        print cont "-<br>"
    printf("</p></body></html>\n");
}' | iconv -f UTF-8 -t UTF-8 -c -s

cd /
# exit normally
exit 0
|
||||
195
website/filters/rcllyx
Executable file
195
website/filters/rcllyx
Executable file
@ -0,0 +1,195 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rcllyx,v 1.4 2007/01/23 07:23:12 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# There may still be code from Estraier in here:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# rcllyx
|
||||
# Convert a lyx file to recoll HTML.
|
||||
#
|
||||
# We use lyx --export. It was suggested to use untex, but it doesn't give
|
||||
# good results on raw lyx (of course, this is not TeX), and exporting to
|
||||
# LaTex then using untex doesn't look nice when we can use the native lyx
|
||||
# text export.
|
||||
# The character encoding of the exported text is defined by the
|
||||
# \inputencoding directive in the lyx file header and, in quite an obscure
|
||||
# way, by the \language parameter. We use a heuristic to divine the output
|
||||
# text encoding and it is guaranteed not to work in all cases. Trials using
|
||||
# an intermediary dvi, pdf or ps file gave worse results. This needs
|
||||
# improvement. It doesn't even take into account the fact that the language
|
||||
# can change inside the doc (does this change the encoding or not ?). To be
|
||||
# frank, this is not entirely my fault, the lyx format is a joke.
|
||||
#
|
||||
# As there is unfortunately no way to define the output file name, we have
|
||||
# to use a temporary directory and link the input file in there.
|
||||
|
||||
# Run every tool in the C locale
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rcllyx"

# Exactly one argument is expected; anything else, or --help, gets the
# usage message
if [ $# -ne 1 ] || [ "$1" = "--help" ]
then
    printf 'Extract lyx text as basic HTML.\n'
    printf 'Usage: %s [infile]\n' "$progname"
    exit 1
fi

infile="$1"

# The input must be an existing regular file
[ -f "$infile" ] || {
    printf '%s: %s: no such file\n' "$progname" "$infile"
    exit 1
}
|
||||
|
||||
# Tell whether a command is available.
# $1: command name. A name containing a slash is tested as given, any
#     other name is looked up in every directory listed in $PATH.
# Returns 0 if an executable regular file was found, 1 otherwise.
# Sets the globals cmd, d and oldifs.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -f "$cmd" && test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on colons only, then put IFS back
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps directories containing blanks in one piece; an empty
        # PATH element stands for the current directory
        for d in "$@"
        do
            test -f "${d:-.}/$cmd" && test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
# Check that every helper program named in the arguments can be found by
# iscmd. For the first one that cannot, print "<name> not found" on stderr
# and exit with status 1.
checkcmds()
{
    # "$@" keeps each argument in one piece (no re-splitting, no globbing)
    for cmd in "$@"
    do
        iscmd "$cmd" && continue
        echo "$cmd not found" 1>&2
        exit 1
    done
}
|
||||
|
||||
checkcmds lyx iconv

# We need a temporary directory: under $RECOLL_TMPDIR if set, else under
# $TMPDIR if set, else under /tmp
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}

tmpdir=$ttdir/rcllyx_tmp$$
# Quoted: the temporary location may contain blanks
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rcllyxtmp" || exit 1
|
||||
|
||||
# Remove the temporary directory tree created above.
cleanup()
{
    # Nothing to do if the directory was never created or is already gone
    test -n "$tmpdir" && test -d "$tmpdir" || return 0
    # Note that we're using a constant part (rcllyxtmp), that hopefully
    # guarantees that we can't do big mistakes here.
    rm -rf "$tmpdir/rcllyxtmp"
    rmdir "$tmpdir"
}

# Clean up whenever the script exits. A signal ends the script (which then
# runs the EXIT trap) instead of cleaning up and carrying on without the
# work directory.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM
|
||||
|
||||
workdir=$tmpdir/rcllyxtmp
# The input file is linked into the work directory. The link target must
# be an absolute path, as a relative one (with or without a directory
# part) would be resolved from the work directory.
case "$infile" in
/*) ;;
*) infile=`pwd`/$infile;;
esac

binfile=`basename "$infile"`
ln -s "$infile" "$workdir/$binfile" || exit 1
lyxfile=$workdir/$binfile
# The text export is expected next to the link, with .lyx changed to .txt
textfile=$workdir/`basename "$binfile" .lyx`.txt

#echo binfile: $binfile;echo lyxfile: $lyxfile ; ls -l $lyxfile; echo textfile: $textfile

# Run lyx --export
lyx --export text "$lyxfile"
|
||||
|
||||
# Charset and language
# Start empty, so that nothing is inherited from the environment
format=
charset=
lang=

# Each value is the second word of the matching header line
formatline=`grep -E '^\\\lyxformat ' "$lyxfile"`
if test -n "$formatline" ; then
    set -- $formatline
    format=$2
fi
charsetline=`grep -E '^\\\inputencoding ' "$lyxfile"`
if test -n "$charsetline" ; then
    set -- $charsetline
    charset=$2
fi
langline=`grep -E '^\\\language ' "$lyxfile"`
if test -n "$langline" ; then
    set -- $langline
    lang=$2
fi
#echo format: [$format] charset: [$charset] lang [$lang]

# The format is only compared when one was found: test -ge fails on an
# empty value
if test -n "$format" && test "$format" -ge 249 ; then
    charset=utf-8
else
    # try to guess the charset from the language: this is in no way guaranteed
    # to work, the logic has built-in inconsistencies even beyond the numerous
    # external ones (what if the ukrainian writer prefers koi8-r ?). This is a
    # joke.
    # Note that the first matching pattern wins, so that a language listed
    # several times gets the charset of its first occurrence.
    if test -z "$charset" -o "$charset" = default -o "$charset" = auto ; then
        case "$lang" in
        american|afrikaans|basque|catalan|danish|dutch|english|faeroese|finnish|french|galician|german|icelandic|irish|italian|norwegian|portuguese|spanish|swedish)
            charset=iso-8859-1;;
        czech|german|hungarian|polish|romanian|croatian|slovak|slovene)
            charset=iso-8859-2;;
        esperanto|galician|maltese|Turkish)
            charset=iso-8859-3;;
        estonian|latvian|lithuanian)
            charset=iso-8859-4;;
        bulgarian|byelorussian|macedonian|russian|serbian|ukrainian)
            charset=iso-8859-5;;
        arabic)
            charset=iso-8859-6;;
        greek)
            charset=iso-8859-7;;
        hebrew)
            charset=iso-8859-8;;
        #ISO-8859-9 - Latin 5 Same as 8859-1 except for Turkish instead of
        #Icelandic. ? What is one to do :)
        #ISO-8859-10 - Latin 6
        lappish|nordic|eskimo|inuit|sami)
            charset=iso-8859-10;;
        albanian|german|english|basque|breton|catalan|danish|spanish|estonian|esthonian|faeroese|faroese|finnish|french|frisian|friesian|scottish|goidelic|irish|gaelic|galician|welsh|greenlandic|inuit|icelandic|italian|latin|dutch|norvegian|portuguese|romansch|romansh|friulian|ladin|swedish)
            charset=iso-8859-15;;
        *)
            charset=iso-8859-1;;
        esac
    fi
fi

# Command used to turn the exported text into UTF-8
if test -n "$charset" ; then
    inputcmd="iconv -f $charset -t UTF-8 -c -s"
else
    inputcmd=cat
fi
#echo inputcmd: [$inputcmd]
|
||||
|
||||
# Output the document: fixed header, exported text converted to UTF-8
# inside a <pre> element, fixed trailer.
# No title is extracted from lyx files, so the title element is empty. It
# used to be filled from a variable which this script never sets.
cat <<'EOF'
<html>
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
</head>
<body>
<pre>
EOF

# $inputcmd is a command followed by its arguments: it must stay unquoted
$inputcmd < "$textfile"

cat <<'EOF'
</pre>
</body>
</html>
EOF
|
||||
245
website/filters/rclopxml
Executable file
245
website/filters/rclopxml
Executable file
@ -0,0 +1,245 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclopxml,v 1.2 2008/09/01 17:31:47 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
#================================================================
|
||||
# rcldocx
|
||||
# Extract text from an openxml msword file (will be extended for spreadsheets)
|
||||
# TODO: Also process docProps/core.xml for attributes, and word/endnotes.xml
|
||||
#================================================================
|
||||
|
||||
# Run every tool in the C locale; progname and filetype are used in the
# messages below
LANG=C
LC_ALL=C
export LANG LC_ALL
progname=rclopxml
filetype=openxml
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller:
# "RECFILTERROR <arguments>" on stdout, and a copy prefixed with the
# program name on stderr just in case. Then exit with status 1.
# The arguments are printed as given: "$*" is quoted so that a file name
# is neither glob-expanded nor has its blanks collapsed, and printf is
# used because some shells' echo interprets backslash sequences.
senderror()
{
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a command is available.
# $1: command name. A name containing a slash is tested as given, any
#     other name is looked up in every directory listed in $PATH.
# Returns 0 if an executable regular file was found, 1 otherwise.
# Sets the globals cmd, d and oldifs.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -f "$cmd" && test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on colons only, then put IFS back
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps directories containing blanks in one piece; an empty
        # PATH element stands for the current directory
        for d in "$@"
        do
            test -f "${d:-.}/$cmd" && test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Check that every helper program named in the arguments can be found by
# iscmd. The first one that cannot is reported with
# "senderror HELPERNOTFOUND <name>", which ends the filter.
checkcmds()
{
    # "$@" keeps each argument in one piece (no re-splitting, no globbing)
    for cmd in "$@"
    do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# Exactly one argument is expected; anything else, or --help, gets the
# usage message
if [ $# -ne 1 ] || [ "$1" = "--help" ]
then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# The input must be a regular file; '-' (stdin) is let through here
[ "X$infile" = X- ] || [ -f "$infile" ] || senderror INPUTNOSUCHFILE "$infile"

# protect access to our temp files and directories
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc unzip

# check the input file existence (unlike the common code above, '-' is
# not accepted here)
if test ! -f "$infile"
then
    printf '%s: %s: no such file\n' "$progname" "$infile"
    exit 1
fi

# We need a temporary directory: under $RECOLL_TMPDIR if set, else under
# $TMPDIR if set, else under /tmp
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
tmpdir=$ttdir/rclopxml_tmp$$
# Quoted: the temporary location may contain blanks
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclopxmltmp" || exit 1
|
||||
|
||||
# Remove the temporary directory tree created above.
cleanup()
{
    # Nothing to do if the directory was never created or is already gone
    test -n "$tmpdir" && test -d "$tmpdir" || return 0
    # Note that we're using a constant part (rclopxmltmp), that hopefully
    # guarantees that we can't do big mistakes here.
    rm -rf "$tmpdir/rclopxmltmp"
    rmdir "$tmpdir"
}

# Clean up whenever the script exits. A signal ends the script (which then
# runs the EXIT trap) instead of cleaning up and carrying on without the
# work directory.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM
|
||||
|
||||
# Unzip the input file and change to the unzipped directory. The rest of
# the script uses paths relative to it, so give up if we can't get there.
unzip -q -d "$tmpdir/rclopxmltmp" "$infile"
cd "$tmpdir/rclopxmltmp" || exit 1

echo '<html>
<head>'

# Header: the stylesheet below (read from stdin) is applied to
# docProps/core.xml. It outputs the Content-Type meta element, an "author"
# meta element for dc:creator and a "date" one for dcterms:modified, and
# nothing for the other elements.
xsltproc - docProps/core.xml <<EOF
<?xml version="1.0"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<!-- <xsl:output method="text"/> -->
<xsl:output omit-xml-declaration="yes"/>

<xsl:template match="cp:coreProperties">
<xsl:text> </xsl:text>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<xsl:text> </xsl:text>
<xsl:apply-templates/>
</xsl:template>

<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">
<!-- <xsl:value-of select="name()"/> pour sortir tous les meta avec
le meme nom que dans le xml (si on devenait dc-natif) -->
<xsl:text>author</xsl:text>
</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta>
<xsl:text> </xsl:text>
</xsl:template>

<xsl:template match="dcterms:modified">
<meta>
<xsl:attribute name="name">
<xsl:text>date</xsl:text>
</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta>
<xsl:text> </xsl:text>
</xsl:template>

<xsl:template match="*">
</xsl:template>

</xsl:stylesheet>
EOF

echo '</head>
<body>'
|
||||
|
||||
# Choose what to extract according to the kind of document found in the
# archive:
#  filenames:     XML file(s) holding the text
#  tagmatch:      element whose text is output as a paragraph
#  xmlns_decls:   attributes for the xsl:stylesheet element
#  moretemplates: additional templates (may be empty)
# All start empty, so that nothing is inherited from the environment.
filenames=''
tagmatch=''
xmlns_decls=''
moretemplates=''
if test -f word/document.xml ; then
    filenames=word/document.xml
    tagmatch="w:p"
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
'

elif test -f xl/sharedStrings.xml ; then
    filenames=xl/sharedStrings.xml
    tagmatch='x:t'
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
'

elif test -f ppt/slides/slide1.xml ; then
    filenames=`echo ppt/slides/slide*.xml`
    tagmatch='a:t'
    xmlns_decls='
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
'
    # I want to suppress text output for all except a:t, don't know how to do it
    # help ! At least get rid of these:
    moretemplates='
<xsl:template match="p:attrName">
</xsl:template>
'
else
    # None of the known document kinds
    exit 1
fi
|
||||
|
||||
|
||||
# Body: apply a stylesheet built from the values chosen above to each
# text file. It outputs a <div> per file, holding a <p> for each element
# matching $tagmatch.
# $filenames is split on purpose: it may hold several names.
for filename in $filenames;do
    xsltproc - "$filename" <<EOF
<?xml version="1.0"?>
<xsl:stylesheet $xmlns_decls >

<xsl:output omit-xml-declaration="yes"/>

<xsl:template match="/">
<div>
<xsl:apply-templates/>
</div>
</xsl:template>

<xsl:template match="$tagmatch">
<p>
<xsl:value-of select="."/>
</p>
</xsl:template>

$moretemplates

</xsl:stylesheet>
EOF
done

# Close the body element opened after the header, then the document
echo '</body>
</html>'
|
||||
151
website/filters/rclscribus
Executable file
151
website/filters/rclscribus
Executable file
@ -0,0 +1,151 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclscribus,v 1.1 2007/01/22 16:32:55 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# There may still be code from Estraier in here:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# rclscribus
|
||||
# Convert a scribus file to recoll HTML. This only handles the newer .sla
|
||||
# files until I can have a look at an older .scd.
|
||||
#
|
||||
# We just hack into the scribus XML, taking advantage that the tag of
|
||||
# interest is apparently always output on a single line.
|
||||
# The text seems to be found in attribute CH of tag ITEXT, it is utf-8
|
||||
|
||||
# Run every tool in the C locale
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rclscribus"

# Exactly one argument is expected; anything else, or --help, gets the
# usage message
if [ $# -ne 1 ] || [ "$1" = "--help" ]
then
    printf 'Extract scribus text as basic HTML.\n'
    printf 'Usage: %s [infile]\n' "$progname"
    exit 1
fi

infile="$1"

# The input must be an existing regular file
[ -f "$infile" ] || {
    printf '%s: %s: no such file\n' "$progname" "$infile"
    exit 1
}
|
||||
|
||||
# Tell whether a command is available.
# $1: command name. A name containing a slash is tested as given, any
#     other name is looked up in every directory listed in $PATH.
# Returns 0 if an executable regular file was found, 1 otherwise.
# Sets the globals cmd, d and oldifs.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -f "$cmd" && test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split $PATH on colons only, then put IFS back
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        # "$@" keeps directories containing blanks in one piece; an empty
        # PATH element stands for the current directory
        for d in "$@"
        do
            test -f "${d:-.}/$cmd" && test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
# Check that every helper program named in the arguments can be found by
# iscmd. For the first one that cannot, print "<name> not found" on stderr
# and exit with status 1.
checkcmds()
{
    # "$@" keeps each argument in one piece (no re-splitting, no globbing)
    for cmd in "$@"
    do
        iscmd "$cmd" && continue
        echo "$cmd not found" 1>&2
        exit 1
    done
}
|
||||
checkcmds grep awk sed

# A small sed program to join lines where they are broken inside an
# attribute value. The idea is that all scribus tags are apparently on one
# line except when there are embedded newlines in an attribute like
# 'comments'. The first version of the sed script joins a line which does
# not end with > with the next one. It doesn't guard against an embedded
# '>'. The second one joins a line not beginning with '<' with the
# previous one. It is much slower for some reason.
sedjoinprog=':a
/[^>] *$/N; s/\n/ /; ta'
#sedjoinprog1=':a
#$!N;/^ *[^<]/s/\n/ /;ta
#P;D'

# Print the value of an attribute of the <DOCUMENT> tag of $infile.
# $1: attribute name, as written in the file (COMMENTS, TITLE...)
# Prints nothing if the attribute is absent or empty. No newline is
# added after the value.
docattr()
{
    sed -e "$sedjoinprog" < "$infile" | \
    awk -v attr="$1" '
    /<DOCUMENT / {
        if (match($0, " " attr "=\"[^\"]+")) {
            # Skip the blank, the name, the equal sign and the quote
            skip = length(attr) + 3
            printf("%s", substr($0, RSTART + skip, RLENGTH - skip));
        }
    }
    '
}

# Extract description title author and keywords
description=`docattr COMMENTS`
title=`docattr TITLE`
author=`docattr AUTHOR`
keywords=`docattr KEYWORDS`

#echo description: [$description];echo title: [$title];
#echo author: [$author];echo keywords: [$keywords]
|
||||
|
||||
# Header, with the values extracted above
cat <<EOF
<html><head>
<title>$title</title>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
<meta name="author" content="$author">
<meta name="description" content="$description">
<meta name="keywords" content="$keywords">
</head>
<body><p>
EOF

# Body: join the lines broken inside attribute values, then print the
# value of the CH attribute of each ITEXT tag, and the end of the document
# after the last one.
#
# NOTE(review): in the copy of this script that was reviewed, both
# patterns of the last sed command were empty, which sed cannot run.
# They are assumed to have been the character references &#x5; and
# &#x1c;, lost when the page was rendered - confirm against the upstream
# rclscribus before relying on this.
sed -e ':a' -e '/[^>] *$/N; s/\n/ /; ta' < "$infile" | \
awk '
/<ITEXT / {
    if (match($0, " CH=\"[^\"]+")) {
        s=substr($0, RSTART+5, RLENGTH-5)
        printf("%s", s);
        # Note: there is no way to know if this ends a frame, so no "<br>"
    }
}
END {
    print "</p></body></html>"
}
' | \
sed -e 's/&#x5;/<br>/g' -e 's/&#x1c;/<br>/g'
|
||||
156
website/filters/rclsoff
Executable file
156
website/filters/rclsoff
Executable file
@ -0,0 +1,156 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsoff,v 1.6.6.1 2007/01/21 16:41:49 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# rclsoff
|
||||
# Extract text from an openoffice/soffice file
|
||||
#
|
||||
#================================================================
|
||||
|
||||
|
||||
# Run every helper in the C locale and record our name for messages
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rclsoff"

# Exactly one argument is expected; anything else, or --help, prints the
# usage text and exits with status 1.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    printf 'Convert an openoffice file to unformatted HTML text.\n'
    printf 'Usage: %s [infile]\n' "$progname"
    exit 1
fi

infile="$1"
|
||||
|
||||
# Tell whether a command can be run.
# $1: a command name, or a path when it contains a slash.
# Returns 0 when an executable is found, 1 otherwise. A name without a slash
# is looked up in each directory of $PATH.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split PATH on ':' only. IFS is restored right after the split and
        # "$@" keeps directories containing blanks in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
# Make sure every helper command named in the arguments is available.
# $@: command names, each checked with iscmd.
# On the first missing command a message is written to stderr and the script
# exits with status 1.
checkcmds()
{
    for cmd in "$@"; do
        if iscmd "$cmd"; then
            continue
        fi
        printf '%s not found\n' "$cmd" 1>&2
        exit 1
    done
}
|
||||
checkcmds awk iconv unzip

# check the input file existence
if test ! -f "$infile"
then
    printf '%s: %s: no such file\n' "$progname" "$infile"
    exit 1
fi

# We need a temporary directory: $RECOLL_TMPDIR when set, else $TMPDIR,
# else /tmp.
ttdir=${RECOLL_TMPDIR:-${TMPDIR:-/tmp}}
tmpdir=$ttdir/rclsoff_tmp$$

# Keep the unzipped document private to the current user
umask 077

# mkdir fails when the name already exists, so nothing is ever reused
mkdir "$tmpdir" || exit 1
mkdir "$tmpdir/rclsofftmp" || exit 1
|
||||
|
||||
# Remove the temporary directory tree created for the unzipped document.
# Reads $tmpdir; changes the working directory to / as a side effect.
cleanup()
{
    # Leave the tree first so that we never remove our own working directory
    cd /
    # Note that we're using a constant part (rclsofftmp), which hopefully
    # guarantees that we can't make big mistakes here.
    rm -rf "$tmpdir/rclsofftmp"
    rmdir "$tmpdir"
}
|
||||
|
||||
# Remove the temporary directory when the script ends. Signals are turned
# into a plain exit so that processing stops at once instead of carrying on
# after the handler returns; the EXIT trap then does the cleanup.
trap cleanup EXIT
trap 'exit 1' HUP QUIT INT TERM

# Unzip the input file and change to the unzipped directory
unzip -q -d "$tmpdir/rclsofftmp" "$infile"
cd "$tmpdir/rclsofftmp" || exit 1

# Note: there can be newlines inside the description field, we don't want
# them...
descsedprog='/<dc:description>/,/<\/dc:description>/{
s!.*<dc:description>!!
s!</dc:description>.*!!
p
}
'
description=`sed -n -e "$descsedprog" < meta.xml | tr '\n' ' '`

# The values below end up inside double-quoted HTML attributes, so every
# double quote is replaced by a single one before the element is extracted.
subject=`sed -e "s/\"/'/g" -e 's/.*<dc:subject>\([^<]*\).*/\1/p;d' < meta.xml`

title=`sed -e "s/\"/'/g" -e 's/.*<dc:title>\([^<]*\).*/\1/p;d' < meta.xml`

keywords=`sed -e "s/\"/'/g" -e 's/.*<meta:keyword>\([^<]*\).*/\1/p;d' \
    < meta.xml`

# Note: next expr inserts a newline at each end of paragraph (for preview)
content="`sed -e 's!</text:p>!\\
!g' -e 's/<[^>]*>/ /g' < content.xml`"
|
||||
|
||||
#echo description "$description"
|
||||
#echo subject "$subject"
|
||||
#echo title "$title"
|
||||
#echo keywords "$keywords"
|
||||
#echo content "$content"
|
||||
|
||||
# output the result
echo '<html><head>'
echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '<meta name="description" content="' "$description $subject" '">'
echo '<meta name="keywords" content="' "$keywords" '">'
echo '</head><body><p>'

# Decode the quote entities left in the text, rejoin words hyphenated at the
# end of a line, and mark line ends with <br>.
# NOTE(review): the two patterns are taken to be &apos; and &quot;; as plain
# quote characters the substitutions would do nothing - confirm.
echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
awk '
BEGIN {
    cont = ""
}
{
    # Prepend the word fragment carried over from the previous line
    $0 = cont $0
    cont = ""

    # NOTE(review): the comment kept below mentions the soft hyphen (0xad);
    # the class presumably also held that byte - confirm.
    # The split is only done when there is a blank to break at, otherwise
    # the line is printed unchanged.
    if ($0 ~ /[-]$/ && match($0, "[ \t][^ \t]+$")) {
        # Note : soft-hyphen is iso8859 0xad
        # Break at last whitespace
        line = substr($0, 1, RSTART - 1)
        # Carry the last word over, without its trailing hyphen
        cont = substr($0, RSTART, RLENGTH-1)
        $0 = line
    }

    if($0 == "\f") {
        print "</p>\n<hr>\n<p>"
        next
    }

    print $0 "<br>"
}
END {
    # A fragment still pending has no continuation: print it as it was
    if (cont != "") {
        print cont "-<br>"
    }
    printf("</p></body></html>\n");
}' | iconv -f UTF-8 -t UTF-8 -c -s

cd /
# exit normally
exit 0
|
||||
143
website/filters/rclsvg
Executable file
143
website/filters/rclsvg
Executable file
@ -0,0 +1,143 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsvg,v 1.2 2008/02/03 16:05:57 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from a Scalable Vector Graphics file
|
||||
#================================================================
|
||||
|
||||
# Run every helper in the C locale; name and file type are used in messages
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rclsvg"
filetype=svg
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then exit.
# $*: error tag followed by details (e.g. INPUTNOSUCHFILE and a file name).
# Writes "RECFILTERROR <args>" to stdout and ":2:<progname>::: <args>" to
# stderr, then exits with status 1. The arguments are quoted so that file
# names are reported verbatim (no glob expansion, blanks preserved).
senderror()
{
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a command can be run.
# $1: a command name, or a path when it contains a slash.
# Returns 0 when an executable is found, 1 otherwise. A name without a slash
# is looked up in each directory of $PATH.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split PATH on ':' only. IFS is restored right after the split and
        # "$@" keeps directories containing blanks in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Make sure every helper command named in the arguments is available.
# $@: command names, each checked with iscmd.
# The first missing command is reported through senderror HELPERNOTFOUND,
# which exits the script.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# Exactly one argument is expected; anything else, or --help, prints the
# usage text and exits with status 1.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# The input must be an existing regular file, or '-' (standard input)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# Files and directories we create are accessible to their owner only
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds iconv sed

# The sed commands below read "$infile" directly, so standard input ('-')
# cannot be processed here: insist on a regular file, and report the failure
# in the form our caller understands.
if test ! -f "$infile"
then
    senderror INPUTNOSUCHFILE "$infile"
fi

# Use the encoding announced by the XML declaration, UTF-8 when none is found
encoding=`sed -ne '/<?xml/s/.*encoding="\([^"]*\).*/\1/p' < "$infile"`

if test "X$encoding" = X ; then encoding=UTF-8;fi

# We use several sed instances to make our life easier. Not good for
# performance, and a sed guru might be able to do better.
#
# The first sed makes sure each tag starts on a new line.
# The second one selects the tags we're interested in.
# The last strips the tags, leaving only text.
#
# The whole thing completely ignores issues like '<' inside quoted strings.
#
# We could/should add code to explicitly separate title and other
# metadata elements.

# Insert new line before each tag
sptagonline='s/</\
</g'

# Select tags
spselecttags='/<title/,/<\/title>/p
/<desc/,/<\/desc>/p
/<metadata/,/<\/metadata>/p
/<text/,/<\/text>/p'

# Strip tags
spstriptags='#n
/</{
:c
/>/!{
N
b c
}
/>/s/<.*>//g
}
/^[ ]*$/!p'

content=`sed -e "$sptagonline" < "$infile" | sed -ne "$spselecttags" | \
    sed -ne "$spstriptags"`

# Wrap the text in a minimal HTML document and convert it to UTF-8
(echo '<html><head>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body><pre>'
echo "$content"
echo '</pre></body></html>') \
    | iconv -f "$encoding" -t UTF-8 -c -s

# exit normally
exit 0
|
||||
106
website/filters/rcltex
Executable file
106
website/filters/rcltex
Executable file
@ -0,0 +1,106 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rcltex,v 1.2 2007/11/09 15:56:14 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
#================================================================
|
||||
# rcltex
|
||||
# Translate TeX files for recoll. Uses either untex or detex to translate to html
|
||||
#================================================================
|
||||
# Run every helper in the C locale; name and file type are used in messages
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rcltex"
filetype=TeX
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then exit.
# $*: error tag followed by details (e.g. INPUTNOSUCHFILE and a file name).
# Writes "RECFILTERROR <args>" to stdout and ":2:<progname>::: <args>" to
# stderr, then exits with status 1. The arguments are quoted so that file
# names are reported verbatim (no glob expansion, blanks preserved).
senderror()
{
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a command can be run.
# $1: a command name, or a path when it contains a slash.
# Returns 0 when an executable is found, 1 otherwise. A name without a slash
# is looked up in each directory of $PATH.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split PATH on ':' only. IFS is restored right after the split and
        # "$@" keeps directories containing blanks in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Make sure every helper command named in the arguments is available.
# $@: command names, each checked with iscmd.
# The first missing command is reported through senderror HELPERNOTFOUND,
# which exits the script.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# Exactly one argument is expected; anything else, or --help, prints the
# usage text and exits with status 1.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# The input must be an existing regular file, or '-' (standard input)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# Files and directories we create are accessible to their owner only
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
# Pick the converter: detex when it is available, untex otherwise
if iscmd detex ; then
    checkcmds iconv
    CMD="detex -n -e ''"
else
    checkcmds untex iconv
    CMD="untex -giso -a"
fi

# output the result
echo '<html><head>'
#echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>'
echo '<pre>'

#untex -giso -a "$infile" | \

# Convert to UTF-8, then escape the characters which are special in HTML.
# '&' must be handled first so that the entities produced for '<' are not
# escaped again, and it is written \& in the replacement because a bare &
# stands for the matched text in sed.
$CMD "$infile" | \
    iconv -c -f iso-8859-1 -t utf-8 | \
    sed \
    -e 's/&/\&amp;/g' -e 's/</\&lt;/g'

echo '</pre>'
echo '</body></html>'

# exit normally
exit 0
|
||||
87
website/filters/rclwpd
Executable file
87
website/filters/rclwpd
Executable file
@ -0,0 +1,87 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclwpd,v 1.1 2007/08/26 13:34:59 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Some inspiration from estraier
|
||||
#================================================================
|
||||
# rclwpd
|
||||
# convert wordperfect documents to html, by executing the wpd2html program:
|
||||
# http://libwpd.sourceforge.net/download.html
|
||||
#================================================================
|
||||
|
||||
# Run every helper in the C locale; name and file type are used in messages
LANG=C
LC_ALL=C
export LANG LC_ALL
progname="rclwpd"
filetype=wpd
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe an error in a way that can be interpreted by our caller, then exit.
# $*: error tag followed by details (e.g. INPUTNOSUCHFILE and a file name).
# Writes "RECFILTERROR <args>" to stdout and ":2:<progname>::: <args>" to
# stderr, then exits with status 1. The arguments are quoted so that file
# names are reported verbatim (no glob expansion, blanks preserved).
senderror()
{
    printf '%s\n' "RECFILTERROR $*"
    # Also alert on stderr just in case
    printf '%s\n' ":2:$progname::: $*" 1>&2
    exit 1
}
|
||||
|
||||
# Tell whether a command can be run.
# $1: a command name, or a path when it contains a slash.
# Returns 0 when an executable is found, 1 otherwise. A name without a slash
# is looked up in each directory of $PATH.
iscmd()
{
    cmd=$1
    case $cmd in
    */*)
        if test -x "$cmd" ; then return 0; else return 1; fi ;;
    *)
        # Split PATH on ':' only. IFS is restored right after the split and
        # "$@" keeps directories containing blanks in one piece.
        oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
        for d in "$@"; do
            # An empty PATH element designates the current directory
            test -x "${d:-.}/$cmd" && return 0
        done
        return 1 ;;
    esac
}
|
||||
|
||||
# Make sure every helper command named in the arguments is available.
# $@: command names, each checked with iscmd.
# The first missing command is reported through senderror HELPERNOTFOUND,
# which exits the script.
checkcmds()
{
    for cmd in "$@"; do
        iscmd "$cmd" || senderror HELPERNOTFOUND "$cmd"
    done
}
|
||||
|
||||
# Exactly one argument is expected; anything else, or --help, prints the
# usage text and exits with status 1.
if [ $# -ne 1 ] || [ "$1" = "--help" ]; then
    echo "Convert a $filetype file to HTML text for Recoll indexing."
    echo "Usage: $progname [infile]"
    exit 1
fi

infile="$1"

# The input must be an existing regular file, or '-' (standard input)
if [ "X$infile" != X- ] && [ ! -f "$infile" ]; then
    senderror INPUTNOSUCHFILE "$infile"
fi

# Files and directories we create are accessible to their owner only
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
# The conversion is done entirely by the wpd2html helper
checkcmds wpd2html

# Its HTML goes to our standard output as is (no adjustment seems to be
# needed); its diagnostics are discarded.
wpd2html "$infile" 2>/dev/null
|
||||
Loading…
x
Reference in New Issue
Block a user