improved rclid3 and rclogg

This commit is contained in:
dockes 2008-10-08 08:27:34 +00:00
parent 954de37067
commit 49388c8748
12 changed files with 79 additions and 101 deletions

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclabw,v 1.2 2007-06-15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclabw,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -83,13 +83,6 @@ umask 77
checkcmds iconv sed
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
encoding=`sed -e '/<?xml version=/s/"?>$//' \
-e '/^<?xml version=/s/.*encoding="//p;D;q' \
-e D \

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rcldjvu,v 1.5 2007-06-08 13:51:08 dockes Exp $ (C) 2005 J.F.Dockes
# @(#$Id: rcldjvu,v 1.6 2008-10-08 08:27:34 dockes Exp $ (C) 2005 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -103,13 +103,6 @@ umask 77
checkcmds djvutxt djvused awk
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
# Title: we try to extract it from the annotations. djvused outputs string
# in C/awk \-escaped notation. Awk can only process this in string
# constants, so we have a first awk pass to create an awk program to parse

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclid3,v 1.2 2007-11-09 11:54:59 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclid3,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
#================================================================
# rclid3
# Handle audio files for recoll. This uses id3info to extract tags
@ -83,25 +83,52 @@ checkcmds id3info
echo '<html><head>'
#echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>'
echo '<pre>'
nocaptionexp='s/===.*://'
kpcaptionexp='s/===[^(]*//'
if test X$RECOLL_FILTER_FORPREVIEW = Xyes ; then
captionexp=$kpcaptionexp
else
captionexp=$nocaptionexp
fi
echo '</head><body><pre>'
# Preview: show the id3info listing as escaped text.
# id3_preview_filter reads id3info output on stdin and writes the cleaned
# lines to stdout:
#  - the "Tag information for" banner line is dropped,
#  - the caption expression in $captionexp is applied when it is set,
#  - the text from "===" up to the first "(" is removed,
#  - &, <, > and " are replaced by their HTML entities.
# In a sed replacement a bare '&' stands for the matched text, so the
# ampersand of each entity has to be written '\&'. '&' is escaped first so
# that the ampersands of the generated entities are not escaped again.
id3_preview_filter() {
    sed -e '/Tag information for/d' \
        ${captionexp:+-e "$captionexp"} \
        -e 's/===[^(]*//' \
        -e 's/&/\&amp;/g' -e 's/</\&lt;/g' \
        -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}
id3info "$infile" | id3_preview_filter
echo '</pre></body></html>'
echo '</pre>'
echo '</body></html>'
else
# Indexing. The title (TIT2), artist (TPE1) and album (TALB) frames are
# emitted as <meta> fields, and every value also goes into the body text.
# Many other frames could be processed the same way, but for that to be
# really useful the corresponding fields would also have to be set up in
# the "fields" configuration file.
#
# id3_index_filter reads id3info output on stdin and writes the <meta>
# lines followed by the end of the HTML document to stdout.
id3_index_filter() {
    awk -F: '
    # Every line: the value is what follows the first ":", with the
    # remaining fields joined by spaces, then HTML-escaped. In an awk
    # replacement string a bare "&" stands for the matched text, so the
    # entities are written with "\\&". gsub() is used so that every
    # occurrence on the line is escaped, not only the first one.
    {
        value = ""
        for (i = 2; i <= NF; i++) {
            value = value " " $i
        }
        gsub(/&/, "\\&amp;", value)
        gsub(/"/, "\\&quot;", value)
        gsub(/</, "\\&lt;", value)
        gsub(/>/, "\\&gt;", value)
    }
    /TIT2/ {
        printf "<meta name=\"title\" content=\"%s\">\n", value
        body = body value "\n"
        next
    }
    /TPE1/ {
        printf "<meta name=\"artist\" content=\"%s\">\n", value
        body = body value "\n"
        next
    }
    /TALB/ {
        printf "<meta name=\"album\" content=\"%s\">\n", value
        body = body value "\n"
        next
    }
    # Any other line only contributes to the body text.
    {
        body = body value "\n"
    }
    # The <meta> lines were printed while reading: close the header and
    # output the accumulated body.
    END {
        print "</head><body><pre>"
        print body
        print "</pre></body></html>"
    }'
}
id3info "$infile" | id3_index_filter
fi
# exit normally
exit 0

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclkwd,v 1.1 2007-06-08 14:01:30 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclkwd,v 1.2 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -87,13 +87,6 @@ umask 77
checkcmds awk unzip gunzip tar
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
# We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then
ttdir=$RECOLL_TMPDIR

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclogg,v 1.1 2007-10-02 14:00:47 dockes Exp $ (C) 2007 J.F.Dockes
# @(#$Id: rclogg,v 1.2 2008-10-08 08:27:34 dockes Exp $ (C) 2007 J.F.Dockes
#================================================================
# rclogg
# Handle ogg audio files for recoll.
@ -74,33 +74,37 @@ fi
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
checkcmds ogginfo
# Helper commands used below. grep is listed as well because the ogginfo
# output is piped through it. (checkcmds is defined outside this excerpt;
# presumably it verifies that each listed command is available - TODO confirm.)
checkcmds ogginfo grep sed awk
# output the result
echo '<html><head>'
#echo '<title>' "$title" '</title>'
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>'
echo '<pre>'
nocaptionexp='/=/s/.*=//p'
kpcaptionexp='/=/p'
if test X$RECOLL_FILTER_FORPREVIEW = Xyes
then
echo '</head><body><pre>'
# Preview: output the ogginfo lines containing '=' as escaped text.
# ogg_preview_escape reads text on stdin and writes it to stdout with
# &, ", < and > replaced by their HTML entities. In a sed replacement a
# bare '&' stands for the matched text, so the ampersand of each entity is
# written '\&'. '&' is handled first so that the generated entities are not
# escaped again.
ogg_preview_escape() {
    sed -e 's/&/\&amp;/g' -e 's/"/\&quot;/g' \
        -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}
ogginfo "$infile" | grep '=' | ogg_preview_escape
echo '</pre></body></html>'
if test X$RECOLL_FILTER_FORPREVIEW = Xyes ; then
captionexp=$kpcaptionexp
else
captionexp=$nocaptionexp
# Indexing: every "name=value" line of the ogginfo output becomes a <meta>
# field, and the value is also added to the body text.
# ogg_index_filter reads such lines on stdin and writes the <meta> lines
# followed by the end of the HTML document to stdout.
ogg_index_filter() {
    awk '
    # Replace the HTML special characters of text by entities. In an awk
    # replacement string a bare "&" stands for the matched text, so the
    # entities are written with "\\&". gsub() escapes every occurrence.
    function html_escape(text) {
        gsub(/&/, "\\&amp;", text)
        gsub(/"/, "\\&quot;", text)
        gsub(/</, "\\&lt;", text)
        gsub(/>/, "\\&gt;", text)
        return text
    }
    {
        # Split at the first "=" only, so that a value which itself
        # contains "=" is kept whole.
        eq = index($0, "=")
        if (eq == 0) {
            next
        }
        name = substr($0, 1, eq - 1)
        # Drop the indentation in front of the name (blanks or tabs).
        sub(/^[ \t]+/, "", name)
        value = html_escape(substr($0, eq + 1))
        printf "<meta name=\"%s\" content=\"%s\">\n", html_escape(name), value
        body = body value "\n"
    }
    # The <meta> lines were printed while reading: close the header and
    # output the accumulated body.
    END {
        print "</head><body><pre>"
        print body
        print "</pre></body></html>"
    }'
}
ogginfo "$infile" | grep '=' | ogg_index_filter
fi
# Note: ogginfo output is unparseable: the COMMENT field can be
# multiline, but there is no way to detect the end of the 'User
# comments' area
ogginfo "$infile" | \
sed -n \
-e "$captionexp"
echo '</pre>'
echo '</body></html>'
# exit normally
exit 0

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclopxml,v 1.2 2008-09-01 17:31:47 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
#================================================================
# rcldocx
# Extract text from an openxml msword file (will be extended for spreadsheets)
@ -80,13 +80,6 @@ umask 77
checkcmds xsltproc unzip
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
# We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then
ttdir=$RECOLL_TMPDIR

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclppt,v 1.3 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclppt,v 1.4 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@ -100,10 +100,6 @@ umask 77
#ENDRECFILTCOMMONCODE
checkcmds catppt
if test X$cmdsok = X0 ; then
printf "Catppt not found"
exit 1
fi
# output the result
echo '<html><head>'

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclsoff,v 1.11 2007-06-15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -89,13 +89,6 @@ umask 77
checkcmds awk iconv unzip
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
# We need a temporary directory
if test z"$RECOLL_TMPDIR" != z; then
ttdir=$RECOLL_TMPDIR

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclsvg,v 1.2 2008-02-03 16:05:57 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclsvg,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -83,13 +83,6 @@ umask 77
checkcmds iconv sed
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
# Print the encoding named on the "<?xml" line(s) of the file given as $1.
# Nothing is printed when no such line carries an encoding="..." attribute.
xml_declared_encoding() {
    sed -ne '/<?xml/s/.*encoding="\([^"]*\).*/\1/p' < "$1"
}
# "$infile" and "$encoding" are quoted so that file names or values
# containing blanks or glob characters do not break the redirection or the
# test. UTF-8 is used when the document does not declare an encoding.
encoding=$(xml_declared_encoding "$infile")
if test "X$encoding" = X ; then encoding=UTF-8;fi

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclxls,v 1.4 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclxls,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@ -100,17 +100,7 @@ umask 77
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# check the input file existence
if test ! -f "$infile"
then
printf '%s: %s: no such file\n' "$progname" "$infile"
exit 1
fi
checkcmds xls2csv
if test X$cmdsok = X0 ; then
printf "xls2csv not found"
exit 1
fi
# output the result
echo '<html><head>'

View File

@ -1,4 +1,4 @@
# @(#$Id: fields,v 1.4 2008-10-07 06:44:23 dockes Exp $ (C) 2007 J.F.Dockes
# @(#$Id: fields,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2007 J.F.Dockes
# Field names configuration. This defines how one may search, e.g. for
# author:Hemingway
# Important:
@ -53,7 +53,7 @@ stored = author
[aliases]
abstract = summary dc:summary description xesam:description
author = creator dc:creator xesam:author xesam:creator
author = creator dc:creator xesam:author xesam:creator from
caption = title title dc:title subject
# catg = dc:type contentCategory
dbytes = size xesam:size

View File

@ -1,4 +1,4 @@
# @(#$Id: mimeconf,v 1.44 2008-09-28 14:20:50 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: mimeconf,v 1.45 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll : associations of mime types to processing filters.
# There are different sections for decompression, 'interning' for indexing
@ -26,12 +26,15 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
# The external "exec" filters are typically scripts. They output the
# document in simple html format, have a look at the scripts.
[index]
application/msword = exec rcldoc
# Note: rcldoc did some work to splice words hyphenated at end of line. This
# seems not to be needed, because antiword apparently does it too.
# application/msword = exec rcldoc
application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
application/ogg = exec rclogg
application/pdf = exec rclpdf
application/postscript = exec rclps
application/vnd.ms-excel = exec rclxls
application/vnd.ms-powerpoint = exec rclppt
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain
application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
exec rclopxml
application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
@ -54,7 +57,7 @@ application/vnd.sun.xml.math = exec rclsoff
application/vnd.sun.xml.writer = exec rclsoff
application/vnd.sun.xml.writer.global = exec rclsoff
application/vnd.sun.xml.writer.template = exec rclsoff
application/vnd.wordperfect = exec rclwpd
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = exec rclabw
application/x-dvi = exec rcldvi
application/x-flac = exec rclflac
@ -72,7 +75,7 @@ image/svg+xml = exec rclsvg
message/rfc822 = internal
text/html = internal
text/plain = internal
text/rtf = exec rclrtf
text/rtf = exec unrtf --nopict --html;charset=iso-8859-1;mimetype=text/html
text/x-c = exec rcltext
text/x-gaim-log = exec rclgaim
text/x-html-sidux-man = exec rclsiduxman