From b46f99c9554be2e1b547fa0de4d3b999926a4783 Mon Sep 17 00:00:00 2001
From: dockes ");
- esc = 1
+ print " "
+ cont = ""
}
{
- if ($0 ~ /-$/) {
- sub(/-$/, "", $0)
- printf("%s", $0);
- } else if($0 == "\f") {
- printf(" ")
- } else {
- if(esc > 0) {
- gsub(/&/, "\\&", $0)
- gsub(/, "\\<", $0)
- gsub(/>/, "\\>", $0)
- }
- print $0
+ $0 = cont $0
+ cont = ""
+
+ if ($0 ~ /[-]$/) {
+ # Note : soft-hyphen is iso8859 0xad
+ # Break at last whitespace
+ match($0, "[ \t][^ \t]+$")
+ line = substr($0, 0, RSTART)
+ cont = substr($0, RSTART, RLENGTH-1)
+ $0 = line
}
+
+ if($0 == "\f") {
+ print " "
+ next
+ }
+ gsub(/&/, "\\&", $0)
+ gsub(/, "\\<", $0)
+ gsub(/>/, "\\>", $0)
+
+ print $0 "
when needed + other misc pbs
---
src/filters/rcldoc | 46 +++++++++++++---------
src/filters/rclpdf | 94 ++++++++++++++++++++++++++++++++-------------
src/filters/rclps | 37 +++++++++++-------
src/filters/rclsoff | 33 +++++++++-------
4 files changed, 138 insertions(+), 72 deletions(-)
diff --git a/src/filters/rcldoc b/src/filters/rcldoc
index 251ff539..08e0b440 100755
--- a/src/filters/rcldoc
+++ b/src/filters/rcldoc
@@ -1,5 +1,5 @@
#!/bin/sh
-# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
+# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@@ -20,7 +20,7 @@
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rcldoc"
-decoder="antiword -i -1 -m UTF-8"
+decoder="antiword -t -i 1 -m UTF-8"
# Not ready to use this for now (it outputs html, so the code below has to
# be simplified.)
#decoder="wvWare -1 -c UTF-8"
@@ -72,28 +72,36 @@ fi
$decoder "$infile" |
awk '
BEGIN {
- printf("
\n
"
}
END {
- printf("
"){
+ $0 = cont $0
+ cont = ""
+ # Insert charset meta tag at end of header
+ if(doescape == 0 && $0 ~ /<\/head>/) {
+ match($0, /<\/head>/)
+ part1 = substr($0, 0, RSTART-1)
+ part2 = substr($0, RSTART, length($0))
+ $0 = part1 charsetmeta part2
+ }
+ if(doescape == 0 && $0 ~ /.*<\/title>/){
+ match($0, /.*<\/title>/)
+ part1 = substr($0, 0, RSTART-1)
+ mid = substr($0, RSTART, RLENGTH)
+ part2 = substr($0, RSTART + RLENGTH, length($0))
+ gsub(//, "", mid)
+ gsub(/<\/title>/, "", mid)
+ gsub(/&/, "\\&", mid)
+ gsub(/, "\\<", mid)
+ gsub(/>/, "\\>", mid)
+ mid = "" mid " "
+ $0 = part1 mid part2
+ }
+
+ if ($0 == ""){
# Begin of body text. need to escape some chars from now on as
# pdftotext sometimes doesnt do it
- esc++
- printf("")
+ doescape++
+ print $0
+ next
} else if ($0 ~ /<\/pre>/){
- esc--
- printf("
\n")
- } else if($0 ~ /-$/){
- sub(/-$/, "", $0)
- printf("%s", $0);
+ doescape--
+ print $0
+ next
+ } else if ($0 ~ /[-]$/) {
+ # Note : soft-hyphen is iso8859 0xad
+ # Break at last whitespace
+ match($0, "[ \t][^ \t]+$")
+ line = substr($0, 0, RSTART)
+ cont = substr($0, RSTART, RLENGTH-1)
+ $0 = line
+ # print "LINE [" $0 "] CONT[" cont "]"
} else if($0 == "\f"){
- printf("\n
\n")
- } else {
- if(esc > 0){
+ $0 = "
"
+ print
+ next
+ }
+ if(doescape > 0){
gsub(/&/, "\\&", $0)
gsub(/, "\\<", $0)
gsub(/>/, "\\>", $0)
gsub(/^ */, "", $0)
gsub(/ *$/, "", $0)
- }
- print $0
}
+ print $0
}
'
diff --git a/src/filters/rclps b/src/filters/rclps
index 8d70dc94..ffde92d7 100755
--- a/src/filters/rclps
+++ b/src/filters/rclps
@@ -1,5 +1,5 @@
#!/bin/sh
-# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
+# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@@ -9,7 +9,8 @@
# rclps
# Extract text from a postscript file by executing pstotext or ps2ascii.
#
-# The default is to use pstotext which can deal with accents.
+# The default is to use pstotext which can deal with accents, but in a
+# partially broken way (it always outputs iso8859-1, when it should use utf).
#
# OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes work
# better (ie: on some openoffice output files).
@@ -74,24 +75,34 @@ BEGIN {
printf(" \n")
printf("\n")
printf("\n");
- esc = 1
+ doescape = 1
+ cont = ""
}
{
- if ($0 ~ /-$/) {
- sub(/-$/, "", $0)
- printf("%s", $0);
- } else if($0 == "\f") {
- printf("
\n
\n")
- } else {
- if(esc > 0) {
+ $0 = cont $0
+ cont = ""
+
+ if ($0 == "\f") {
+ print "
\n
\n"
+ next
+ } else if ($0 ~ /$/) {
+ # Note : soft-hyphen is iso8859 0xad
+ # Break at last whitespace
+ match($0, "[ \t][^ \t]+$")
+ line = substr($0, 0, RSTART)
+ cont = substr($0, RSTART, RLENGTH)
+ $0 = line
+ gsub("", "", cont)
+ }
+
+ if(doescape > 0) {
gsub(/&/, "\\&", $0)
gsub(/, "\\<", $0)
gsub(/>/, "\\>", $0)
}
- print $0
- }
+ print $0 "
"
}
END {
- printf("
\n");
+ print "