fix to output <br> when needed + other misc pbs

This commit is contained in:
dockes 2006-01-27 13:37:31 +00:00
parent a2db1d5386
commit b46f99c955
4 changed files with 138 additions and 72 deletions

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -20,7 +20,7 @@
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rcldoc"
decoder="antiword -i -1 -m UTF-8"
decoder="antiword -t -i 1 -m UTF-8"
# Not ready to use this for now (it outputs html, so the code below has to
# be simplified.)
#decoder="wvWare -1 -c UTF-8"
@ -72,28 +72,36 @@ fi
$decoder "$infile" |
awk '
BEGIN {
printf("<html><head><title></title>\n")
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
printf("</head>\n<body><p>");
esc = 1
print "<html><head><title></title>"
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
print "</head>\n<body>\n<p>"
cont = ""
}
{
if ($0 ~ /-$/) {
sub(/-$/, "", $0)
printf("%s", $0);
} else if($0 == "\f") {
printf("</p>\n<hr>\n<p>")
} else {
if(esc > 0) {
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
}
print $0
$0 = cont $0
cont = ""
if ($0 ~ /[­-]$/) {
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH-1)
$0 = line
}
if($0 == "\f") {
print "</p><hr><p>"
next
}
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
print $0 "<br>"
}
END {
printf("</p></body></html>\n");
print "</p></body></html>"
}' | iconv -f UTF-8 -t UTF-8 -c -s
# exit normally

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclpdf,v 1.6 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# This is copied almost verbatim from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -7,10 +7,26 @@
#================================================================
#================================================================
# rclpdf
# Strip a file of PDF and extract its text as HTML.
#================================================================
# Convert a pdf file to HTML.
#
# We use pdftotext from the xpdf package. This does not give perfect results,
# as whitespace is sometimes either arbitrarily inserted or stripped from
# the text. This seems to depend on the usage of option -raw and,
# unfortunately, also on the document itself, so there does not seem to
# be a universally good solution.
#
# Also, the filter sometimes seems to output problematic UTF-8. I did not
# check whether it was actually incorrect or just misunderstood by qtextedit
# (to be done).
#
# In any case, for example, the code emitted for an fi ligature (correct or
# not, I did not check) should be replaced with f and i characters as this
# is what will get searched for.
# Comment out the following line if you get better results without -raw
optionraw=-raw
# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
@ -59,44 +75,70 @@ checkcmds()
}
checkcmds pdftotext iconv awk
# output the result
pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
# Run pdftotext and fix the result (add a charset tag and fix the html escaping)
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
iconv -f UTF-8 -t UTF-8 -c -s |
awk '
BEGIN {
esc = 0
doescape = 0
cont = ""
charsetmeta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
}
{
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
gsub(/<[^>]*>/, "", $0)
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
printf("<title>%s</title>\n", $0)
} else if($0 == "<pre>"){
$0 = cont $0
cont = ""
# Insert charset meta tag at end of header
if(doescape == 0 && $0 ~ /<\/head>/) {
match($0, /<\/head>/)
part1 = substr($0, 0, RSTART-1)
part2 = substr($0, RSTART, length($0))
$0 = part1 charsetmeta part2
}
if(doescape == 0 && $0 ~ /<title>.*<\/title>/){
match($0, /<title>.*<\/title>/)
part1 = substr($0, 0, RSTART-1)
mid = substr($0, RSTART, RLENGTH)
part2 = substr($0, RSTART + RLENGTH, length($0))
gsub(/<title>/, "", mid)
gsub(/<\/title>/, "", mid)
gsub(/&/, "\\&amp;", mid)
gsub(/</, "\\&lt;", mid)
gsub(/>/, "\\&gt;", mid)
mid = "<title>" mid "</title>"
$0 = part1 mid part2
}
if ($0 == "<pre>"){
# Beginning of body text. We need to escape some chars from now on, as
# pdftotext sometimes does not do it
esc++
printf("<p>")
doescape++
print $0
next
} else if ($0 ~ /<\/pre>/){
esc--
printf("</p>\n")
} else if($0 ~ /-$/){
sub(/-$/, "", $0)
printf("%s", $0);
doescape--
print $0
next
} else if ($0 ~ /[­-]$/) {
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH-1)
$0 = line
# print "LINE [" $0 "] CONT[" cont "]"
} else if($0 == "\f"){
printf("</p>\n<hr>\n<p>")
} else {
if(esc > 0){
$0 = "<hr>"
print
next
}
if(doescape > 0){
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
gsub(/^ */, "", $0)
gsub(/ *$/, "", $0)
}
print $0
}
print $0
}
'

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -9,7 +9,8 @@
# rclps
# Extract text from a postscript file by executing pstotext or ps2ascii.
#
# The default is to use pstotext which can deal with accents.
# The default is to use pstotext which can deal with accents, but in a
# partially broken way (it always outputs iso8859-1, when it should use UTF-8).
#
# OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes works
# better (e.g. on some openoffice output files).
@ -74,24 +75,34 @@ BEGIN {
printf("<html><head><title></title>\n")
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
printf("</head>\n<body><p>");
esc = 1
doescape = 1
cont = ""
}
{
if ($0 ~ /-$/) {
sub(/-$/, "", $0)
printf("%s", $0);
} else if($0 == "\f") {
printf("</p>\n<hr>\n<p>")
} else {
if(esc > 0) {
$0 = cont $0
cont = ""
if ($0 == "\f") {
print "</p>\n<hr>\n<p>"
next
} else if ($0 ~ /­$/) {
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH)
$0 = line
gsub("­", "", cont)
}
if(doescape > 0) {
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
}
print $0
}
print $0 "<br>"
}
END {
printf("</p></body></html>\n");
print "</p></body></html>"
}' | iconv -f iso-8859-1 -t UTF-8 -c -s

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclsoff,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclsoff,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -125,22 +125,27 @@ echo '</head><body><p>'
echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
awk '
BEGIN {
esc = 1
cont = ""
}
{
if ($0 ~ /-$/) {
sub(/-$/, "", $0)
printf("%s", $0);
} else if($0 == "\f") {
printf("</p>\n<hr>\n<p>")
} else {
if(esc > 0) {
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
$0 = cont $0
cont = ""
if ($0 ~ /[­-]$/) {
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH-1)
$0 = line
}
printf("%s<br>", $0)
}
if($0 == "\f") {
print "</p>\n<hr>\n<p>"
next
}
print $0 "<br>"
}
END {
printf("</p></body></html>\n");