fix to output <br> when needed + other misc pbs
This commit is contained in:
parent
a2db1d5386
commit
b46f99c955
@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
@ -20,7 +20,7 @@
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rcldoc"
|
||||
decoder="antiword -i -1 -m UTF-8"
|
||||
decoder="antiword -t -i 1 -m UTF-8"
|
||||
# Not ready to use this for now (it outputs html, so the code below has to
|
||||
# be simplified.)
|
||||
#decoder="wvWare -1 -c UTF-8"
|
||||
@ -72,28 +72,36 @@ fi
|
||||
$decoder "$infile" |
|
||||
awk '
|
||||
BEGIN {
|
||||
printf("<html><head><title></title>\n")
|
||||
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
|
||||
printf("</head>\n<body><p>");
|
||||
esc = 1
|
||||
print "<html><head><title></title>"
|
||||
print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
|
||||
print "</head>\n<body>\n<p>"
|
||||
cont = ""
|
||||
}
|
||||
{
|
||||
if ($0 ~ /-$/) {
|
||||
sub(/-$/, "", $0)
|
||||
printf("%s", $0);
|
||||
} else if($0 == "\f") {
|
||||
printf("</p>\n<hr>\n<p>")
|
||||
} else {
|
||||
if(esc > 0) {
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
}
|
||||
print $0
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
|
||||
if ($0 ~ /[-]$/) {
|
||||
# Note : soft-hyphen is iso8859 0xad
|
||||
# Break at last whitespace
|
||||
match($0, "[ \t][^ \t]+$")
|
||||
line = substr($0, 0, RSTART)
|
||||
cont = substr($0, RSTART, RLENGTH-1)
|
||||
$0 = line
|
||||
}
|
||||
|
||||
if($0 == "\f") {
|
||||
print "</p><hr><p>"
|
||||
next
|
||||
}
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
|
||||
print $0 "<br>"
|
||||
}
|
||||
END {
|
||||
printf("</p></body></html>\n");
|
||||
print "</p></body></html>"
|
||||
}' | iconv -f UTF-8 -t UTF-8 -c -s
|
||||
|
||||
# exit normally
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: rclpdf,v 1.6 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# This is copied almost verbatim from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
@ -7,10 +7,26 @@
|
||||
#================================================================
|
||||
#================================================================
|
||||
# rclpdf
|
||||
# Strip a file of PDF and extract its text as HTML.
|
||||
#================================================================
|
||||
# Convert a pdf file to HTML.
|
||||
#
|
||||
# We use pdftotext from the xpdf package. This does not give perfect results as
|
||||
# whitespace is sometimes either arbitrarily inserted or stripped from the
|
||||
# text. This seems to depend on the usage of option -raw, and,
|
||||
# unfortunately also of the document itself, so that there does not seem to
|
||||
# be a universally good solution.
|
||||
#
|
||||
# Also, the filter sometimes seems to output problematic utf-8. I did not
|
||||
# check if it was actually incorrect or just mis-understood by qtextedit
|
||||
# (tobedone)
|
||||
#
|
||||
# In any case, for example, the code emitted for an fi ligature (correct or
|
||||
# not, I did not check) should be replaced with f and i characters as this
|
||||
# is what will get searched for.
|
||||
|
||||
|
||||
# Comment the following if you get better results without
|
||||
optionraw=-raw
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
@ -59,44 +75,70 @@ checkcmds()
|
||||
}
|
||||
checkcmds pdftotext iconv awk
|
||||
|
||||
# output the result
|
||||
pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||
# Run pdftotext and fix the result (add a charset tag and fix the html escaping)
|
||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
||||
awk '
|
||||
BEGIN {
|
||||
esc = 0
|
||||
doescape = 0
|
||||
cont = ""
|
||||
charsetmeta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
|
||||
}
|
||||
{
|
||||
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
|
||||
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
|
||||
gsub(/<[^>]*>/, "", $0)
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
printf("<title>%s</title>\n", $0)
|
||||
} else if($0 == "<pre>"){
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
# Insert charset meta tag at end of header
|
||||
if(doescape == 0 && $0 ~ /<\/head>/) {
|
||||
match($0, /<\/head>/)
|
||||
part1 = substr($0, 0, RSTART-1)
|
||||
part2 = substr($0, RSTART, length($0))
|
||||
$0 = part1 charsetmeta part2
|
||||
}
|
||||
if(doescape == 0 && $0 ~ /<title>.*<\/title>/){
|
||||
match($0, /<title>.*<\/title>/)
|
||||
part1 = substr($0, 0, RSTART-1)
|
||||
mid = substr($0, RSTART, RLENGTH)
|
||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
||||
gsub(/<title>/, "", mid)
|
||||
gsub(/<\/title>/, "", mid)
|
||||
gsub(/&/, "\\&", mid)
|
||||
gsub(/</, "\\<", mid)
|
||||
gsub(/>/, "\\>", mid)
|
||||
mid = "<title>" mid "</title>"
|
||||
$0 = part1 mid part2
|
||||
}
|
||||
|
||||
if ($0 == "<pre>"){
|
||||
# Begin of body text. need to escape some chars from now on as
|
||||
# pdftotext sometimes doesnt do it
|
||||
esc++
|
||||
printf("<p>")
|
||||
doescape++
|
||||
print $0
|
||||
next
|
||||
} else if ($0 ~ /<\/pre>/){
|
||||
esc--
|
||||
printf("</p>\n")
|
||||
} else if($0 ~ /-$/){
|
||||
sub(/-$/, "", $0)
|
||||
printf("%s", $0);
|
||||
doescape--
|
||||
print $0
|
||||
next
|
||||
} else if ($0 ~ /[-]$/) {
|
||||
# Note : soft-hyphen is iso8859 0xad
|
||||
# Break at last whitespace
|
||||
match($0, "[ \t][^ \t]+$")
|
||||
line = substr($0, 0, RSTART)
|
||||
cont = substr($0, RSTART, RLENGTH-1)
|
||||
$0 = line
|
||||
# print "LINE [" $0 "] CONT[" cont "]"
|
||||
} else if($0 == "\f"){
|
||||
printf("</p>\n<hr>\n<p>")
|
||||
} else {
|
||||
if(esc > 0){
|
||||
$0 = "<hr>"
|
||||
print
|
||||
next
|
||||
}
|
||||
if(doescape > 0){
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
gsub(/^ */, "", $0)
|
||||
gsub(/ *$/, "", $0)
|
||||
}
|
||||
print $0
|
||||
}
|
||||
print $0
|
||||
}
|
||||
'
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
@ -9,7 +9,8 @@
|
||||
# rclps
|
||||
# Extract text from a postscript file by executing pstotext or ps2ascii.
|
||||
#
|
||||
# The default is to use pstotext which can deal with accents.
|
||||
# The default is to use pstotext which can deal with accents, but in a
|
||||
# partially broken way (it always outputs iso8859-1, when it should use UTF-8).
|
||||
#
|
||||
# OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes works
|
||||
# better (ie: on some openoffice output files).
|
||||
@ -74,24 +75,34 @@ BEGIN {
|
||||
printf("<html><head><title></title>\n")
|
||||
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
|
||||
printf("</head>\n<body><p>");
|
||||
esc = 1
|
||||
doescape = 1
|
||||
cont = ""
|
||||
}
|
||||
{
|
||||
if ($0 ~ /-$/) {
|
||||
sub(/-$/, "", $0)
|
||||
printf("%s", $0);
|
||||
} else if($0 == "\f") {
|
||||
printf("</p>\n<hr>\n<p>")
|
||||
} else {
|
||||
if(esc > 0) {
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
|
||||
if ($0 == "\f") {
|
||||
print "</p>\n<hr>\n<p>"
|
||||
next
|
||||
} else if ($0 ~ /$/) {
|
||||
# Note : soft-hyphen is iso8859 0xad
|
||||
# Break at last whitespace
|
||||
match($0, "[ \t][^ \t]+$")
|
||||
line = substr($0, 0, RSTART)
|
||||
cont = substr($0, RSTART, RLENGTH)
|
||||
$0 = line
|
||||
gsub("", "", cont)
|
||||
}
|
||||
|
||||
if(doescape > 0) {
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
}
|
||||
print $0
|
||||
}
|
||||
print $0 "<br>"
|
||||
}
|
||||
END {
|
||||
printf("</p></body></html>\n");
|
||||
print "</p></body></html>"
|
||||
}' | iconv -f iso-8859-1 -t UTF-8 -c -s
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsoff,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: rclsoff,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
@ -125,22 +125,27 @@ echo '</head><body><p>'
|
||||
echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\
|
||||
awk '
|
||||
BEGIN {
|
||||
esc = 1
|
||||
cont = ""
|
||||
}
|
||||
{
|
||||
if ($0 ~ /-$/) {
|
||||
sub(/-$/, "", $0)
|
||||
printf("%s", $0);
|
||||
} else if($0 == "\f") {
|
||||
printf("</p>\n<hr>\n<p>")
|
||||
} else {
|
||||
if(esc > 0) {
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
|
||||
if ($0 ~ /[-]$/) {
|
||||
# Note : soft-hyphen is iso8859 0xad
|
||||
# Break at last whitespace
|
||||
match($0, "[ \t][^ \t]+$")
|
||||
line = substr($0, 0, RSTART)
|
||||
cont = substr($0, RSTART, RLENGTH-1)
|
||||
$0 = line
|
||||
}
|
||||
printf("%s<br>", $0)
|
||||
}
|
||||
|
||||
if($0 == "\f") {
|
||||
print "</p>\n<hr>\n<p>"
|
||||
next
|
||||
}
|
||||
|
||||
print $0 "<br>"
|
||||
}
|
||||
END {
|
||||
printf("</p></body></html>\n");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user