Fix to output <br> when needed + other misc. problems

This commit is contained in:
dockes 2006-01-27 13:37:31 +00:00
parent a2db1d5386
commit b46f99c955
4 changed files with 138 additions and 72 deletions

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier: # Parts taken from Estraier:
#================================================================ #================================================================
# Estraier: a personal full-text search system # Estraier: a personal full-text search system
@ -20,7 +20,7 @@
LANG=C ; export LANG LANG=C ; export LANG
LC_ALL=C ; export LC_ALL LC_ALL=C ; export LC_ALL
progname="rcldoc" progname="rcldoc"
decoder="antiword -i -1 -m UTF-8" decoder="antiword -t -i 1 -m UTF-8"
# Not ready to use this for now (it outputs html, so the code below has to # Not ready to use this for now (it outputs html, so the code below has to
# be simplified.) # be simplified.)
#decoder="wvWare -1 -c UTF-8" #decoder="wvWare -1 -c UTF-8"
@ -72,28 +72,36 @@ fi
$decoder "$infile" | $decoder "$infile" |
awk ' awk '
BEGIN { BEGIN {
printf("<html><head><title></title>\n") print "<html><head><title></title>"
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n") print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
printf("</head>\n<body><p>"); print "</head>\n<body>\n<p>"
esc = 1 cont = ""
} }
{ {
if ($0 ~ /-$/) { $0 = cont $0
sub(/-$/, "", $0) cont = ""
printf("%s", $0);
} else if($0 == "\f") { if ($0 ~ /[­-]$/) {
printf("</p>\n<hr>\n<p>") # Note : soft-hyphen is iso8859 0xad
} else { # Break at last whitespace
if(esc > 0) { match($0, "[ \t][^ \t]+$")
gsub(/&/, "\\&amp;", $0) line = substr($0, 0, RSTART)
gsub(/</, "\\&lt;", $0) cont = substr($0, RSTART, RLENGTH-1)
gsub(/>/, "\\&gt;", $0) $0 = line
}
print $0
} }
if($0 == "\f") {
print "</p><hr><p>"
next
}
gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0)
print $0 "<br>"
} }
END { END {
printf("</p></body></html>\n"); print "</p></body></html>"
}' | iconv -f UTF-8 -t UTF-8 -c -s }' | iconv -f UTF-8 -t UTF-8 -c -s
# exit normally # exit normally

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: rclpdf,v 1.6 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# This is copied almost verbatim from Estraier: # This is copied almost verbatim from Estraier:
#================================================================ #================================================================
# Estraier: a personal full-text search system # Estraier: a personal full-text search system
@ -7,10 +7,26 @@
#================================================================ #================================================================
#================================================================ #================================================================
# rclpdf # rclpdf
# Strip a file of PDF and extract its text as HTML. # Convert a pdf file to HTML.
#================================================================ #
# We use pdftotext from the xpdf package. This does not give perfect results, as
# whitespace is sometimes either arbitrarily inserted or stripped from the
# text. This seems to depend on the use of option -raw and,
# unfortunately, also on the document itself, so there does not seem to
# be a universally good solution.
#
# Also, the filter sometimes seems to output problematic utf-8. I did not
# check if it was actually incorrect or just misunderstood by qtextedit
# (to be done).
#
# In any case, for example, the code emitted for an fi ligature (correct or
# not, I did not check) should be replaced with f and i characters as this
# is what will get searched for.
# Comment out the following line if you get better results without -raw
optionraw=-raw
# set variables # set variables
LANG=C ; export LANG LANG=C ; export LANG
LC_ALL=C ; export LC_ALL LC_ALL=C ; export LC_ALL
@ -59,44 +75,70 @@ checkcmds()
} }
checkcmds pdftotext iconv awk checkcmds pdftotext iconv awk
# output the result # Run pdftotext and fix the result (add a charset tag and fix the html escaping)
pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
iconv -f UTF-8 -t UTF-8 -c -s | iconv -f UTF-8 -t UTF-8 -c -s |
awk ' awk '
BEGIN { BEGIN {
esc = 0 doescape = 0
cont = ""
charsetmeta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
} }
{ {
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){ $0 = cont $0
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n") cont = ""
gsub(/<[^>]*>/, "", $0) # Insert charset meta tag at end of header
gsub(/&/, "\\&amp;", $0) if(doescape == 0 && $0 ~ /<\/head>/) {
gsub(/</, "\\&lt;", $0) match($0, /<\/head>/)
gsub(/>/, "\\&gt;", $0) part1 = substr($0, 0, RSTART-1)
printf("<title>%s</title>\n", $0) part2 = substr($0, RSTART, length($0))
} else if($0 == "<pre>"){ $0 = part1 charsetmeta part2
}
if(doescape == 0 && $0 ~ /<title>.*<\/title>/){
match($0, /<title>.*<\/title>/)
part1 = substr($0, 0, RSTART-1)
mid = substr($0, RSTART, RLENGTH)
part2 = substr($0, RSTART + RLENGTH, length($0))
gsub(/<title>/, "", mid)
gsub(/<\/title>/, "", mid)
gsub(/&/, "\\&amp;", mid)
gsub(/</, "\\&lt;", mid)
gsub(/>/, "\\&gt;", mid)
mid = "<title>" mid "</title>"
$0 = part1 mid part2
}
if ($0 == "<pre>"){
# Begin of body text. need to escape some chars from now on as # Begin of body text. need to escape some chars from now on as
# pdftotext sometimes doesnt do it # pdftotext sometimes doesnt do it
esc++ doescape++
printf("<p>") print $0
next
} else if ($0 ~ /<\/pre>/){ } else if ($0 ~ /<\/pre>/){
esc-- doescape--
printf("</p>\n") print $0
} else if($0 ~ /-$/){ next
sub(/-$/, "", $0) } else if ($0 ~ /[­-]$/) {
printf("%s", $0); # Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH-1)
$0 = line
# print "LINE [" $0 "] CONT[" cont "]"
} else if($0 == "\f"){ } else if($0 == "\f"){
printf("</p>\n<hr>\n<p>") $0 = "<hr>"
} else { print
if(esc > 0){ next
}
if(doescape > 0){
gsub(/&/, "\\&amp;", $0) gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0) gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0) gsub(/>/, "\\&gt;", $0)
gsub(/^ */, "", $0) gsub(/^ */, "", $0)
gsub(/ *$/, "", $0) gsub(/ *$/, "", $0)
}
print $0
} }
print $0
} }
' '

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier: # Parts taken from Estraier:
#================================================================ #================================================================
# Estraier: a personal full-text search system # Estraier: a personal full-text search system
@ -9,7 +9,8 @@
# rclps # rclps
# Extract text from a postscript file by executing pstotext or ps2ascii. # Extract text from a postscript file by executing pstotext or ps2ascii.
# #
# The default is to use pstotext which can deal with accents. # The default is to use pstotext which can deal with accents, but in a
# partially broken way (it always outputs iso8859-1, when it should use utf-8).
# #
# OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes works # OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes works
# better (ie: on some openoffice output files). # better (ie: on some openoffice output files).
@ -74,24 +75,34 @@ BEGIN {
printf("<html><head><title></title>\n") printf("<html><head><title></title>\n")
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n") printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
printf("</head>\n<body><p>"); printf("</head>\n<body><p>");
esc = 1 doescape = 1
cont = ""
} }
{ {
if ($0 ~ /-$/) { $0 = cont $0
sub(/-$/, "", $0) cont = ""
printf("%s", $0);
} else if($0 == "\f") { if ($0 == "\f") {
printf("</p>\n<hr>\n<p>") print "</p>\n<hr>\n<p>"
} else { next
if(esc > 0) { } else if ($0 ~ /­$/) {
# Note : soft-hyphen is iso8859 0xad
# Break at last whitespace
match($0, "[ \t][^ \t]+$")
line = substr($0, 0, RSTART)
cont = substr($0, RSTART, RLENGTH)
$0 = line
gsub("­", "", cont)
}
if(doescape > 0) {
gsub(/&/, "\\&amp;", $0) gsub(/&/, "\\&amp;", $0)
gsub(/</, "\\&lt;", $0) gsub(/</, "\\&lt;", $0)
gsub(/>/, "\\&gt;", $0) gsub(/>/, "\\&gt;", $0)
} }
print $0 print $0 "<br>"
}
} }
END { END {
printf("</p></body></html>\n"); print "</p></body></html>"
}' | iconv -f iso-8859-1 -t UTF-8 -c -s }' | iconv -f iso-8859-1 -t UTF-8 -c -s

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
# @(#$Id: rclsoff,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes # @(#$Id: rclsoff,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes
# Parts taken from Estraier: # Parts taken from Estraier:
#================================================================ #================================================================
# Estraier: a personal full-text search system # Estraier: a personal full-text search system
@ -125,22 +125,27 @@ echo '</head><body><p>'
echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\ echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
awk ' awk '
BEGIN { BEGIN {
esc = 1 cont = ""
} }
{ {
if ($0 ~ /-$/) { $0 = cont $0
sub(/-$/, "", $0) cont = ""
printf("%s", $0);
} else if($0 == "\f") { if ($0 ~ /[­-]$/) {
printf("</p>\n<hr>\n<p>") # Note : soft-hyphen is iso8859 0xad
} else { # Break at last whitespace
if(esc > 0) { match($0, "[ \t][^ \t]+$")
gsub(/&/, "\\&amp;", $0) line = substr($0, 0, RSTART)
gsub(/</, "\\&lt;", $0) cont = substr($0, RSTART, RLENGTH-1)
gsub(/>/, "\\&gt;", $0) $0 = line
} }
printf("%s<br>", $0)
} if($0 == "\f") {
print "</p>\n<hr>\n<p>"
next
}
print $0 "<br>"
} }
END { END {
printf("</p></body></html>\n"); printf("</p></body></html>\n");