new version of pdftotext broke us
This commit is contained in:
parent
95c89880e6
commit
f208fe9cc8
@ -1,5 +1,5 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# @(#$Id: rclpdf,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
|
# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes
|
||||||
# This is copied almost verbatim from Estraier:
|
# This is copied almost verbatim from Estraier:
|
||||||
#================================================================
|
#================================================================
|
||||||
# Estraier: a personal full-text search system
|
# Estraier: a personal full-text search system
|
||||||
@ -65,8 +65,6 @@ iconv -f UTF-8 -t UTF-8 -c -s |
|
|||||||
awk '
|
awk '
|
||||||
BEGIN {
|
BEGIN {
|
||||||
esc = 0
|
esc = 0
|
||||||
mul = 1
|
|
||||||
emp = 0
|
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
|
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
|
||||||
@ -77,10 +75,11 @@ BEGIN {
|
|||||||
gsub(/>/, "\\>", $0)
|
gsub(/>/, "\\>", $0)
|
||||||
printf("<title>%s</title>\n", $0)
|
printf("<title>%s</title>\n", $0)
|
||||||
} else if($0 == "<pre>"){
|
} else if($0 == "<pre>"){
|
||||||
|
# Beginning of body text. Need to escape some chars from now on as
|
||||||
|
# pdftotext sometimes does not do it
|
||||||
esc++
|
esc++
|
||||||
printf("<p>")
|
printf("<p>")
|
||||||
mul = 1
|
} else if ($0 ~ /<\/pre>/){
|
||||||
} else if($0 == "</pre>"){
|
|
||||||
esc--
|
esc--
|
||||||
printf("</p>\n")
|
printf("</p>\n")
|
||||||
} else if($0 ~ /-$/){
|
} else if($0 ~ /-$/){
|
||||||
@ -100,9 +99,4 @@ BEGIN {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
'
|
'
|
||||||
# Suppressed code 2 lines above (at the last print $0), which seemed to
|
|
||||||
# deal with multibyte characters being cut by a newline? It caused problems
|
|
||||||
# (sometimes concatenated last word of a line with first of next, and I
|
|
||||||
# didn't really understand its use as iconv -c is supposed to fix the
|
|
||||||
# encoding anyway).
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user