new version of pdftotext broke us

This commit is contained in:
dockes 2005-12-02 16:17:55 +00:00
parent 95c89880e6
commit f208fe9cc8

View File

@ -1,5 +1,5 @@
#!/bin/sh
# @(#$Id: rclpdf,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes
# This is copied almost verbatim from Estraier:
#================================================================
# Estraier: a personal full-text search system
@ -65,8 +65,6 @@ iconv -f UTF-8 -t UTF-8 -c -s |
awk '
BEGIN {
esc = 0
mul = 1
emp = 0
}
{
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
@ -77,10 +75,11 @@ BEGIN {
gsub(/>/, "\\&gt;", $0)
printf("<title>%s</title>\n", $0)
} else if($0 == "<pre>"){
# Start of body text. We need to escape some characters from now on, as
# pdftotext sometimes does not do it
esc++
printf("<p>")
mul = 1
} else if($0 == "</pre>"){
} else if ($0 ~ /<\/pre>/){
esc--
printf("</p>\n")
} else if($0 ~ /-$/){
@ -100,9 +99,4 @@ BEGIN {
}
}
'
# Removed the code 2 lines above (at the last print $0), which seemed to
# deal with multibyte characters being cut by a newline. It caused problems
# (it sometimes concatenated the last word of a line with the first word of
# the next), and I didn't really understand its purpose, as iconv -c is
# supposed to fix the encoding anyway.