new version of pdftotext broke us
This commit is contained in:
parent
95c89880e6
commit
f208fe9cc8
@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclpdf,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# This is copied almost verbatim from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
@ -65,8 +65,6 @@ iconv -f UTF-8 -t UTF-8 -c -s |
|
||||
awk '
|
||||
BEGIN {
|
||||
esc = 0
|
||||
mul = 1
|
||||
emp = 0
|
||||
}
|
||||
{
|
||||
if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
|
||||
@ -77,10 +75,11 @@ BEGIN {
|
||||
gsub(/>/, "\\>", $0)
|
||||
printf("<title>%s</title>\n", $0)
|
||||
} else if($0 == "<pre>"){
|
||||
# Beginning of body text. We need to escape some chars from now on as
|
||||
# pdftotext sometimes does not do it
|
||||
esc++
|
||||
printf("<p>")
|
||||
mul = 1
|
||||
} else if($0 == "</pre>"){
|
||||
} else if ($0 ~ /<\/pre>/){
|
||||
esc--
|
||||
printf("</p>\n")
|
||||
} else if($0 ~ /-$/){
|
||||
@ -100,9 +99,4 @@ BEGIN {
|
||||
}
|
||||
}
|
||||
'
|
||||
# Suppressed code 2 lines above (at the last print $0), which seemed to
|
||||
# deal with multibyte characters being cut by a newline. It caused problems
|
||||
# (sometimes concatenated the last word of a line with the first of the next), and I
|
||||
# didn't really understand its use as iconv -c is supposed to fix the
|
||||
# encoding anyway
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user