diff --git a/src/filters/rclpdf b/src/filters/rclpdf index 8186dad4..9d49251b 100755 --- a/src/filters/rclpdf +++ b/src/filters/rclpdf @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclpdf,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes # This is copied almost verbatim from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -65,8 +65,6 @@ iconv -f UTF-8 -t UTF-8 -c -s | awk ' BEGIN { esc = 0 - mul = 1 - emp = 0 } { if(esc < 1 && $0 ~ /^
"){
+ # Start of the body text. From here on we need to escape some characters,
+ # as pdftotext sometimes does not do it itself.
esc++
printf("")
- mul = 1
- } else if($0 == "
"){
+ } else if ($0 ~ /<\/pre>/){
esc--
printf("\n")
} else if($0 ~ /-$/){
@@ -100,9 +99,4 @@ BEGIN {
}
}
'
-# Suppressed code 2 lines above (at the last print $0), which seemed to
-# deal with multibyte character being cut by a newline ? It caused problems
-# (sometimes concatenated last word of a line with first of next, and I
-# didn't really understand its use as iconv -c is supposed to fix the
-# encoding anyway