diff --git a/src/filters/rcldoc b/src/filters/rcldoc index 251ff539..08e0b440 100755 --- a/src/filters/rcldoc +++ b/src/filters/rcldoc @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -20,7 +20,7 @@ LANG=C ; export LANG LC_ALL=C ; export LC_ALL progname="rcldoc" -decoder="antiword -i -1 -m UTF-8" +decoder="antiword -t -i 1 -m UTF-8" # Not ready to use this for now (it outputs html, so the code below has to # be simplified.) #decoder="wvWare -1 -c UTF-8" @@ -72,28 +72,36 @@ fi $decoder "$infile" | awk ' BEGIN { - printf("\n") - printf("\n") - printf("\n

"); - esc = 1 + print "" + print "" + print "\n\n

" + cont = "" } { - if ($0 ~ /-$/) { - sub(/-$/, "", $0) - printf("%s", $0); - } else if($0 == "\f") { - printf("

\n
\n

") - } else { - if(esc > 0) { - gsub(/&/, "\\&", $0) - gsub(//, "\\>", $0) - } - print $0 + $0 = cont $0 + cont = "" + + if ($0 ~ /[­-]$/) { + # Note : soft-hyphen is iso8859 0xad + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH-1) + $0 = line } + + if($0 == "\f") { + print "


" + next + } + gsub(/&/, "\\&", $0) + gsub(//, "\\>", $0) + + print $0 "
" } END { - printf("

\n"); + print "

" }' | iconv -f UTF-8 -t UTF-8 -c -s # exit normally diff --git a/src/filters/rclpdf b/src/filters/rclpdf index 9d49251b..7ea12703 100755 --- a/src/filters/rclpdf +++ b/src/filters/rclpdf @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclpdf,v 1.6 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes # This is copied almost verbatim from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -7,10 +7,26 @@ #================================================================ #================================================================ # rclpdf -# Strip a file of PDF and extract its text as HTML. -#================================================================ +# Convert a pdf file to HTML. +# +# We use pdftotxt from the xpdf package. This does not perfect results as +# whitespace is sometimes either arbitrarily inserted or stripped from the +# text. This seems to depend on the usage of option -raw, and, +# unfortunately also of the document itself, so that there does not seem to +# be an universally good solution +# +# Also, the filter sometimes seems to output problematic utf-8. I did not +# check if it was actually incorrect or just mis-understood by qtextedit +# (tobedone) +# +# In any case, for example, the code emitted for an fi ligature (correct or +# not, I did not check) should be replaced with f and i characters as this +# is what will get searched for. 
+# Comment the following if you get better results without +optionraw=-raw + # set variables LANG=C ; export LANG LC_ALL=C ; export LC_ALL @@ -59,44 +75,70 @@ checkcmds() } checkcmds pdftotext iconv awk -# output the result -pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | +# Run pdftotext and fix the result (add a charset tag and fix the html escaping +pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | iconv -f UTF-8 -t UTF-8 -c -s | awk ' BEGIN { - esc = 0 + doescape = 0 + cont = "" + charsetmeta = "" } { - if(esc < 1 && $0 ~ /^/ && $0 ~ /title>$/){ - printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n") - gsub(/<[^>]*>/, "", $0) - gsub(/&/, "\\&", $0) - gsub(/</, "\\<", $0) - gsub(/>/, "\\>", $0) - printf("<title>%s\n", $0) - } else if($0 == "
"){
+  $0 = cont $0   # prepend the text held back from the previous input line
+  cont = ""
+  # Insert charset meta tag at end of header (while doescape is still 0)
+  if(doescape == 0 && match($0, /<\/head>/)) {
+    # match() sets RSTART to the 1-based position of "</head>" in the line.
+    # awk strings start at 1: a start of 0 loses one character on gawk/mawk.
+    part1 = substr($0, 1, RSTART-1)
+    part2 = substr($0, RSTART)
+    $0 =  part1 charsetmeta part2
+  }
+  if(doescape == 0 && match($0, /<title>.*<\/title>/)){
+    # Rebuild the title element with &, < and > in its text HTML-escaped.
+    part1 = substr($0, 1, RSTART-1)   # 1-based: start 0 loses a char on gawk
+    mid = substr($0, RSTART, RLENGTH)
+    part2 = substr($0, RSTART + RLENGTH)
+    gsub(/<title>/, "", mid)
+    gsub(/<\/title>/, "", mid)
+    gsub(/&/, "\\&amp;", mid)   # "\\&" yields a literal "&" in a gsub replacement
+    gsub(/</, "\\&lt;", mid)
+    gsub(/>/, "\\&gt;", mid)
+    mid = "<title>" mid "</title>"
+    $0 = part1 mid part2
+  } 
+
+  if ($0 == "
"){
     # Begin of body text. need to escape some chars from now on as 
     # pdftotext sometimes doesnt do it
-    esc++
-    printf("

") + doescape++ + print $0 + next } else if ($0 ~ /<\/pre>/){ - esc-- - printf("

\n") - } else if($0 ~ /-$/){ - sub(/-$/, "", $0) - printf("%s", $0); + doescape-- + print $0 + next + } else if ($0 ~ /[­-]$/) { + # Note : soft-hyphen is iso8859 0xad + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH-1) + $0 = line + # print "LINE [" $0 "] CONT[" cont "]" } else if($0 == "\f"){ - printf("

\n
\n

") - } else { - if(esc > 0){ + $0 = "


" + print + next + } + if(doescape > 0){ gsub(/&/, "\\&", $0) gsub(//, "\\>", $0) gsub(/^ */, "", $0) gsub(/ *$/, "", $0) - } - print $0 } + print $0 } ' diff --git a/src/filters/rclps b/src/filters/rclps index 8d70dc94..ffde92d7 100755 --- a/src/filters/rclps +++ b/src/filters/rclps @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -9,7 +9,8 @@ # rclps # Extract text from a postscript file by executing pstotext or ps2ascii. # -# The default is to use pstotext which can deal with accents. +# The default is to use pstotext which can deal with accents, but in a +# partially broken way (it always outputs iso8859-1, when it should use utf. # # OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes work # better (ie: on some openoffice output files). @@ -74,24 +75,34 @@ BEGIN { printf("\n") printf("\n") printf("\n

"); - esc = 1 + doescape = 1 + cont = "" } { - if ($0 ~ /-$/) { - sub(/-$/, "", $0) - printf("%s", $0); - } else if($0 == "\f") { - printf("

\n
\n

") - } else { - if(esc > 0) { + $0 = cont $0 + cont = "" + + if ($0 == "\f") { + print "

\n
\n

" + next + } else if ($0 ~ /­$/) { + # Note : soft-hyphen is iso8859 0xad + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH) + $0 = line + gsub("­", "", cont) + } + + if(doescape > 0) { gsub(/&/, "\\&", $0) gsub(//, "\\>", $0) } - print $0 - } + print $0 "
" } END { - printf("

\n"); + print "

" }' | iconv -f iso-8859-1 -t UTF-8 -c -s diff --git a/src/filters/rclsoff b/src/filters/rclsoff index e2934497..ec011390 100755 --- a/src/filters/rclsoff +++ b/src/filters/rclsoff @@ -1,5 +1,5 @@ #!/bin/sh -# @(#$Id: rclsoff,v 1.4 2005-10-20 15:42:29 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: rclsoff,v 1.5 2006-01-27 13:37:31 dockes Exp $ (C) 2004 J.F.Dockes # Parts taken from Estraier: #================================================================ # Estraier: a personal full-text search system @@ -125,22 +125,27 @@ echo '

' echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ awk ' BEGIN { - esc = 1 + cont = "" } { - if ($0 ~ /-$/) { - sub(/-$/, "", $0) - printf("%s", $0); - } else if($0 == "\f") { - printf("

\n
\n

") - } else { - if(esc > 0) { - gsub(/&/, "\\&", $0) - gsub(//, "\\>", $0) + $0 = cont $0 + cont = "" + + if ($0 ~ /[­-]$/) { + # Note : soft-hyphen is iso8859 0xad + # Break at last whitespace + match($0, "[ \t][^ \t]+$") + line = substr($0, 0, RSTART) + cont = substr($0, RSTART, RLENGTH-1) + $0 = line } - printf("%s
", $0) - } + + if($0 == "\f") { + print "

\n
\n

" + next + } + + print $0 "
" } END { printf("

\n");