fix to output <br> when needed + other misc pbs

2006-01-27 13:37:31 +00:00 · 2006-01-27 13:37:31 +00:00 · b46f99c955
commit b46f99c955
parent a2db1d5386
4 changed files with 138 additions and 72 deletions
--- a/src/filters/rcldoc
+++ b/src/filters/rcldoc
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # Parts taken from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -20,7 +20,7 @@
 LANG=C ; export LANG
 LC_ALL=C ; export LC_ALL
 progname="rcldoc"
-decoder="antiword -i -1 -m UTF-8"
+decoder="antiword -t -i 1 -m UTF-8"
 # Not ready to use this for now (it outputs html, so the code below has to
 # be simplified.)
 #decoder="wvWare -1 -c UTF-8"
@ -72,28 +72,36 @@ fi
 $decoder "$infile" |
 awk '
 BEGIN {
-  printf("<html><head><title></title>\n")
-  printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
-  printf("</head>\n<body><p>");
-  esc = 1
+  print "<html><head><title></title>"
+  print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
+  print "</head>\n<body>\n<p>"
+  cont = ""
 }
 {
-  if ($0 ~ /-$/) {
-    sub(/-$/, "", $0)
-    printf("%s", $0);
-  } else if($0 == "\f") {
-    printf("</p>\n<hr>\n<p>")
-  } else {
-    if(esc > 0) {
-      gsub(/&/, "\\&amp;", $0)
-      gsub(/</, "\\&lt;", $0)
-      gsub(/>/, "\\&gt;", $0)
-    }
-    print $0
+  $0 = cont $0
+  cont = ""
+
+  if ($0 ~ /[-]$/) {
+    # Note : soft-hyphen is iso8859 0xad
+    # Break at last whitespace
+    match($0, "[ \t][^ \t]+$")
+    line = substr($0, 0, RSTART)
+    cont = substr($0, RSTART, RLENGTH-1)
+    $0 = line
  }
+
+  if($0 == "\f") {
+    print "</p><hr><p>"
+    next
+  } 
+  gsub(/&/, "\\&amp;", $0)
+  gsub(/</, "\\&lt;", $0)
+  gsub(/>/, "\\&gt;", $0)
+
+  print $0 "<br>"
 }
 END {
-    printf("</p></body></html>\n");
+    print "</p></body></html>"
 }' | iconv -f UTF-8 -t UTF-8 -c -s 

 # exit normally
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rclpdf,v 1.6 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # This is copied almost verbatim from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -7,10 +7,26 @@
 #================================================================
 #================================================================
 # rclpdf
-# Strip a file of PDF and extract its text as HTML.
-#================================================================
+# Convert a pdf file to  HTML.
+#
+# We use pdftotext from the xpdf package. This does not give perfect results,
+# as whitespace is sometimes either arbitrarily inserted or stripped from the
+# text. This seems to depend on the usage of option -raw and,
+# unfortunately, also on the document itself, so that there does not seem to
+# be a universally good solution.
+#
+# Also, the filter sometimes seems to output problematic utf-8. I did not
+# check if it was actually incorrect or just misunderstood by qtextedit
+# (to be done).
+#
+# In any case, for example, the code emitted for an fi ligature (correct or
+# not, I did not check) should be replaced with f and i characters as this
+# is what will get searched for.


+# Comment the following if you get better results without
+optionraw=-raw
+
 # set variables
 LANG=C ; export LANG
 LC_ALL=C ; export LC_ALL
@ -59,44 +75,70 @@ checkcmds()
 }
 checkcmds pdftotext iconv awk

-# output the result
-pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
+# Run pdftotext and fix the result (add a charset tag and fix the html escaping)
+pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
 iconv -f UTF-8 -t UTF-8 -c -s |
 awk '
 BEGIN {
-  esc = 0
+  doescape = 0
+  cont = ""
+  charsetmeta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
 }
 {
-  if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
-    printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
-    gsub(/<[^>]*>/, "", $0)
-    gsub(/&/, "\\&amp;", $0)
-    gsub(/</, "\\&lt;", $0)
-    gsub(/>/, "\\&gt;", $0)
-    printf("<title>%s</title>\n", $0)
-  } else if($0 == "<pre>"){
+  $0 = cont $0
+  cont = ""
+  # Insert charset meta tag at end of header
+  if(doescape == 0 && $0 ~ /<\/head>/) {
+    match($0, /<\/head>/)
+    part1 = substr($0, 0, RSTART-1)
+    part2 = substr($0, RSTART, length($0))
+    $0 =  part1 charsetmeta part2
+  }
+  if(doescape == 0 && $0 ~ /<title>.*<\/title>/){
+    match($0, /<title>.*<\/title>/)
+    part1 = substr($0, 0, RSTART-1)
+    mid = substr($0, RSTART, RLENGTH)
+    part2 = substr($0, RSTART + RLENGTH, length($0))
+    gsub(/<title>/, "", mid)
+    gsub(/<\/title>/, "", mid)
+    gsub(/&/, "\\&amp;", mid)
+    gsub(/</, "\\&lt;", mid)
+    gsub(/>/, "\\&gt;", mid)
+    mid = "<title>" mid "</title>"
+    $0 = part1 mid part2
+  } 
+
+  if ($0 == "<pre>"){
    # Begin of body text. need to escape some chars from now on as 
    # pdftotext sometimes doesnt do it
-    esc++
-    printf("<p>")
+    doescape++
+    print $0
+    next
  } else if ($0 ~ /<\/pre>/){
-    esc--
-    printf("</p>\n")
-  } else if($0 ~ /-$/){
-    sub(/-$/, "", $0)
-    printf("%s", $0);
+    doescape--
+    print $0 
+    next
+  } else if ($0 ~ /[-]$/) {
+    # Note : soft-hyphen is iso8859 0xad
+    # Break at last whitespace
+    match($0, "[ \t][^ \t]+$")
+    line = substr($0, 0, RSTART)
+    cont = substr($0, RSTART, RLENGTH-1)
+    $0 = line
+    # print "LINE [" $0 "] CONT[" cont "]"
  } else if($0 == "\f"){
-    printf("</p>\n<hr>\n<p>")
-  } else {
-    if(esc > 0){
+    $0 = "<hr>"
+    print 
+    next
+  }
+  if(doescape > 0){
      gsub(/&/, "\\&amp;", $0)
      gsub(/</, "\\&lt;", $0)
      gsub(/>/, "\\&gt;", $0)
      gsub(/^ */, "", $0)
      gsub(/ *$/, "", $0)
-    }
-    print $0
  }
+  print $0
 }
 ' 

--- a/src/filters/rclps
+++ b/src/filters/rclps
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # Parts taken from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -9,7 +9,8 @@
 # rclps
 # Extract text from a postscript file by executing pstotext or ps2ascii. 
 #
-# The default is to use pstotext which can deal with accents.
+# The default is to use pstotext which can deal with accents, but in a
+# partially broken way (it always outputs iso8859-1, when it should use UTF-8).
 #
 # OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes work
 # better (ie: on some openoffice output files).
@ -74,24 +75,34 @@ BEGIN {
  printf("<html><head><title></title>\n")
  printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
  printf("</head>\n<body><p>");
-  esc = 1
+  doescape = 1
+  cont = ""
 }
 {
-  if ($0 ~ /-$/) {
-    sub(/-$/, "", $0)
-    printf("%s", $0);
-  } else if($0 == "\f") {
-    printf("</p>\n<hr>\n<p>")
-  } else {
-    if(esc > 0) {
+    $0 = cont $0
+    cont = ""
+
+    if ($0 == "\f") {
+       print "</p>\n<hr>\n<p>"
+       next
+    } else if ($0 ~ /$/) {
+      # Note : soft-hyphen is iso8859 0xad
+      # Break at last whitespace
+      match($0, "[ \t][^ \t]+$")
+      line = substr($0, 0, RSTART)
+      cont = substr($0, RSTART, RLENGTH)
+      $0 = line
+      gsub("", "", cont)
+    }
+
+    if(doescape > 0) {
      gsub(/&/, "\\&amp;", $0)
      gsub(/</, "\\&lt;", $0)
      gsub(/>/, "\\&gt;", $0)
    }
-    print $0
-  }
+    print $0 "<br>"
 }
 END {
-    printf("</p></body></html>\n");
+    print "</p></body></html>"
 }' | iconv -f iso-8859-1 -t UTF-8 -c -s 

--- a/src/filters/rclsoff
+++ b/src/filters/rclsoff
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclsoff,v 1.4 2005-10-20 15:42:29 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rclsoff,v 1.5 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # Parts taken from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -125,22 +125,27 @@ echo '</head><body><p>'
 echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
 awk '
 BEGIN {
-  esc = 1
+  cont = ""
 }
 {
-  if ($0 ~ /-$/) {
-    sub(/-$/, "", $0)
-    printf("%s", $0);
-  } else if($0 == "\f") {
-    printf("</p>\n<hr>\n<p>")
-  } else {
-    if(esc > 0) {
-      gsub(/&/, "\\&amp;", $0)
-      gsub(/</, "\\&lt;", $0)
-      gsub(/>/, "\\&gt;", $0)
+    $0 = cont $0
+    cont = ""
+
+    if ($0 ~ /[-]$/) {
+      # Note : soft-hyphen is iso8859 0xad
+      # Break at last whitespace
+      match($0, "[ \t][^ \t]+$")
+      line = substr($0, 0, RSTART)
+      cont = substr($0, RSTART, RLENGTH-1)
+      $0 = line
    }
-    printf("%s<br>", $0)
-  }
+
+    if($0 == "\f") {
+        print "</p>\n<hr>\n<p>"
+        next
+    } 
+
+    print $0 "<br>"
 }
 END {
    printf("</p></body></html>\n");