fix to output <br> when needed + other misc pbs

2006-01-27 13:37:31 +00:00 · 2006-01-27 13:37:31 +00:00 · b46f99c955
commit b46f99c955
parent a2db1d5386
4 changed files with 138 additions and 72 deletions
--- a/src/filters/rcldoc
+++ b/src/filters/rcldoc
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rcldoc,v 1.4 2005-10-20 15:42:29 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rcldoc,v 1.5 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # Parts taken from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -20,7 +20,7 @@
 LANG=C ; export LANG
 LC_ALL=C ; export LC_ALL
 progname="rcldoc"
-decoder="antiword -i -1 -m UTF-8"
+decoder="antiword -t -i 1 -m UTF-8"
 # Not ready to use this for now (it outputs html, so the code below has to
 # be simplified.)
 #decoder="wvWare -1 -c UTF-8"
@ -72,28 +72,36 @@ fi
 $decoder "$infile" |
 awk '
 BEGIN {
-  printf("<html><head><title></title>\n")
+  print "<html><head><title></title>"
-  printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
+  print "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">"
-  printf("</head>\n<body><p>");
+  print "</head>\n<body>\n<p>"
-  esc = 1
+  cont = ""
 }
 {
-  if ($0 ~ /-$/) {
+  $0 = cont $0
-    sub(/-$/, "", $0)
+  cont = ""
-    printf("%s", $0);
+
-  } else if($0 == "\f") {
+  if ($0 ~ /[-]$/) {
-    printf("</p>\n<hr>\n<p>")
+    # Note : soft-hyphen is iso8859 0xad
-  } else {
+    # Break at last whitespace
-    if(esc > 0) {
+    match($0, "[ \t][^ \t]+$")
-      gsub(/&/, "\\&amp;", $0)
+    line = substr($0, 0, RSTART)
-      gsub(/</, "\\&lt;", $0)
+    cont = substr($0, RSTART, RLENGTH-1)
-      gsub(/>/, "\\&gt;", $0)
+    $0 = line
    }
    print $0
  }
  if($0 == "\f") {
    print "</p><hr><p>"
    next
  } 
  gsub(/&/, "\\&amp;", $0)
  gsub(/</, "\\&lt;", $0)
  gsub(/>/, "\\&gt;", $0)
  print $0 "<br>"
 }
 END {
-    printf("</p></body></html>\n");
+    print "</p></body></html>"
 }' | iconv -f UTF-8 -t UTF-8 -c -s 
 # exit normally
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclpdf,v 1.5 2005-12-02 16:17:55 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rclpdf,v 1.6 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # This is copied almost verbatim from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -7,10 +7,26 @@
 #================================================================
 #================================================================
 # rclpdf
-# Strip a file of PDF and extract its text as HTML.
+# Convert a pdf file to  HTML.
-#================================================================
+#
 # We use pdftotext from the xpdf package. This does not give perfect results
 # as whitespace is sometimes either arbitrarily inserted or stripped from the
 # text. This seems to depend on the usage of option -raw and, unfortunately,
 # also on the document itself, so that there does not seem to be a
 # universally good solution.
 #
 # Also, the filter sometimes seems to output problematic utf-8. I did not
 # check if it was actually incorrect or just misunderstood by qtextedit
 # (to be done).
 #
 # In any case, for example, the code emitted for an fi ligature (correct or
 # not, I did not check) should be replaced with f and i characters as this
 # is what will get searched for.
 # Comment out the following line if you get better results without it
 optionraw=-raw
 # set variables
 LANG=C ; export LANG
 LC_ALL=C ; export LC_ALL
@ -59,44 +75,70 @@ checkcmds()
 }
 checkcmds pdftotext iconv awk
-# output the result
+# Run pdftotext and fix the result (add a charset tag and fix the html escaping)
-pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
+pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
 iconv -f UTF-8 -t UTF-8 -c -s |
 awk '
 BEGIN {
-  esc = 0
+  doescape = 0
  cont = ""
  charsetmeta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
 }
 {
-  if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
+  $0 = cont $0
-    printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
+  cont = ""
-    gsub(/<[^>]*>/, "", $0)
+  # Insert charset meta tag at end of header
-    gsub(/&/, "\\&amp;", $0)
+  if(doescape == 0 && $0 ~ /<\/head>/) {
-    gsub(/</, "\\&lt;", $0)
+    match($0, /<\/head>/)
-    gsub(/>/, "\\&gt;", $0)
+    part1 = substr($0, 0, RSTART-1)
-    printf("<title>%s</title>\n", $0)
+    part2 = substr($0, RSTART, length($0))
-  } else if($0 == "<pre>"){
+    $0 =  part1 charsetmeta part2
  }
  if(doescape == 0 && $0 ~ /<title>.*<\/title>/){
    match($0, /<title>.*<\/title>/)
    part1 = substr($0, 0, RSTART-1)
    mid = substr($0, RSTART, RLENGTH)
    part2 = substr($0, RSTART + RLENGTH, length($0))
    gsub(/<title>/, "", mid)
    gsub(/<\/title>/, "", mid)
    gsub(/&/, "\\&amp;", mid)
    gsub(/</, "\\&lt;", mid)
    gsub(/>/, "\\&gt;", mid)
    mid = "<title>" mid "</title>"
    $0 = part1 mid part2
  } 
  if ($0 == "<pre>"){
    # Beginning of body text. We need to escape some chars from now on as
    # pdftotext sometimes does not do it
-    esc++
+    doescape++
-    printf("<p>")
+    print $0
    next
  } else if ($0 ~ /<\/pre>/){
-    esc--
+    doescape--
-    printf("</p>\n")
+    print $0 
-  } else if($0 ~ /-$/){
+    next
-    sub(/-$/, "", $0)
+  } else if ($0 ~ /[-]$/) {
-    printf("%s", $0);
+    # Note : soft-hyphen is iso8859 0xad
    # Break at last whitespace
    match($0, "[ \t][^ \t]+$")
    line = substr($0, 0, RSTART)
    cont = substr($0, RSTART, RLENGTH-1)
    $0 = line
    # print "LINE [" $0 "] CONT[" cont "]"
  } else if($0 == "\f"){
-    printf("</p>\n<hr>\n<p>")
+    $0 = "<hr>"
-  } else {
+    print 
-    if(esc > 0){
+    next
  }
  if(doescape > 0){
      gsub(/&/, "\\&amp;", $0)
      gsub(/</, "\\&lt;", $0)
      gsub(/>/, "\\&gt;", $0)
      gsub(/^ */, "", $0)
      gsub(/ *$/, "", $0)
    }
    print $0
  }
  print $0
 }
 ' 
--- a/src/filters/rclps
+++ b/src/filters/rclps
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclps,v 1.4 2005-10-20 15:42:29 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rclps,v 1.5 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # Parts taken from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -9,7 +9,8 @@
 # rclps
 # Extract text from a postscript file by executing pstotext or ps2ascii. 
 #
-# The default is to use pstotext which can deal with accents.
+# The default is to use pstotext which can deal with accents, but in a
 # partially broken way (it always outputs iso8859-1, when it should use UTF-8).
 #
 # OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes works
 # better (e.g. on some OpenOffice output files).
@ -74,24 +75,34 @@ BEGIN {
  printf("<html><head><title></title>\n")
  printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
  printf("</head>\n<body><p>");
-  esc = 1
+  doescape = 1
  cont = ""
 }
 {
-  if ($0 ~ /-$/) {
+    $0 = cont $0
-    sub(/-$/, "", $0)
+    cont = ""
-    printf("%s", $0);
+
-  } else if($0 == "\f") {
+    if ($0 == "\f") {
-    printf("</p>\n<hr>\n<p>")
+       print "</p>\n<hr>\n<p>"
-  } else {
+       next
-    if(esc > 0) {
+    } else if ($0 ~ /$/) {
      # Note : soft-hyphen is iso8859 0xad
      # Break at last whitespace
      match($0, "[ \t][^ \t]+$")
      line = substr($0, 0, RSTART)
      cont = substr($0, RSTART, RLENGTH)
      $0 = line
      gsub("", "", cont)
    }
    if(doescape > 0) {
      gsub(/&/, "\\&amp;", $0)
      gsub(/</, "\\&lt;", $0)
      gsub(/>/, "\\&gt;", $0)
    }
-    print $0
+    print $0 "<br>"
  }
 }
 END {
-    printf("</p></body></html>\n");
+    print "</p></body></html>"
 }' | iconv -f iso-8859-1 -t UTF-8 -c -s 
--- a/src/filters/rclsoff
+++ b/src/filters/rclsoff
@ -1,5 +1,5 @@
 #!/bin/sh
-# @(#$Id: rclsoff,v 1.4 2005-10-20 15:42:29 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: rclsoff,v 1.5 2006-01-27 13:37:31 dockes Exp $  (C) 2004 J.F.Dockes
 # Parts taken from Estraier:
 #================================================================
 # Estraier: a personal full-text search system
@ -125,22 +125,27 @@ echo '</head><body><p>'
 echo "$content" | sed -e "s/&apos;/'/g" -e 's/&quot;/"/g' |\
 awk '
 BEGIN {
-  esc = 1
+  cont = ""
 }
 {
-  if ($0 ~ /-$/) {
+    $0 = cont $0
-    sub(/-$/, "", $0)
+    cont = ""
-    printf("%s", $0);
+
-  } else if($0 == "\f") {
+    if ($0 ~ /[-]$/) {
-    printf("</p>\n<hr>\n<p>")
+      # Note : soft-hyphen is iso8859 0xad
-  } else {
+      # Break at last whitespace
-    if(esc > 0) {
+      match($0, "[ \t][^ \t]+$")
-      gsub(/&/, "\\&amp;", $0)
+      line = substr($0, 0, RSTART)
-      gsub(/</, "\\&lt;", $0)
+      cont = substr($0, RSTART, RLENGTH-1)
-      gsub(/>/, "\\&gt;", $0)
+      $0 = line
    }
-    printf("%s<br>", $0)
+
-  }
+    if($0 == "\f") {
        print "</p>\n<hr>\n<p>"
        next
    } 
    print $0 "<br>"
 }
 END {
    printf("</p></body></html>\n");