rclpdf: also escape text inside meta content attributes

2014-08-25 14:16:45 +02:00 · 2014-08-25 14:16:45 +02:00 · 552eb0965b
commit 552eb0965b
parent 26c7e1c690
1 changed files with 19 additions and 10 deletions
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@ -1,22 +1,18 @@
 #!/bin/sh
 # @(#$Id: rclpdf,v 1.10 2007-07-12 17:13:38 dockes Exp $  (C) 2004 J.F.Dockes
 # This is copied almost verbatim from Estraier:
 #================================================================
 # Some parts are Copyright Estraier (GPL v2).
 # Estraier: a personal full-text search system
 # Copyright (C) 2003-2004 Mikio Hirabayashi
 # Copyright (C) 2014 J.F. Dockes
 # This file is licensed under the GPL v2
 #================================================================
 #================================================================
 # Convert a pdf file to  HTML.
 #
-# We use pdftotxt from the xpdf package. This does not perfect results as
+# We use pdftotext from the xpdf/poppler-utils package. Results are imperfect:
 # whitespace is sometimes either arbitrarily inserted or stripped from the
 # text. This seems to depend on the usage of option -raw, and,
 # unfortunately also of the document itself, so that there does not seem to
 # be a universally good solution.
 #
-# Also, the filter sometimes seems to output problematic utf-8. I did not
+# pdftotext sometimes outputs unescaped text inside HTML text sections.
-# check if it was actually incorrect or just mis-understood by qtextedit
+# We try to correct this.
 # (tobedone) 
 # Uncomment the following if you get better results without. The
 # pdftotext manual says that the option is no longer recommended. The
@ -133,6 +129,19 @@ awk 'BEGIN'\
    mid = "<title>" mid "</title>"
    $0 = part1 mid part2
  }
  # Escape the HTML special characters &, < and > inside the content="..."
  # attribute of a line ending the attribute with "/>", but only while
  # doescape is 0. pdftotext may emit the attribute value unescaped.
  if(doescape == 0 && match($0, /content=".*"\/>/)){
    # awk strings are indexed from 1. With a start index of 0, gawk and mawk
    # return one character less than requested, which dropped the character
    # just before content= (normally the separating space).
    part1 = substr($0, 1, RSTART-1)
    # Strip the leading content=" (9 characters) and the trailing "/> (3
    # characters) by position, so that identical sequences occurring inside
    # the value itself are kept instead of being deleted.
    mid = substr($0, RSTART + 9, RLENGTH - 12)
    part2 = substr($0, RSTART + RLENGTH)
    # The ampersand must be escaped first, else the entities produced by the
    # next two substitutions would be escaped again.
    gsub(/&/, "\\&amp;", mid)
    gsub(/</, "\\&lt;", mid)
    gsub(/>/, "\\&gt;", mid)
    $0 = part1 "content=\"" mid "\"/>" part2
  }
  # Recoll treats "Subject" as a "title" element (based on emails). The PDF
  # "Subject" metadata field is more like an HTML "description"