From 922a9384f94b8d9b357f8f07b4bcd21042169e39 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 30 Jun 2015 10:35:22 +0200 Subject: [PATCH] rclpdf: work with newer poppler version which do escape html text inside --- src/filters/rclpdf | 73 ++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/src/filters/rclpdf b/src/filters/rclpdf index e44b3093..3e11ea6d 100755 --- a/src/filters/rclpdf +++ b/src/filters/rclpdf @@ -127,67 +127,90 @@ trap cleanup EXIT HUP QUIT INT TERM runpdftotext() { + # Test poppler version: at some point before 0.24, poppler began + # to properly escape text inside the header (but not the body). + XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'` + MAJOR=`echo $XYZ | cut -d. -f 1` + MINOR=`echo $XYZ | cut -d. -f 2` + escapeheader=1 + escapebody=1 + if test "$MAJOR" -gt 0 ; then + escapeheader=0 + elif test "$MINOR" -ge 24; then + escapeheader=0; + fi + # Run pdftotext and fix the result (add a charset tag and fix the - # html escaping. + # html escaping). The escaping is a half-hearted job. We do try to + # fix some header fields, only for those which are single-line. pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - | iconv -f UTF-8 -t UTF-8 -c -s | - awk 'BEGIN'\ + awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\ ' { - doescape = 0 + inbodypre = 0 cont = "" - charsetmeta = "" } +function escapehtml(s) +{ + gsub(/&/, "\\&", s) + gsub(//, "\\>", s) + gsub(/"/, "\\"", s) + return s +} { $0 = cont $0 cont = "" # Insert charset meta tag at end of header - if(doescape == 0 && $0 ~ /<\/head>/) { + if(inbodypre == 0 && $0 ~ /<\/head>/) { match($0, /<\/head>/) part1 = substr($0, 0, RSTART-1) part2 = substr($0, RSTART, length($0)) - $0 = part1 charsetmeta part2 + charsetmeta = "" + $0 = part1 charsetmeta "\n" part2 } - if(doescape == 0 && $0 ~ /.*<\/title>/){ + if(inbodypre == 0 && $0 ~ /<title>.*<\/title>/){ match($0, /<title>.*<\/title>/) part1 = substr($0, 0, RSTART-1) mid = substr($0, RSTART, RLENGTH) part2 = substr($0, RSTART + RLENGTH, length($0)) gsub(/<title>/, "", mid) gsub(/<\/title>/, "", mid) - gsub(/&/, "\\&", mid) - gsub(/</, "\\<", mid) - gsub(/>/, "\\>", mid) + if (escapeheader) { + mid = escapehtml(mid) + } mid = "<title>" mid "" $0 = part1 mid part2 } - if(doescape == 0 && $0 ~ /content=".*"\/>/){ + # This matches all single-line meta fields + if(inbodypre == 0 && $0 ~ /content=".*"\/>/){ match($0, /content=".*"\/>/) part1 = substr($0, 0, RSTART-1) mid = substr($0, RSTART, RLENGTH) part2 = substr($0, RSTART + RLENGTH, length($0)) gsub(/content="/, "", mid) gsub(/"\/>/, "", mid) - gsub(/&/, "\\&", mid) - gsub(//, "\\>", mid) + if (escapeheader) { + mid = escapehtml(mid) + } mid = "content=\"" mid "\"/>" $0 = part1 mid part2 } # Recoll treats "Subject" as a "title" element (based on emails). The PDF # "Subject" metadata field is more like an HTML "description" - if(doescape == 0 && $0 ~ /"){ - # Begin of body text. need to escape some chars from now on as - # pdftotext sometimes doesnt do it - doescape++ + # Begin of body text. + inbodypre++ print $0 next } else if ($0 ~ /<\/pre>/){ - doescape-- + inbodypre-- print $0 next } else if ($0 ~ /[­-]$/) { @@ -198,15 +221,9 @@ runpdftotext() cont = substr($0, RSTART, RLENGTH-1) $0 = line # print "LINE [" $0 "] CONT[" cont "]" - } else if($0 == "\f"){ - $0 = "
" - print - next - } - if(doescape > 0){ - gsub(/&/, "\\&", $0) - gsub(//, "\\>", $0) + } + if(inbodypre > 0 && escapebody){ + $0 = escapehtml($0) } print $0 }