rclpdf: work with newer poppler versions, which do escape HTML text inside <head>
This commit is contained in:
parent
fd62105a9d
commit
922a9384f9
@ -127,67 +127,90 @@ trap cleanup EXIT HUP QUIT INT TERM
|
|||||||
|
|
||||||
runpdftotext()
|
runpdftotext()
|
||||||
{
|
{
|
||||||
|
# Test poppler version: at some point before 0.24, poppler began
|
||||||
|
# to properly escape text inside the header (but not the body).
|
||||||
|
XYZ=`pdftotext -v 2>&1 | awk '/pdftotext/{print $3}'`
|
||||||
|
MAJOR=`echo $XYZ | cut -d. -f 1`
|
||||||
|
MINOR=`echo $XYZ | cut -d. -f 2`
|
||||||
|
escapeheader=1
|
||||||
|
escapebody=1
|
||||||
|
if test "$MAJOR" -gt 0 ; then
|
||||||
|
escapeheader=0
|
||||||
|
elif test "$MINOR" -ge 24; then
|
||||||
|
escapeheader=0;
|
||||||
|
fi
|
||||||
|
|
||||||
# Run pdftotext and fix the result (add a charset tag and fix the
|
# Run pdftotext and fix the result (add a charset tag and fix the
|
||||||
# html escaping.
|
# html escaping). The escaping is a half-hearted job. We do try to
|
||||||
|
# fix some header fields, only for those which are single-line.
|
||||||
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
pdftotext $optionraw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
|
||||||
iconv -f UTF-8 -t UTF-8 -c -s |
|
iconv -f UTF-8 -t UTF-8 -c -s |
|
||||||
awk 'BEGIN'\
|
awk -v escapeheader=$escapeheader -v escapebody=$escapebody 'BEGIN'\
|
||||||
' {
|
' {
|
||||||
doescape = 0
|
inbodypre = 0
|
||||||
cont = ""
|
cont = ""
|
||||||
charsetmeta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
|
|
||||||
}
|
}
|
||||||
|
function escapehtml(s)
|
||||||
|
{
|
||||||
|
gsub(/&/, "\\&amp;", s)
|
||||||
|
gsub(/</, "\\&lt;", s)
|
||||||
|
gsub(/>/, "\\&gt;", s)
|
||||||
|
gsub(/"/, "\\&quot;", s)
|
||||||
|
return s
|
||||||
|
}
|
||||||
{
|
{
|
||||||
$0 = cont $0
|
$0 = cont $0
|
||||||
cont = ""
|
cont = ""
|
||||||
# Insert charset meta tag at end of header
|
# Insert charset meta tag at end of header
|
||||||
if(doescape == 0 && $0 ~ /<\/head>/) {
|
if(inbodypre == 0 && $0 ~ /<\/head>/) {
|
||||||
match($0, /<\/head>/)
|
match($0, /<\/head>/)
|
||||||
part1 = substr($0, 0, RSTART-1)
|
part1 = substr($0, 0, RSTART-1)
|
||||||
part2 = substr($0, RSTART, length($0))
|
part2 = substr($0, RSTART, length($0))
|
||||||
$0 = part1 charsetmeta part2
|
charsetmeta = "<meta http-equiv=\"Content-Type\" "\
|
||||||
|
"content=\"text/html; charset=UTF-8\">"
|
||||||
|
$0 = part1 charsetmeta "\n" part2
|
||||||
}
|
}
|
||||||
if(doescape == 0 && $0 ~ /<title>.*<\/title>/){
|
if(inbodypre == 0 && $0 ~ /<title>.*<\/title>/){
|
||||||
match($0, /<title>.*<\/title>/)
|
match($0, /<title>.*<\/title>/)
|
||||||
part1 = substr($0, 0, RSTART-1)
|
part1 = substr($0, 0, RSTART-1)
|
||||||
mid = substr($0, RSTART, RLENGTH)
|
mid = substr($0, RSTART, RLENGTH)
|
||||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
part2 = substr($0, RSTART + RLENGTH, length($0))
|
||||||
gsub(/<title>/, "", mid)
|
gsub(/<title>/, "", mid)
|
||||||
gsub(/<\/title>/, "", mid)
|
gsub(/<\/title>/, "", mid)
|
||||||
gsub(/&/, "\\&amp;", mid)
|
if (escapeheader) {
|
||||||
gsub(/</, "\\&lt;", mid)
|
mid = escapehtml(mid)
|
||||||
gsub(/>/, "\\&gt;", mid)
|
}
|
||||||
mid = "<title>" mid "</title>"
|
mid = "<title>" mid "</title>"
|
||||||
$0 = part1 mid part2
|
$0 = part1 mid part2
|
||||||
}
|
}
|
||||||
if(doescape == 0 && $0 ~ /content=".*"\/>/){
|
# This matches all single-line meta fields
|
||||||
|
if(inbodypre == 0 && $0 ~ /content=".*"\/>/){
|
||||||
match($0, /content=".*"\/>/)
|
match($0, /content=".*"\/>/)
|
||||||
part1 = substr($0, 0, RSTART-1)
|
part1 = substr($0, 0, RSTART-1)
|
||||||
mid = substr($0, RSTART, RLENGTH)
|
mid = substr($0, RSTART, RLENGTH)
|
||||||
part2 = substr($0, RSTART + RLENGTH, length($0))
|
part2 = substr($0, RSTART + RLENGTH, length($0))
|
||||||
gsub(/content="/, "", mid)
|
gsub(/content="/, "", mid)
|
||||||
gsub(/"\/>/, "", mid)
|
gsub(/"\/>/, "", mid)
|
||||||
gsub(/&/, "\\&amp;", mid)
|
if (escapeheader) {
|
||||||
gsub(/</, "\\&lt;", mid)
|
mid = escapehtml(mid)
|
||||||
gsub(/>/, "\\&gt;", mid)
|
}
|
||||||
mid = "content=\"" mid "\"/>"
|
mid = "content=\"" mid "\"/>"
|
||||||
$0 = part1 mid part2
|
$0 = part1 mid part2
|
||||||
}
|
}
|
||||||
|
|
||||||
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
||||||
# "Subject" metadata field is more like an HTML "description"
|
# "Subject" metadata field is more like an HTML "description"
|
||||||
if(doescape == 0 && $0 ~ /<meta ?name="Subject"/){
|
if(inbodypre == 0 && $0 ~ /<meta ?name="Subject"/){
|
||||||
gsub(/="Subject"/, "=\"Description\"", $0)
|
gsub(/="Subject"/, "=\"Description\"", $0)
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($0 == "<pre>"){
|
if ($0 == "<pre>"){
|
||||||
# Begin of body text. need to escape some chars from now on as
|
# Begin of body text.
|
||||||
# pdftotext sometimes doesnt do it
|
inbodypre++
|
||||||
doescape++
|
|
||||||
print $0
|
print $0
|
||||||
next
|
next
|
||||||
} else if ($0 ~ /<\/pre>/){
|
} else if ($0 ~ /<\/pre>/){
|
||||||
doescape--
|
inbodypre--
|
||||||
print $0
|
print $0
|
||||||
next
|
next
|
||||||
} else if ($0 ~ /[-]$/) {
|
} else if ($0 ~ /[-]$/) {
|
||||||
@ -198,15 +221,9 @@ runpdftotext()
|
|||||||
cont = substr($0, RSTART, RLENGTH-1)
|
cont = substr($0, RSTART, RLENGTH-1)
|
||||||
$0 = line
|
$0 = line
|
||||||
# print "LINE [" $0 "] CONT[" cont "]"
|
# print "LINE [" $0 "] CONT[" cont "]"
|
||||||
} else if($0 == "\f"){
|
}
|
||||||
$0 = "<hr>"
|
if(inbodypre > 0 && escapebody){
|
||||||
print
|
$0 = escapehtml($0)
|
||||||
next
|
|
||||||
}
|
|
||||||
if(doescape > 0){
|
|
||||||
gsub(/&/, "\\&amp;", $0)
|
|
||||||
gsub(/</, "\\&lt;", $0)
|
|
||||||
gsub(/>/, "\\&gt;", $0)
|
|
||||||
}
|
}
|
||||||
print $0
|
print $0
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user