rclpdf: also escape text inside meta content attributes
This commit is contained in:
parent
26c7e1c690
commit
552eb0965b
@ -1,22 +1,18 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# @(#$Id: rclpdf,v 1.10 2007-07-12 17:13:38 dockes Exp $ (C) 2004 J.F.Dockes
|
|
||||||
# This is copied almost verbatim from Estraier:
|
|
||||||
#================================================================
|
#================================================================
|
||||||
|
# Some parts are Copyright Estraier (GPL v2).
|
||||||
# Estraier: a personal full-text search system
|
# Estraier: a personal full-text search system
|
||||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||||
|
# Copyright (C) 2014 J.F. Dockes
|
||||||
|
# This file is licensed under the GPL v2
|
||||||
#================================================================
|
#================================================================
|
||||||
#================================================================
|
#================================================================
|
||||||
# Convert a pdf file to HTML.
|
# Convert a pdf file to HTML.
|
||||||
#
|
#
|
||||||
# We use pdftotxt from the xpdf package. This does not perfect results as
|
# We use pdftotext from the xpdf/poppler-utils package.
|
||||||
# whitespace is sometimes either arbitrarily inserted or stripped from the
|
|
||||||
# text. This seems to depend on the usage of option -raw, and,
|
|
||||||
# unfortunately also of the document itself, so that there does not seem to
|
|
||||||
# be an universally good solution
|
|
||||||
#
|
#
|
||||||
# Also, the filter sometimes seems to output problematic utf-8. I did not
|
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
||||||
# check if it was actually incorrect or just mis-understood by qtextedit
|
# We try to correct this.
|
||||||
# (tobedone)
|
|
||||||
|
|
||||||
# Uncomment the following if you get better results without. The
|
# Uncomment the following if you get better results without. The
|
||||||
# pdftotext manual says that the option is no longer recommended. The
|
# pdftotext manual says that the option is no longer recommended. The
|
||||||
@ -133,6 +129,19 @@ awk 'BEGIN'\
|
|||||||
mid = "<title>" mid "</title>"
|
mid = "<title>" mid "</title>"
|
||||||
$0 = part1 mid part2
|
$0 = part1 mid part2
|
||||||
}
|
}
|
||||||
|
if(doescape == 0 && $0 ~ /content=".*"\/>/){
|
||||||
|
match($0, /content=".*"\/>/)
|
||||||
|
part1 = substr($0, 0, RSTART-1)
|
||||||
|
mid = substr($0, RSTART, RLENGTH)
|
||||||
|
part2 = substr($0, RSTART + RLENGTH, length($0))
|
||||||
|
gsub(/content="/, "", mid)
|
||||||
|
gsub(/"\/>/, "", mid)
|
||||||
|
gsub(/&/, "\\&", mid)
|
||||||
|
gsub(/</, "\\<", mid)
|
||||||
|
gsub(/>/, "\\>", mid)
|
||||||
|
mid = "content=\"" mid "\"/>"
|
||||||
|
$0 = part1 mid part2
|
||||||
|
}
|
||||||
|
|
||||||
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
||||||
# "Subject" metadata field is more like an HTML "description"
|
# "Subject" metadata field is more like an HTML "description"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user