Take care of pathological charset declarations with an empty value

This commit is contained in:
Jean-Francois Dockes 2012-11-26 11:40:08 +01:00
parent 9ba04fc9c7
commit 6457fb4100

View File

@@ -181,8 +181,9 @@ MyHtmlParser::MyHtmlParser()
indexing_allowed(true) indexing_allowed(true)
{ {
// The default html document charset is iso-8859-1. We'll update // The default html document charset is iso-8859-1. We'll update
// this value from the encoding tag if found. // this value from the encoding tag if found. Actually use cp1252 which
charset = "iso-8859-1"; // is a superset
charset = "CP1252";
} }
void MyHtmlParser::decode_entities(string &s) void MyHtmlParser::decode_entities(string &s)
@@ -402,7 +403,8 @@ MyHtmlParser::opening_tag(const string &tag)
if ((k = p.params.find(cstr_html_charset)) != if ((k = p.params.find(cstr_html_charset)) !=
p.params.end()) { p.params.end()) {
charset = k->second; charset = k->second;
if (!samecharset(charset, fromcharset)) { if (!charset.empty() &&
!samecharset(charset, fromcharset)) {
LOGDEB1(("Doc http-equiv charset '%s' " LOGDEB1(("Doc http-equiv charset '%s' "
"differs from dir deflt '%s'\n", "differs from dir deflt '%s'\n",
charset.c_str(), charset.c_str(),
@@ -418,7 +420,8 @@ MyHtmlParser::opening_tag(const string &tag)
// HTML5 added: <meta charset="..."> // HTML5 added: <meta charset="...">
lowercase_term(newcharset); lowercase_term(newcharset);
charset = newcharset; charset = newcharset;
if (!samecharset(charset, fromcharset)) { if (!charset.empty() &&
!samecharset(charset, fromcharset)) {
LOGDEB1(("Doc html5 charset '%s' " LOGDEB1(("Doc html5 charset '%s' "
"differs from dir deflt '%s'\n", "differs from dir deflt '%s'\n",
charset.c_str(), charset.c_str(),