renamed the html charset values to stick to omega usage
This commit is contained in:
parent
750d1c918d
commit
a2659b48e4
@ -32,7 +32,7 @@ using std::map;
|
|||||||
|
|
||||||
class HtmlParser {
|
class HtmlParser {
|
||||||
protected:
|
protected:
|
||||||
void decode_entities(string &s);
|
virtual void decode_entities(string &s);
|
||||||
bool in_script;
|
bool in_script;
|
||||||
string charset;
|
string charset;
|
||||||
static map<string, unsigned int> named_ents;
|
static map<string, unsigned int> named_ents;
|
||||||
|
|||||||
@ -70,7 +70,7 @@ bool MimeHandlerHtml::next_document()
|
|||||||
m_filename.erase();
|
m_filename.erase();
|
||||||
|
|
||||||
string charset = m_defcharset;
|
string charset = m_defcharset;
|
||||||
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",
|
LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
|
||||||
charset.c_str()));
|
charset.c_str()));
|
||||||
|
|
||||||
// - We first try to convert from the default configured charset
|
// - We first try to convert from the default configured charset
|
||||||
@ -79,14 +79,13 @@ bool MimeHandlerHtml::next_document()
|
|||||||
// - During parsing, if we find a charset parameter, and it differs from
|
// - During parsing, if we find a charset parameter, and it differs from
|
||||||
// what we started with, we abort and restart with the parameter value
|
// what we started with, we abort and restart with the parameter value
|
||||||
// instead of the configuration one.
|
// instead of the configuration one.
|
||||||
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
|
||||||
|
|
||||||
|
|
||||||
MyHtmlParser result;
|
MyHtmlParser result;
|
||||||
for (int pass = 0; pass < 2; pass++) {
|
for (int pass = 0; pass < 2; pass++) {
|
||||||
string transcoded;
|
string transcoded;
|
||||||
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
||||||
MyHtmlParser p;
|
MyHtmlParser p;
|
||||||
|
|
||||||
// Try transcoding. If it fails, use original text.
|
// Try transcoding. If it fails, use original text.
|
||||||
int ecnt;
|
int ecnt;
|
||||||
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
|
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
|
||||||
@ -94,21 +93,21 @@ bool MimeHandlerHtml::next_document()
|
|||||||
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
|
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
|
||||||
transcoded = m_html;
|
transcoded = m_html;
|
||||||
// We don't know the charset, at all
|
// We don't know the charset, at all
|
||||||
p.ocharset = p.charset = charset = "";
|
p.reset_charsets();
|
||||||
|
charset = "";
|
||||||
} else {
|
} else {
|
||||||
if (ecnt) {
|
if (ecnt) {
|
||||||
if (pass == 0) {
|
if (pass == 0) {
|
||||||
LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
|
LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
|
||||||
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
|
"[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
|
||||||
} else {
|
} else {
|
||||||
LOGERR(("textHtmlToDoc: final transcode had %d errors for "
|
LOGERR(("textHtmlToDoc: final transcode had %d errors for "
|
||||||
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
|
"[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// ocharset has the putative source charset, transcoded is now
|
// charset has the putative source charset, transcoded is now
|
||||||
// in utf-8
|
// in utf-8
|
||||||
p.ocharset = charset;
|
p.set_charsets(charset, "utf-8");
|
||||||
p.charset = "utf-8";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -118,14 +117,19 @@ bool MimeHandlerHtml::next_document()
|
|||||||
break;
|
break;
|
||||||
} catch (bool diag) {
|
} catch (bool diag) {
|
||||||
result = p;
|
result = p;
|
||||||
if (diag == true)
|
if (diag == true) {
|
||||||
|
// Parser throws true at end of text. ok
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
||||||
charset.c_str(),result.doccharset.c_str()));
|
charset.c_str(), result.get_charset().c_str()));
|
||||||
if (!result.doccharset.empty() &&
|
if (!result.get_charset().empty() &&
|
||||||
!samecharset(result.doccharset, result.ocharset)) {
|
!samecharset(result.get_charset(), result.fromcharset)) {
|
||||||
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
||||||
charset = result.doccharset;
|
// Set the origin charset as specified in document before
|
||||||
|
// transcoding again
|
||||||
|
charset = result.get_charset();
|
||||||
} else {
|
} else {
|
||||||
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
||||||
return false;
|
return false;
|
||||||
@ -133,7 +137,7 @@ bool MimeHandlerHtml::next_document()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
m_metaData["origcharset"] = m_defcharset;
|
m_metaData["origcharset"] = result.get_charset();
|
||||||
m_metaData["content"] = result.dump;
|
m_metaData["content"] = result.dump;
|
||||||
m_metaData["charset"] = "utf-8";
|
m_metaData["charset"] = "utf-8";
|
||||||
// Avoid setting empty values which would crush ones possibly inherited
|
// Avoid setting empty values which would crush ones possibly inherited
|
||||||
|
|||||||
@ -158,6 +158,10 @@ MyHtmlParser::MyHtmlParser()
|
|||||||
pending_space(false),
|
pending_space(false),
|
||||||
indexing_allowed(true)
|
indexing_allowed(true)
|
||||||
{
|
{
|
||||||
|
// The default html document charset is iso-8859-1. We'll update
|
||||||
|
// this value from the encoding tag if found.
|
||||||
|
charset = "iso-8859-1";
|
||||||
|
|
||||||
if (my_named_ents.empty()) {
|
if (my_named_ents.empty()) {
|
||||||
for (int i = 0;;) {
|
for (int i = 0;;) {
|
||||||
const char *ent;
|
const char *ent;
|
||||||
@ -175,11 +179,11 @@ MyHtmlParser::MyHtmlParser()
|
|||||||
|
|
||||||
void MyHtmlParser::decode_entities(string &s)
|
void MyHtmlParser::decode_entities(string &s)
|
||||||
{
|
{
|
||||||
LOGDEB(("MyHtmlParser::decode_entities\n"));
|
LOGDEB2(("MyHtmlParser::decode_entities\n"));
|
||||||
// This has no meaning whatsoever if the character encoding is unknown,
|
// This has no meaning whatsoever if the character encoding is unknown,
|
||||||
// so don't do it. If charset known, caller has converted text to utf-8,
|
// so don't do it. If charset known, caller has converted text to utf-8,
|
||||||
// and this is also how we translate entities
|
// and this is also how we translate entities
|
||||||
// if (charset != "utf-8")
|
// if (tocharset != "utf-8")
|
||||||
// return;
|
// return;
|
||||||
|
|
||||||
// We need a const_iterator version of s.end() - otherwise the
|
// We need a const_iterator version of s.end() - otherwise the
|
||||||
@ -378,12 +382,12 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
map<string, string>::const_iterator k;
|
map<string, string>::const_iterator k;
|
||||||
if ((k = p.params.find("charset")) !=
|
if ((k = p.params.find("charset")) !=
|
||||||
p.params.end()) {
|
p.params.end()) {
|
||||||
doccharset = k->second;
|
charset = k->second;
|
||||||
if (!samecharset(doccharset, ocharset)) {
|
if (!samecharset(charset, fromcharset)) {
|
||||||
LOGDEB1(("Doc specified charset '%s' "
|
LOGDEB1(("Doc specified charset '%s' "
|
||||||
"differs from announced '%s'\n",
|
"differs from dir deflt '%s'\n",
|
||||||
doccharset.c_str(),
|
charset.c_str(),
|
||||||
ocharset.c_str()));
|
fromcharset.c_str()));
|
||||||
throw false;
|
throw false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -504,7 +508,7 @@ MyHtmlParser::closing_tag(const string &tag)
|
|||||||
break;
|
break;
|
||||||
case 't':
|
case 't':
|
||||||
if (tag == "title") {
|
if (tag == "title") {
|
||||||
if (meta["title"].empty()) {
|
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
||||||
meta["title"] = dump;
|
meta["title"] = dump;
|
||||||
dump = "";
|
dump = "";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,15 +42,32 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
map<string,string> meta;
|
map<string,string> meta;
|
||||||
static map<string, string> my_named_ents;
|
static map<string, string> my_named_ents;
|
||||||
string dump, dmtime;
|
string dump, dmtime;
|
||||||
string ocharset; // This is the charset our user thinks the doc was
|
// This is the charset our caller thinks the doc used (initially
|
||||||
// charset is declared by HtmlParser
|
// comes from the environment/configuration, used as source for
|
||||||
//string charset; // This is the charset it was supposedly converted to
|
// conversion to utf-8)
|
||||||
string doccharset; // Set this to value of charset parameter in header
|
string fromcharset;
|
||||||
|
// This is the charset it was supposedly converted to (always
|
||||||
|
// utf-8 in fact, except if conversion utterly failed)
|
||||||
|
string tocharset;
|
||||||
|
// charset is declared by HtmlParser. It is the charset from the
|
||||||
|
// document: default, then from html or xml header.
|
||||||
|
// string charset;
|
||||||
|
|
||||||
bool indexing_allowed;
|
bool indexing_allowed;
|
||||||
|
|
||||||
void process_text(const string &text);
|
void process_text(const string &text);
|
||||||
void opening_tag(const string &tag, const map<string,string> &p);
|
void opening_tag(const string &tag, const map<string,string> &p);
|
||||||
void closing_tag(const string &tag);
|
void closing_tag(const string &tag);
|
||||||
void do_eof();
|
void do_eof();
|
||||||
void decode_entities(string &s);
|
void decode_entities(string &s);
|
||||||
|
void reset_charsets() {fromcharset = tocharset = "";}
|
||||||
|
void set_charsets(const string& f, const string& t)
|
||||||
|
{
|
||||||
|
fromcharset = f;
|
||||||
|
tocharset = t;
|
||||||
|
}
|
||||||
|
// Return charset as determined from html
|
||||||
|
const string& get_charset() {return charset;}
|
||||||
|
|
||||||
MyHtmlParser();
|
MyHtmlParser();
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user