renamed the html charset values to stick to omega usage

This commit is contained in:
dockes 2007-06-19 12:17:07 +00:00
parent 750d1c918d
commit a2659b48e4
4 changed files with 53 additions and 28 deletions

View File

@ -32,7 +32,7 @@ using std::map;
class HtmlParser { class HtmlParser {
protected: protected:
void decode_entities(string &s); virtual void decode_entities(string &s);
bool in_script; bool in_script;
string charset; string charset;
static map<string, unsigned int> named_ents; static map<string, unsigned int> named_ents;

View File

@ -70,7 +70,7 @@ bool MimeHandlerHtml::next_document()
m_filename.erase(); m_filename.erase();
string charset = m_defcharset; string charset = m_defcharset;
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
charset.c_str())); charset.c_str()));
// - We first try to convert from the default configured charset // - We first try to convert from the default configured charset
@ -79,14 +79,13 @@ bool MimeHandlerHtml::next_document()
// - During parsing, if we find a charset parameter, and it differs from // - During parsing, if we find a charset parameter, and it differs from
// what we started with, we abort and restart with the parameter value // what we started with, we abort and restart with the parameter value
// instead of the configuration one. // instead of the configuration one.
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
MyHtmlParser result; MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) { for (int pass = 0; pass < 2; pass++) {
string transcoded; string transcoded;
LOGDEB(("Html::mkDoc: pass %d\n", pass)); LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p; MyHtmlParser p;
// Try transcoding. If it fails, use original text. // Try transcoding. If it fails, use original text.
int ecnt; int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
@ -94,21 +93,21 @@ bool MimeHandlerHtml::next_document()
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str())); "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
transcoded = m_html; transcoded = m_html;
// We don't know the charset, at all // We don't know the charset, at all
p.ocharset = p.charset = charset = ""; p.reset_charsets();
charset = "";
} else { } else {
if (ecnt) { if (ecnt) {
if (pass == 0) { if (pass == 0) {
LOGDEB(("textHtmlToDoc: init transcode had %d errors for " LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str())); "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
} else { } else {
LOGERR(("textHtmlToDoc: final transcode had %d errors for " LOGERR(("textHtmlToDoc: final transcode had %d errors for "
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str())); "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
} }
} }
// ocharset has the putative source charset, transcoded is now // charset has the putative source charset, transcoded is now
// in utf-8 // in utf-8
p.ocharset = charset; p.set_charsets(charset, "utf-8");
p.charset = "utf-8";
} }
try { try {
@ -118,14 +117,19 @@ bool MimeHandlerHtml::next_document()
break; break;
} catch (bool diag) { } catch (bool diag) {
result = p; result = p;
if (diag == true) if (diag == true) {
// Parser throws true at end of text. ok
break; break;
}
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
charset.c_str(),result.doccharset.c_str())); charset.c_str(), result.get_charset().c_str()));
if (!result.doccharset.empty() && if (!result.get_charset().empty() &&
!samecharset(result.doccharset, result.ocharset)) { !samecharset(result.get_charset(), result.fromcharset)) {
LOGDEB(("textHtmlToDoc: reparse for charsets\n")); LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
charset = result.doccharset; // Set the origin charset as specified in document before
// transcoding again
charset = result.get_charset();
} else { } else {
LOGERR(("textHtmlToDoc:: error: non charset exception\n")); LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
return false; return false;
@ -133,7 +137,7 @@ bool MimeHandlerHtml::next_document()
} }
} }
m_metaData["origcharset"] = m_defcharset; m_metaData["origcharset"] = result.get_charset();
m_metaData["content"] = result.dump; m_metaData["content"] = result.dump;
m_metaData["charset"] = "utf-8"; m_metaData["charset"] = "utf-8";
// Avoid setting empty values which would crush ones possibly inherited // Avoid setting empty values which would crush ones possibly inherited

View File

@ -158,6 +158,10 @@ MyHtmlParser::MyHtmlParser()
pending_space(false), pending_space(false),
indexing_allowed(true) indexing_allowed(true)
{ {
// The default html document charset is iso-8859-1. We'll update
// this value from the encoding tag if found.
charset = "iso-8859-1";
if (my_named_ents.empty()) { if (my_named_ents.empty()) {
for (int i = 0;;) { for (int i = 0;;) {
const char *ent; const char *ent;
@ -175,11 +179,11 @@ MyHtmlParser::MyHtmlParser()
void MyHtmlParser::decode_entities(string &s) void MyHtmlParser::decode_entities(string &s)
{ {
LOGDEB(("MyHtmlParser::decode_entities\n")); LOGDEB2(("MyHtmlParser::decode_entities\n"));
// This has no meaning whatsoever if the character encoding is unknown, // This has no meaning whatsoever if the character encoding is unknown,
// so don't do it. If charset known, caller has converted text to utf-8, // so don't do it. If charset known, caller has converted text to utf-8,
// and this is also how we translate entities // and this is also how we translate entities
// if (charset != "utf-8") // if (tocharset != "utf-8")
// return; // return;
// We need a const_iterator version of s.end() - otherwise the // We need a const_iterator version of s.end() - otherwise the
@ -378,12 +382,12 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
map<string, string>::const_iterator k; map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != if ((k = p.params.find("charset")) !=
p.params.end()) { p.params.end()) {
doccharset = k->second; charset = k->second;
if (!samecharset(doccharset, ocharset)) { if (!samecharset(charset, fromcharset)) {
LOGDEB1(("Doc specified charset '%s' " LOGDEB1(("Doc specified charset '%s' "
"differs from announced '%s'\n", "differs from dir deflt '%s'\n",
doccharset.c_str(), charset.c_str(),
ocharset.c_str())); fromcharset.c_str()));
throw false; throw false;
} }
} }
@ -504,7 +508,7 @@ MyHtmlParser::closing_tag(const string &tag)
break; break;
case 't': case 't':
if (tag == "title") { if (tag == "title") {
if (meta["title"].empty()) { if (meta.find("title") == meta.end()|| meta["title"].empty()) {
meta["title"] = dump; meta["title"] = dump;
dump = ""; dump = "";
} }

View File

@ -42,15 +42,32 @@ class MyHtmlParser : public HtmlParser {
map<string,string> meta; map<string,string> meta;
static map<string, string> my_named_ents; static map<string, string> my_named_ents;
string dump, dmtime; string dump, dmtime;
string ocharset; // This is the charset our user thinks the doc was // This is the charset our caller thinks the doc used (initially
// charset is declared by HtmlParser // comes from the environment/configuration, used as source for
//string charset; // This is the charset it was supposedly converted to // conversion to utf-8)
string doccharset; // Set this to value of charset parameter in header string fromcharset;
// This is the charset it was supposedly converted to (always
// utf-8 in fact, except if conversion utterly failed)
string tocharset;
// charset is declared by HtmlParser. It is the charset from the
// document: default, then from html or xml header.
// string charset;
bool indexing_allowed; bool indexing_allowed;
void process_text(const string &text); void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p); void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag); void closing_tag(const string &tag);
void do_eof(); void do_eof();
void decode_entities(string &s); void decode_entities(string &s);
// Forget both the source and the target charset (sets each to the
// empty string).
void reset_charsets()
{
    fromcharset.clear();
    tocharset.clear();
}
// Store the charset pair for this parse: f goes to fromcharset and
// t goes to tocharset.
void set_charsets(const string& f, const string& t)
{
    fromcharset.assign(f);
    tocharset.assign(t);
}
// Return the charset member (declared by HtmlParser), i.e. the
// charset as determined from the html. Const-qualified because it
// only reads state, so it can also be called through a const
// MyHtmlParser; the returned reference is valid only while this
// object lives.
const string& get_charset() const {return charset;}
MyHtmlParser(); MyHtmlParser();
}; };