From a2659b48e49ad9c27313e6e06335f7cd226dd0aa Mon Sep 17 00:00:00 2001 From: dockes Date: Tue, 19 Jun 2007 12:17:07 +0000 Subject: [PATCH] renamed the html charset values to stick to omega usage --- src/internfile/htmlparse.h | 2 +- src/internfile/mh_html.cpp | 34 +++++++++++++++++++--------------- src/internfile/myhtmlparse.cpp | 20 ++++++++++++-------- src/internfile/myhtmlparse.h | 25 +++++++++++++++++++++---- 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/src/internfile/htmlparse.h b/src/internfile/htmlparse.h index 12bdf6d8..0c6a1309 100644 --- a/src/internfile/htmlparse.h +++ b/src/internfile/htmlparse.h @@ -32,7 +32,7 @@ using std::map; class HtmlParser { protected: - void decode_entities(string &s); + virtual void decode_entities(string &s); bool in_script; string charset; static map named_ents; diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 788e1993..29844033 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -70,7 +70,7 @@ bool MimeHandlerHtml::next_document() m_filename.erase(); string charset = m_defcharset; - LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", + LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n", charset.c_str())); // - We first try to convert from the default configured charset @@ -79,14 +79,13 @@ bool MimeHandlerHtml::next_document() // - During parsing, if we find a charset parameter, and it differs from // what we started with, we abort and restart with the parameter value // instead of the configuration one. - LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); - MyHtmlParser result; for (int pass = 0; pass < 2; pass++) { string transcoded; LOGDEB(("Html::mkDoc: pass %d\n", pass)); MyHtmlParser p; + // Try transcoding. If it fails, use original text. int ecnt; if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { @@ -94,21 +93,21 @@ bool MimeHandlerHtml::next_document() "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str())); transcoded = m_html; // We don't know the charset, at all - p.ocharset = p.charset = charset = ""; + p.reset_charsets(); + charset = ""; } else { if (ecnt) { if (pass == 0) { LOGDEB(("textHtmlToDoc: init transcode had %d errors for " - "[%s]", ecnt, fn.empty()?"unknown":fn.c_str())); + "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str())); } else { LOGERR(("textHtmlToDoc: final transcode had %d errors for " - "[%s]", ecnt, fn.empty()?"unknown":fn.c_str())); + "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str())); } } - // ocharset has the putative source charset, transcoded is now + // charset has the putative source charset, transcoded is now // in utf-8 - p.ocharset = charset; - p.charset = "utf-8"; + p.set_charsets(charset, "utf-8"); } try { @@ -118,14 +117,19 @@ bool MimeHandlerHtml::next_document() break; } catch (bool diag) { result = p; - if (diag == true) + if (diag == true) { + // Parser throws true at end of text. ok break; + } + LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", - charset.c_str(),result.doccharset.c_str())); - if (!result.doccharset.empty() && - !samecharset(result.doccharset, result.ocharset)) { + charset.c_str(), result.get_charset().c_str())); + if (!result.get_charset().empty() && + !samecharset(result.get_charset(), result.fromcharset)) { LOGDEB(("textHtmlToDoc: reparse for charsets\n")); - charset = result.doccharset; + // Set the origin charset as specified in document before + // transcoding again + charset = result.get_charset(); } else { LOGERR(("textHtmlToDoc:: error: non charset exception\n")); return false; @@ -133,7 +137,7 @@ bool MimeHandlerHtml::next_document() } } - m_metaData["origcharset"] = m_defcharset; + m_metaData["origcharset"] = result.get_charset(); m_metaData["content"] = result.dump; m_metaData["charset"] = "utf-8"; // Avoid setting empty values which would crush ones possibly inherited diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index d9b2e4a4..6b661630 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -158,6 +158,10 @@ MyHtmlParser::MyHtmlParser() pending_space(false), indexing_allowed(true) { + // The default html document charset is iso-8859-1. We'll update + // this value from the encoding tag if found. + charset = "iso-8859-1"; + if (my_named_ents.empty()) { for (int i = 0;;) { const char *ent; @@ -175,11 +179,11 @@ MyHtmlParser::MyHtmlParser() void MyHtmlParser::decode_entities(string &s) { - LOGDEB(("MyHtmlParser::decode_entities\n")); + LOGDEB2(("MyHtmlParser::decode_entities\n")); // This has no meaning whatsoever if the character encoding is unknown, // so don't do it. If charset known, caller has converted text to utf-8, // and this is also how we translate entities - // if (charset != "utf-8") + // if (tocharset != "utf-8") // return; // We need a const_iterator version of s.end() - otherwise the @@ -378,12 +382,12 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) map::const_iterator k; if ((k = p.params.find("charset")) != p.params.end()) { - doccharset = k->second; - if (!samecharset(doccharset, ocharset)) { + charset = k->second; + if (!samecharset(charset, fromcharset)) { LOGDEB1(("Doc specified charset '%s' " - "differs from announced '%s'\n", - doccharset.c_str(), - ocharset.c_str())); + "differs from dir deflt '%s'\n", + charset.c_str(), + fromcharset.c_str())); throw false; } } @@ -504,7 +508,7 @@ MyHtmlParser::closing_tag(const string &tag) break; case 't': if (tag == "title") { - if (meta["title"].empty()) { + if (meta.find("title") == meta.end()|| meta["title"].empty()) { meta["title"] = dump; dump = ""; } diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index f12e6864..141a2cd1 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -42,15 +42,32 @@ class MyHtmlParser : public HtmlParser { map meta; static map my_named_ents; string dump, dmtime; - string ocharset; // This is the charset our user thinks the doc was - // charset is declared by HtmlParser - //string charset; // This is the charset it was supposedly converted to - string doccharset; // Set this to value of charset parameter in header + // This is the charset our caller thinks the doc used (initially + // comes from the environment/configuration, used as source for + // conversion to utf-8) + string fromcharset; + // This is the charset it was supposedly converted to (always + // utf-8 in fact, except if conversion utterly failed) + string tocharset; + // charset is declared by HtmlParser. It is the charset from the + // document: default, then from html or xml header. + // string charset; + bool indexing_allowed; + void process_text(const string &text); void opening_tag(const string &tag, const map &p); void closing_tag(const string &tag); void do_eof(); void decode_entities(string &s); + void reset_charsets() {fromcharset = tocharset = "";} + void set_charsets(const string& f, const string& t) + { + fromcharset = f; + tocharset = t; + } + // Return charset as determined from html + const string& get_charset() {return charset;} + MyHtmlParser(); };