From a2659b48e49ad9c27313e6e06335f7cd226dd0aa Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Tue, 19 Jun 2007 12:17:07 +0000
Subject: [PATCH] renamed the html charset values to stick to omega usage

---
 src/internfile/htmlparse.h     |  2 +-
 src/internfile/mh_html.cpp     | 34 +++++++++++++++++++---------------
 src/internfile/myhtmlparse.cpp | 20 ++++++++++++--------
 src/internfile/myhtmlparse.h   | 25 +++++++++++++++++++++----
 4 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/src/internfile/htmlparse.h b/src/internfile/htmlparse.h
index 12bdf6d8..0c6a1309 100644
--- a/src/internfile/htmlparse.h
+++ b/src/internfile/htmlparse.h
@@ -32,7 +32,7 @@ using std::map;
 
 class HtmlParser {
     protected:
-	void decode_entities(string &s);
+        virtual void decode_entities(string &s);
         bool in_script;
         string charset;
 	static map<string, unsigned int> named_ents;
diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
index 788e1993..29844033 100644
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -70,7 +70,7 @@ bool MimeHandlerHtml::next_document()
     m_filename.erase();
 
     string charset = m_defcharset;
-    LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", 
+    LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n", 
 	    charset.c_str()));
 
     // - We first try to convert from the default configured charset
@@ -79,14 +79,13 @@ bool MimeHandlerHtml::next_document()
     // - During parsing, if we find a charset parameter, and it differs from
     //   what we started with, we abort and restart with the parameter value
     //   instead of the configuration one.
-    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
-
 
     MyHtmlParser result;
     for (int pass = 0; pass < 2; pass++) {
 	string transcoded;
 	LOGDEB(("Html::mkDoc: pass %d\n", pass));
 	MyHtmlParser p;
+
 	// Try transcoding. If it fails, use original text.
 	int ecnt;
 	if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
@@ -94,21 +93,21 @@ bool MimeHandlerHtml::next_document()
 		    "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
 	    transcoded = m_html;
 	    // We don't know the charset, at all
-	    p.ocharset = p.charset = charset = "";
+	    p.reset_charsets();
+	    charset = "";
 	} else {
 	    if (ecnt) {
 		if (pass == 0) {
 		    LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
-			    "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+			    "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
 		} else {
 		    LOGERR(("textHtmlToDoc: final transcode had %d errors for "
-			    "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+			    "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
 		}
 	    }
-	    // ocharset has the putative source charset, transcoded is now
+	    // charset has the putative source charset, transcoded is now
 	    // in utf-8
-	    p.ocharset = charset;
-	    p.charset = "utf-8";
+	    p.set_charsets(charset, "utf-8");
 	}
 
 	try {
@@ -118,14 +117,19 @@ bool MimeHandlerHtml::next_document()
 	    break;
 	} catch (bool diag) {
 	    result = p;
-	    if (diag == true)
+	    if (diag == true) {
+		// Parser throws true at end of text. ok
 		break;
+	    }
+
 	    LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
-		    charset.c_str(),result.doccharset.c_str()));
-	    if (!result.doccharset.empty() && 
-		!samecharset(result.doccharset, result.ocharset)) {
+		    charset.c_str(), result.get_charset().c_str()));
+	    if (!result.get_charset().empty() && 
+		!samecharset(result.get_charset(), result.fromcharset)) {
 		LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
-		charset = result.doccharset;
+		// Set the origin charset as specified in document before
+		// transcoding again
+		charset = result.get_charset();
 	    } else {
 		LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
 		return false;
@@ -133,7 +137,7 @@ bool MimeHandlerHtml::next_document()
 	}
     }
 
-    m_metaData["origcharset"] = m_defcharset;
+    m_metaData["origcharset"] = result.get_charset();
     m_metaData["content"] = result.dump;
     m_metaData["charset"] = "utf-8";
     // Avoid setting empty values which would crush ones possibly inherited
diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index d9b2e4a4..6b661630 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -158,6 +158,10 @@ MyHtmlParser::MyHtmlParser()
       pending_space(false),
       indexing_allowed(true)
 {
+    // The default html document charset is iso-8859-1. We'll update
+    // this value from the encoding tag if found.
+    charset = "iso-8859-1";
+
     if (my_named_ents.empty()) {
 	for (int i = 0;;) {
 	    const char *ent;
@@ -175,11 +179,11 @@ MyHtmlParser::MyHtmlParser()
 
 void MyHtmlParser::decode_entities(string &s)
 {
-    LOGDEB(("MyHtmlParser::decode_entities\n"));
+    LOGDEB2(("MyHtmlParser::decode_entities\n"));
     // This has no meaning whatsoever if the character encoding is unknown,
     // so don't do it. If charset known, caller has converted text to utf-8, 
     // and this is also how we translate entities
-    //    if (charset != "utf-8")
+    //    if (tocharset != "utf-8")
     //    	return;
 
     // We need a const_iterator version of s.end() - otherwise the
@@ -378,12 +382,12 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 			    map<string, string>::const_iterator k;
 			    if ((k = p.params.find("charset")) != 
 				p.params.end()) {
-				doccharset = k->second;
-				if (!samecharset(doccharset, ocharset)) {
+				charset = k->second;
+				if (!samecharset(charset, fromcharset)) {
 				    LOGDEB1(("Doc specified charset '%s' "
-					     "differs from announced '%s'\n",
-					     doccharset.c_str(), 
-					     ocharset.c_str()));
+					    "differs from dir deflt '%s'\n",
+					    charset.c_str(), 
+					    fromcharset.c_str()));
 				    throw false;
 				}
 			    }
@@ -504,7 +508,7 @@ MyHtmlParser::closing_tag(const string &tag)
 	    break;
 	case 't':
 	    if (tag == "title") {
-		if (meta["title"].empty()) {
+		if (meta.find("title") == meta.end()|| meta["title"].empty()) {
 		    meta["title"] = dump;
 		    dump = "";
 		}
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index f12e6864..141a2cd1 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -42,15 +42,32 @@ class MyHtmlParser : public HtmlParser {
     map<string,string> meta;
     static map<string, string> my_named_ents;
     string dump, dmtime;
-    string ocharset; // This is the charset our user thinks the doc was
-    // charset is declared by HtmlParser
-    //string charset; // This is the charset it was supposedly converted to
-    string doccharset; // Set this to value of charset parameter in header
+    // This is the charset our caller thinks the doc used (initially
+    // comes from the environment/configuration, used as source for
+    // conversion to utf-8)
+    string fromcharset; 
+    // This is the charset it was supposedly converted to (always
+    // utf-8 in fact, except if conversion utterly failed)
+    string tocharset; 
+    // charset is declared by HtmlParser. It is the charset from the
+    // document: default, then from html or xml header.
+    // string charset; 
+
     bool indexing_allowed;
+
     void process_text(const string &text);
     void opening_tag(const string &tag, const map<string,string> &p);
     void closing_tag(const string &tag);
     void do_eof();
     void decode_entities(string &s);
+    void reset_charsets() {fromcharset.clear(); tocharset.clear();} // Forget both conversion charsets
+    void set_charsets(const string& f, const string& t) // Record the conversion charsets
+    {
+	fromcharset = f; // Charset the caller used as conversion source
+	tocharset = t; // Charset the text was supposedly converted to
+    }
+    // Return the document charset (parser default, or value found in the html)
+    const string& get_charset() const {return charset;}
+
     MyHtmlParser();
 };