diff --git a/src/internfile/htmlparse.h b/src/internfile/htmlparse.h
index 12bdf6d8..0c6a1309 100644
--- a/src/internfile/htmlparse.h
+++ b/src/internfile/htmlparse.h
@@ -32,7 +32,7 @@ using std::map;
class HtmlParser {
protected:
- void decode_entities(string &s);
+ virtual void decode_entities(string &s);
bool in_script;
string charset;
static map named_ents;
diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
index 788e1993..29844033 100644
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -70,7 +70,7 @@ bool MimeHandlerHtml::next_document()
m_filename.erase();
string charset = m_defcharset;
- LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",
+ LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
charset.c_str()));
// - We first try to convert from the default configured charset
@@ -79,14 +79,13 @@ bool MimeHandlerHtml::next_document()
// - During parsing, if we find a charset parameter, and it differs from
// what we started with, we abort and restart with the parameter value
// instead of the configuration one.
- LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
-
MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p;
+
// Try transcoding. If it fails, use original text.
int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
@@ -94,21 +93,21 @@ bool MimeHandlerHtml::next_document()
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
transcoded = m_html;
// We don't know the charset, at all
- p.ocharset = p.charset = charset = "";
+ p.reset_charsets();
+ charset = "";
} else {
if (ecnt) {
if (pass == 0) {
LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
- "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+ "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
} else {
LOGERR(("textHtmlToDoc: final transcode had %d errors for "
- "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+ "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
}
}
- // ocharset has the putative source charset, transcoded is now
+ // charset has the putative source charset, transcoded is now
// in utf-8
- p.ocharset = charset;
- p.charset = "utf-8";
+ p.set_charsets(charset, "utf-8");
}
try {
@@ -118,14 +117,19 @@ bool MimeHandlerHtml::next_document()
break;
} catch (bool diag) {
result = p;
- if (diag == true)
+ if (diag == true) {
+ // The parser throws true at end of text: this is normal termination
break;
+ }
+
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
- charset.c_str(),result.doccharset.c_str()));
- if (!result.doccharset.empty() &&
- !samecharset(result.doccharset, result.ocharset)) {
+ charset.c_str(), result.get_charset().c_str()));
+ if (!result.get_charset().empty() &&
+ !samecharset(result.get_charset(), result.fromcharset)) {
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
- charset = result.doccharset;
+ // Use the charset specified in the document as the source
+ // charset when transcoding again on the next pass
+ charset = result.get_charset();
} else {
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
return false;
@@ -133,7 +137,7 @@ bool MimeHandlerHtml::next_document()
}
}
- m_metaData["origcharset"] = m_defcharset;
+ m_metaData["origcharset"] = result.get_charset();
m_metaData["content"] = result.dump;
m_metaData["charset"] = "utf-8";
// Avoid setting empty values which would crush ones possibly inherited
diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index d9b2e4a4..6b661630 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -158,6 +158,10 @@ MyHtmlParser::MyHtmlParser()
pending_space(false),
indexing_allowed(true)
{
+ // The default html document charset is iso-8859-1. We'll update
+ // this value from the encoding tag if found.
+ charset = "iso-8859-1";
+
if (my_named_ents.empty()) {
for (int i = 0;;) {
const char *ent;
@@ -175,11 +179,11 @@ MyHtmlParser::MyHtmlParser()
void MyHtmlParser::decode_entities(string &s)
{
- LOGDEB(("MyHtmlParser::decode_entities\n"));
+ LOGDEB2(("MyHtmlParser::decode_entities\n"));
// This has no meaning whatsoever if the character encoding is unknown,
// so don't do it. If charset known, caller has converted text to utf-8,
// and this is also how we translate entities
- // if (charset != "utf-8")
+ // if (tocharset != "utf-8")
// return;
// We need a const_iterator version of s.end() - otherwise the
@@ -378,12 +382,12 @@ MyHtmlParser::opening_tag(const string &tag, const map &p)
map::const_iterator k;
if ((k = p.params.find("charset")) !=
p.params.end()) {
- doccharset = k->second;
- if (!samecharset(doccharset, ocharset)) {
+ charset = k->second;
+ if (!samecharset(charset, fromcharset)) {
LOGDEB1(("Doc specified charset '%s' "
- "differs from announced '%s'\n",
- doccharset.c_str(),
- ocharset.c_str()));
+ "differs from dir deflt '%s'\n",
+ charset.c_str(),
+ fromcharset.c_str()));
throw false;
}
}
@@ -504,7 +508,7 @@ MyHtmlParser::closing_tag(const string &tag)
break;
case 't':
if (tag == "title") {
- if (meta["title"].empty()) {
+ if (meta.find("title") == meta.end()|| meta["title"].empty()) {
meta["title"] = dump;
dump = "";
}
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index f12e6864..141a2cd1 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -42,15 +42,32 @@ class MyHtmlParser : public HtmlParser {
map meta;
static map my_named_ents;
string dump, dmtime;
- string ocharset; // This is the charset our user thinks the doc was
- // charset is declared by HtmlParser
- //string charset; // This is the charset it was supposedly converted to
- string doccharset; // Set this to value of charset parameter in header
+ // This is the charset our caller thinks the doc used (initially
+ // comes from the environment/configuration, used as source for
+ // conversion to utf-8)
+ string fromcharset;
+ // This is the charset it was supposedly converted to (always
+ // utf-8 in fact, except if conversion utterly failed)
+ string tocharset;
+ // charset is declared by HtmlParser. It is the charset from the
+ // document: default, then from html or xml header.
+ // string charset;
+
bool indexing_allowed;
+
void process_text(const string &text);
void opening_tag(const string &tag, const map &p);
void closing_tag(const string &tag);
void do_eof();
void decode_entities(string &s);
+ void reset_charsets() {tocharset.clear(); fromcharset.clear();} // forget both charsets
+ void set_charsets(const string& f, const string& t)
+ { // f is stored as the conversion source charset, t as the conversion target
+ fromcharset.assign(f);
+ tocharset.assign(t);
+ }
+ // Return the document charset: the default, or the value found in the html header
+ const string& get_charset() const {return charset;}
+
MyHtmlParser();
};