/* Copyright (C) 2005 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#include "cstr.h"
#include "mimehandler.h"
#include "debuglog.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
#include "myhtmlparse.h"
#include "indextext.h"
#include "mh_html.h"
#include "smallut.h"
#include "md5.h"

// NOTE(review): the target of the system #include was missing from the text
// under review. <map> and <string> are what this file visibly uses; confirm
// against the upstream file whether another header was intended.
#include <map>
#include <string>

#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */

/**
 * Read the file named by @p fn into memory and register its content as the
 * current document through set_document_string().
 *
 * The file name is remembered in m_filename so that next_document() can
 * mention it in its log messages.
 *
 * @param fn path of the HTML file to read.
 * @return false if the file could not be read, else the value returned by
 *         set_document_string().
 */
bool MimeHandlerHtml::set_document_file(const string &fn)
{
    LOGDEB0(("textHtmlToDoc: %s\n", fn.c_str()));
    RecollFilter::set_document_file(fn);
    string otext;
    if (!file_to_string(fn, otext)) {
        LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
        return false;
    }
    m_filename = fn;
    return set_document_string(otext);
}

/**
 * Register @p htext as the current document.
 *
 * Stores the text in m_html, marks a document as available (m_havedoc) and
 * records the hex MD5 of the unmodified input in the metadata.
 *
 * @param htext raw HTML text, in whatever charset the source used.
 * @return always true.
 */
bool MimeHandlerHtml::set_document_string(const string& htext)
{
    m_html = htext;
    m_havedoc = true;

    // We want to compute the md5 now because we may modify m_html later
    string md5, xmd5;
    MD5String(htext, md5);
    m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);

    return true;
}

/**
 * Transcode and parse the pending HTML document, then publish the parser
 * output into m_metaData.
 *
 * The input charset is m_dfltInputCharset unless a non-empty charset value
 * is already present in m_metaData. At most two transcode+parse passes are
 * made: if the parser stops on the first pass and reports a document
 * charset that differs from the one used, the second pass uses the
 * document's charset.
 *
 * When m_forPreview is set, m_html is replaced by the transcoded text, with
 * a utf-8 charset declaration inserted after the head tag if one is found.
 *
 * @return false if there is no pending document, or if the parser stopped
 *         early for a reason other than a differing document charset; true
 *         otherwise.
 */
bool MimeHandlerHtml::next_document()
{
    if (m_havedoc == false)
        return false;
    m_havedoc = false;

    // If set_doc(fn), take note of file name.
    string fn = m_filename;
    m_filename.erase();
    // Only used for log messages
    const char *fnlabel = fn.empty() ? "unknown" : fn.c_str();

    string charset = m_dfltInputCharset;
    LOGDEB(("MHHtml::next_doc.: default supposed input charset: [%s]\n",
            charset.c_str()));
    // Override default input charset if someone took care to set one:
    // NOTE(review): the template arguments of the map type were missing from
    // the text under review; <string, string> is inferred from how keys and
    // values are used below. Confirm against the m_metaData declaration.
    map<string, string>::const_iterator it =
        m_metaData.find(cstr_dj_keycharset);
    if (it != m_metaData.end() && !it->second.empty()) {
        charset = it->second;
        LOGDEB(("MHHtml: next_doc.: input charset from ext. metadata: [%s]\n",
                charset.c_str()));
    }

    // - We first try to convert from the supposed charset
    //   (which may depend of the current directory) to utf-8. If this
    //   fails, we keep the original text
    // - During parsing, if we find a charset parameter, and it differs from
    //   what we started with, we abort and restart with the parameter value
    //   instead of the configuration one.

    MyHtmlParser result;
    for (int pass = 0; pass < 2; pass++) {
        string transcoded;
        LOGDEB(("Html::mkDoc: pass %d\n", pass));
        MyHtmlParser p;

        // Try transcoding. If it fails, use original text.
        int ecnt = 0;
        if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
            LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 "
                    "for [%s]\n", charset.c_str(), fnlabel));
            transcoded = m_html;
            // We don't know the charset, at all
            p.reset_charsets();
            charset.clear();
        } else {
            if (ecnt) {
                if (pass == 0) {
                    LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
                            "[%s]\n", ecnt, fnlabel));
                } else {
                    LOGERR(("textHtmlToDoc: final transcode had %d errors for "
                            "[%s]\n", ecnt, fnlabel));
                }
            }
            // charset has the putative source charset, transcoded is now
            // in utf-8
            p.set_charsets(charset, "utf-8");
        }

        // The parser reports how it stopped by throwing a bool: true at end
        // of text, anything else means it stopped early. A normal return is
        // treated like the 'true' case. Exceptions of other types are not
        // caught here.
        bool diag = true;
        try {
            p.parse_html(transcoded);
        } catch (bool thrown) {
            diag = thrown;
        }

        result = p;
        if (diag == true) {
            // Parser throws true at end of text. ok
            if (m_forPreview) {
                // Save the html text
                m_html = transcoded;
                // In many cases, we need to change the charset decl,
                // because the file was transcoded. It seems that just
                // inserting one is enough (only the 1st one seems to
                // be used by browsers/qtextedit).
                // NOTE(review): the three literals below were empty in the
                // text under review, which made the insertion a no-op (and
                // an out_of_range throw on very short text). They are
                // reconstructed from this comment and the +6 offset, which
                // matches the length of "<head>". Confirm against upstream.
                string::size_type idx = m_html.find("<head>");
                if (idx == string::npos)
                    idx = m_html.find("<HEAD>");
                if (idx != string::npos)
                    m_html.replace(idx + 6, 0,
                                   "<meta http-equiv=\"content-type\" "
                                   "content=\"text/html; charset=utf-8\">");
            }
            break;
        }

        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
                charset.c_str(), result.get_charset().c_str()));
        if (!result.get_charset().empty() &&
            !samecharset(result.get_charset(), result.fromcharset)) {
            LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
            // Set the origin charset as specified in document before
            // transcoding again
            charset = result.get_charset();
        } else {
            LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
            return false;
        }
    }

    // If the second pass also stopped on a differing charset, we get here
    // with whatever that pass produced.
    m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
    m_metaData[cstr_dj_keycontent] = result.dump;
    m_metaData[cstr_dj_keycharset] = "utf-8";
    // Avoid setting empty values which would crush ones possibly inherited
    // from parent (if we're an attachment)
    if (!result.dmtime.empty())
        m_metaData[cstr_dj_keymd] = result.dmtime;
    m_metaData[cstr_dj_keymt] = cstr_textplain;

    for (map<string, string>::const_iterator mit = result.meta.begin();
         mit != result.meta.end(); ++mit) {
        if (!mit->second.empty())
            m_metaData[mit->first] = mit->second;
    }
    return true;
}