From bfc6512d24592d6daf488521523f3c31ef88e6fb Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 9 Oct 2008 09:19:37 +0000 Subject: [PATCH] need to transcode text to utf-8 --- src/internfile/mh_exec.cpp | 24 +++++++++++++++++++++--- src/internfile/mimehandler.cpp | 6 +++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 43ebeb4f..0b5774bc 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.13 2008-10-06 06:22:46 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.14 2008-10-09 09:19:37 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -24,6 +24,7 @@ static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.13 2008-10-06 06:22:46 dockes Exp #include "debuglog.h" #include "cancelcheck.h" #include "smallut.h" +#include "transcode.h" #include #include @@ -106,11 +107,28 @@ bool MimeHandlerExec::next_document() return false; } + // if output is text, we must handle the conversion to utf-8 + string charset = cfgCharset.empty() ? "utf-8" : cfgCharset; + string mt = cfgMtype.empty() ? "text/html" : cfgMtype; + if (!mt.compare("text/plain") && charset.compare("utf-8")) { + string transcoded; + int ecnt; + if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) { + LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n", + charset.c_str())); + } else { + if (ecnt) { + LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n", + ecnt, charset.c_str())); + } + output = transcoded; + } + } // Success. Store some external metadata m_metaData["origcharset"] = m_defcharset; // Default charset: all recoll filters output utf-8, but this // could still be overridden by the content-type meta tag. - m_metaData["charset"] = cfgCharset.empty() ? "utf-8" : cfgCharset; - m_metaData["mimetype"] = cfgMtype.empty() ? "text/html" : cfgMtype; + m_metaData["charset"] = charset; + m_metaData["mimetype"] = mt; return true; } diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 26068ef7..9c821f9a 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.24 2008-10-06 06:22:46 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.25 2008-10-09 09:19:37 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -106,9 +106,9 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs) val = line.substr(eqpos+1, string::npos); trimstring(val); if (!nm.compare("charset")) { - h->cfgCharset = val; + h->cfgCharset = stringtolower((const string&)val); } else if (!nm.compare("mimetype")) { - h->cfgMtype = val; + h->cfgMtype = stringtolower((const string&)val); } }