From 2fc294a9c6b5ee7b1bb609102b42eb41ad2a72f0 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 6 Oct 2012 12:14:04 +0200 Subject: [PATCH] factored out common charset handling code in exec and execm, cleaned up charset and textplain handling in mh_mail --- src/common/cstr.h | 1 + src/internfile/mh_exec.cpp | 41 ++++++++++++--------- src/internfile/mh_exec.h | 7 ++++ src/internfile/mh_execm.cpp | 24 ++++--------- src/internfile/mh_html.cpp | 2 +- src/internfile/mh_mail.cpp | 65 ++++++++++++++++++---------------- src/internfile/mimehandler.cpp | 3 +- src/internfile/mimehandler.h | 1 + src/sampleconf/mimeconf | 10 +++--- 9 files changed, 82 insertions(+), 72 deletions(-) diff --git a/src/common/cstr.h b/src/common/cstr.h index 641d1033..0b1c5cf6 100644 --- a/src/common/cstr.h +++ b/src/common/cstr.h @@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes"); DEF_CSTR(fileu, "file://"); DEF_CSTR(fmtime, "fmtime"); DEF_CSTR(iso_8859_1, "ISO-8859-1"); +DEF_CSTR(utf8, "UTF-8"); DEF_CSTR(minwilds, "*?["); DEF_CSTR(newline, "\n"); DEF_CSTR(null, ""); diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 55d5644b..2b0fc03b 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document() return true; } -void MimeHandlerExec::finaldetails() +void MimeHandlerExec::handle_cs(const string& mt, const string& icharset) { - m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; + string charset(icharset); // cfgFilterOutputCharset comes from the mimeconf filter - // definition line If the value is "default", we use the charset - // value defined in recoll.conf (which may vary depending on - // directory) - string& charset = m_metaData[cstr_dj_keycharset]; - charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset; - if (!stringlowercmp("default", charset)) { - charset = m_dfltInputCharset; + // definition line and defaults to UTF-8 if empty. 
If the value is + // "default", we use the default input charset value defined in + // recoll.conf (which may vary depending on directory) + if (charset.empty()) { + charset = cfgFilterOutputCharset.empty() ? cstr_utf8 : + cfgFilterOutputCharset; + if (!stringlowercmp("default", charset)) { + charset = m_dfltInputCharset; + } } - - // The output mime type is html except if defined otherwise in the filter - // definition. - string& mt = m_metaData[cstr_dj_keymt]; - mt = cfgFilterOutputMtype.empty() ? "text/html" : - cfgFilterOutputMtype; + m_metaData[cstr_dj_keyorigcharset] = charset; // If this is text/plain transcode_to/check utf-8 if (!mt.compare(cstr_textplain)) { - (void)txtdcode("mh_exec"); + (void)txtdcode("mh_exec/m"); + } else { + m_metaData[cstr_dj_keycharset] = charset; } +} + +void MimeHandlerExec::finaldetails() +{ + // The default output mime type is html, but it may be defined + // otherwise in the filter definition. + m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : + cfgFilterOutputMtype; string md5, xmd5, reason; if (MD5File(m_fn, md5, &reason)) { @@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails() LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n", m_fn.c_str(), reason.c_str())); } + + handle_cs(m_metaData[cstr_dj_keymt]); } diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h index 41ba6b78..4c5a2654 100644 --- a/src/internfile/mh_exec.h +++ b/src/internfile/mh_exec.h @@ -77,6 +77,13 @@ protected: string m_fn; string m_ipath; + // Set up the character set metadata fields and possibly transcode + // text/plain output. 
+ // @param charset when called from mh_execm, a possible explicit + // value from the filter (else the data will come from the config) + virtual void handle_cs(const string& mt, const string& charset = string()); + +private: virtual void finaldetails(); }; diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index de0badf9..2b9f3846 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data) ibuf.c_str())); return false; } - LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len)); + LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len)); if (len / 1024 > m_maxmemberkb) { LOGERR(("MHExecMultiple: data len > maxmemberkb\n")); return false; @@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document() } } - // Charset. For many document types it doesn't matter. For text - // and html it does. We supply a default from the configuration. - if (charset.empty()) { - charset = cfgFilterOutputCharset.empty() ? 
"utf-8" : - cfgFilterOutputCharset; - if (!stringlowercmp("default", charset)) { - charset = m_dfltInputCharset; - } - } - m_metaData[cstr_dj_keyorigcharset] = charset; - m_metaData[cstr_dj_keycharset] = charset; + handle_cs(m_metaData[cstr_dj_keymt], charset); - if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) { - (void)txtdcode("mh_execm"); - } - if (eofnext_received) m_havedoc = false; LOGDEB0(("MHExecMultiple: returning %d bytes of content," - " mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(), - m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str())); + " mtype [%s] charset [%s]\n", + m_metaData[cstr_dj_keycontent].size(), + m_metaData[cstr_dj_keymt].c_str(), + m_metaData[cstr_dj_keycharset].c_str())); return true; } diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 31615397..5a9de050 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document() m_metaData[cstr_dj_keyorigcharset] = result.get_charset(); m_metaData[cstr_dj_keycontent] = result.dump; - m_metaData[cstr_dj_keycharset] = "utf-8"; + m_metaData[cstr_dj_keycharset] = cstr_utf8; // Avoid setting empty values which would crush ones possibly inherited // from parent (if we're an attachment) if (!result.dmtime.empty()) diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 1ac0890a..7e177c3a 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach() MHMailAttach *att = m_attachments[m_idx]; m_metaData[cstr_dj_keymt] = att->m_contentType; + m_metaData[cstr_dj_keyorigcharset] = att->m_charset; m_metaData[cstr_dj_keycharset] = att->m_charset; m_metaData[cstr_dj_keyfn] = att->m_filename; - // Change the title to something helpul m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")"; LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n", att->m_contentType.c_str(), 
att->m_charset.c_str(), att->m_filename.c_str())); + // Erase current content and replace m_metaData[cstr_dj_keycontent] = string(); string& body = m_metaData[cstr_dj_keycontent]; att->m_part->getBody(body, 0, att->m_part->bodylength); - string decoded; - const string *bdp; - if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) { - return false; - } - if (bdp != &body) - body = decoded; - - // Special case for text/plain content. Internfile should deal - // with this but it expects text/plain to be utf-8 already, so we - // handle the transcoding if needed - if (m_metaData[cstr_dj_keymt] == cstr_textplain) { - string utf8; - if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) { - LOGERR((" processAttach: transcode to utf-8 failed " - "for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str())); - // can't transcode at all -> data is garbage just erase it - body.clear(); - } else { - body = utf8; + { + string decoded; + const string *bdp; + if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) { + return false; } + if (bdp != &body) + body.swap(decoded); } // Special case for application/octet-stream: try to better @@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach() m_metaData[cstr_dj_keymt] = mt; } + // Special case for text/plain content. 
Internfile should deal + // with this but it expects text/plain to be utf-8 already, so we + // handle the transcoding if needed + if (m_metaData[cstr_dj_keymt] == cstr_textplain) { + string utf8; + if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) { + LOGERR((" processAttach: transcode to utf-8 failed for charset " + "[%s]\n", m_metaData[cstr_dj_keycharset].c_str())); + // can't transcode at all -> data is garbage just erase it + body.clear(); + } else { + m_metaData[cstr_dj_keycharset] = cstr_utf8; + body.swap(utf8); + } + } + // Ipath char nbuf[20]; sprintf(nbuf, "%d", m_idx); @@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) // "Simple" part. LOGDEB2(("walkmime: simple part\n")); - // Normally the default charset is us-ascii. But it happens that - // 8 bit chars exist in a message that is stated as us-ascii. Ie the - // mailer used by yahoo support ('KANA') does this. We could convert - // to iso-8859 only if the transfer-encoding is 8 bit, or test for - // actual 8 bit chars, but what the heck, le'ts use 8859-1 as default + // Normally the default charset is us-ascii. But it happens that 8 + // bit chars exist in a message that is stated as us-ascii. Ie the + // mailer used by yahoo support ('KANA') does this. We could + // convert to iso-8859 only if the transfer-encoding is 8 bit, or + // test for actual 8 bit chars, but what the heck, let's use + // 8859-1 (actually CP1252 which is compatible, but with more + // useful chars) as default. 
string charset; it = content_type.params.find(cstr_mail_charset); if (it != content_type.params.end()) @@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) !stringlowercmp("unknown", charset) ) { m_config->getConfParam("maildefcharset", charset); if (charset.empty()) - charset = "iso-8859-1"; + charset = "CP1252"; } // Content transfer encoding @@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) body = decoded; // Handle html stripping and transcoding to utf8 - string utf8; - const string *putf8 = 0; if (!stringlowercmp("text/html", content_type.value)) { MimeHandlerHtml mh(m_config, "text/html"); mh.set_property(Dijon::Filter::OPERATING_MODE, @@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) if (it != mh.get_meta_data().end()) out += it->second; } else { + string utf8; // Transcode to utf-8 LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str())); - if (!transcode(body, utf8, charset, "UTF-8")) { + if (!transcode(body, utf8, charset, cstr_utf8)) { LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", charset.c_str())); out += body; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 83c464c7..55b2f498 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, out: if (h) { - string charset = cfg->getDefCharset(); - h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); + h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset()); } return h; } diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 301815f6..4124f587 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -109,6 +109,7 @@ public: } // This only makes sense if the contents are currently txt/plain + // It converts from keyorigcharset to UTF-8 and sets keycharset. 
bool txtdcode(const string& who); protected: diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index d0011916..b9b536e5 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t # The default is now again to use rcldoc. Use raw antiword if speed is more # important for you than catching all data, application/msword = exec rcldoc -#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8 +#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain # You can also use wvware directly but it's much slower. # application/msword = exec wvWare --charset=utf-8 --nographics @@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc application/ogg = execm rclaudio application/pdf = exec rclpdf application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain -application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain -application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain +application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain +application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain application/vn.oasis.opendocument.txt = exec rclsoff application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ exec rclopxml @@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html application/x-abiword = exec rclabw application/x-awk = internal text/plain application/x-chm = execm rclchm -application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8 +application/x-dia-diagram = execm rcldia;mimetype=text/plain application/x-dvi = exec rcldvi application/x-flac = execm rclaudio application/x-gnuinfo = execm rclinfo @@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu image/svg+xml = exec rclsvg image/x-xcf = execm rclimg message/rfc822 = internal -text/calendar = execm 
rclics;mimetype=text/plain;charset=utf-8 +text/calendar = execm rclics;mimetype=text/plain text/html = internal text/plain = internal text/rtf = exec unrtf --nopict --html;mimetype=text/html