From 385a7b9547e07205e5e83ccd742f3fd4e9b36f36 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 22 Nov 2018 14:25:47 +0100 Subject: [PATCH] indent + comments + explicit std:: --- src/utils/mimeparse.cpp | 1079 ++++++++++++++++++++------------------- src/utils/mimeparse.h | 69 ++- 2 files changed, 578 insertions(+), 570 deletions(-) diff --git a/src/utils/mimeparse.cpp b/src/utils/mimeparse.cpp index a224b10c..876b29d6 100644 --- a/src/utils/mimeparse.cpp +++ b/src/utils/mimeparse.cpp @@ -74,25 +74,25 @@ bool rfc2231_decode(const string &in, string &out, string &charset) string::size_type pos1, pos2=0; if (charset.empty()) { - if ((pos1 = in.find("'")) == string::npos) - return false; - charset = in.substr(0, pos1); - // fprintf(stderr, "Charset: [%s]\n", charset.c_str()); - pos1++; + if ((pos1 = in.find("'")) == string::npos) + return false; + charset = in.substr(0, pos1); + // fprintf(stderr, "Charset: [%s]\n", charset.c_str()); + pos1++; - if ((pos2 = in.find("'", pos1)) == string::npos) - return false; - // We have no use for lang for now - // string lang = in.substr(pos1, pos2-pos1); - // fprintf(stderr, "Lang: [%s]\n", lang.c_str()); - pos2++; + if ((pos2 = in.find("'", pos1)) == string::npos) + return false; + // We have no use for lang for now + // string lang = in.substr(pos1, pos2-pos1); + // fprintf(stderr, "Lang: [%s]\n", lang.c_str()); + pos2++; } string raw; qp_decode(in.substr(pos2), raw, '%'); // fprintf(stderr, "raw [%s]\n", raw.c_str()); if (!transcode(raw, out, charset, "UTF-8")) - return false; + return false; return true; } @@ -102,7 +102,7 @@ bool rfc2231_decode(const string &in, string &out, string &charset) // The lexical token returned by find_next_token class Lexical { - public: +public: enum kind {none, token, separator}; kind what; string value; @@ -118,26 +118,26 @@ skip_comment(const string &in, string::size_type start, Lexical &lex) { int commentlevel = 0; for (; start < in.size(); start++) { - if (in[start] == '\\') { - 
// Skip escaped char. - if (start+1 < in.size()) { - start++; - continue; - } else { - lex.error.append("\\ at end of string "); - return in.size(); - } - } - if (in[start] == '(') - commentlevel++; - if (in[start] == ')') { - if (--commentlevel == 0) - break; - } + if (in[start] == '\\') { + // Skip escaped char. + if (start+1 < in.size()) { + start++; + continue; + } else { + lex.error.append("\\ at end of string "); + return in.size(); + } + } + if (in[start] == '(') + commentlevel++; + if (in[start] == ')') { + if (--commentlevel == 0) + break; + } } if (start == in.size() && commentlevel != 0) { - lex.error.append("Unclosed comment "); - return in.size(); + lex.error.append("Unclosed comment "); + return in.size(); } return start; } @@ -145,17 +145,17 @@ skip_comment(const string &in, string::size_type start, Lexical &lex) // Skip initial whitespace and (possibly nested) comments. static string::size_type skip_whitespace_and_comment(const string &in, string::size_type start, - Lexical &lex) + Lexical &lex) { while (1) { - if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos) - return in.size(); - if (in[start] == '(') { - if ((start = skip_comment(in, start, lex)) == string::npos) - return string::npos; - } else { - break; - } + if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos) + return in.size(); + if (in[start] == '(') { + if ((start = skip_comment(in, start, lex)) == string::npos) + return string::npos; + } else { + break; + } } return start; } @@ -168,20 +168,20 @@ skip_whitespace_and_comment(const string &in, string::size_type start, /// @param delims separators we should look for static string::size_type find_next_token(const string &in, string::size_type start, - Lexical &lex, string delims = ";=") + Lexical &lex, string delims = ";=") { char oquot, cquot; start = skip_whitespace_and_comment(in, start, lex); if (start == string::npos || start == in.size()) - return in.size(); + return in.size(); // Begins with 
separator ? return it. string::size_type delimi = delims.find_first_of(in[start]); if (delimi != string::npos) { - lex.what = Lexical::separator; - lex.value = delims[delimi]; - return start+1; + lex.what = Lexical::separator; + lex.value = delims[delimi]; + return start+1; } // Check for start of quoted string @@ -193,41 +193,41 @@ find_next_token(const string &in, string::size_type start, } if (cquot != 0) { - // Quoted string parsing - string::size_type end; - start++; // Skip quote character - for (end = start;end < in.size() && in[end] != cquot; end++) { - if (in[end] == '\\') { - // Skip escaped char. - if (end+1 < in.size()) { - end++; - } else { - // backslash at end of string: error - lex.error.append("\\ at end of string "); - return string::npos; - } - } - } - if (end == in.size()) { - // Found end of string before closing quote character: error - lex.error.append("Unclosed quoted string "); - return string::npos; - } - lex.what = Lexical::token; - lex.value = in.substr(start, end-start); - lex.quote = oquot; - return ++end; + // Quoted string parsing + string::size_type end; + start++; // Skip quote character + for (end = start;end < in.size() && in[end] != cquot; end++) { + if (in[end] == '\\') { + // Skip escaped char. 
+ if (end+1 < in.size()) { + end++; + } else { + // backslash at end of string: error + lex.error.append("\\ at end of string "); + return string::npos; + } + } + } + if (end == in.size()) { + // Found end of string before closing quote character: error + lex.error.append("Unclosed quoted string "); + return string::npos; + } + lex.what = Lexical::token; + lex.value = in.substr(start, end-start); + lex.quote = oquot; + return ++end; } else { - string::size_type end = in.find_first_of(delims + "\r\n \t(", start); - lex.what = Lexical::token; - lex.quote = 0; - if (end == string::npos) { - end = in.size(); - lex.value = in.substr(start); - } else { - lex.value = in.substr(start, end-start); - } - return end; + string::size_type end = in.find_first_of(delims + "\r\n \t(", start); + lex.what = Lexical::token; + lex.quote = 0; + if (end == string::npos) { + end = in.size(); + lex.value = in.substr(start); + } else { + lex.value = in.substr(start, end-start); + } + return end; } } @@ -246,7 +246,7 @@ public: void stringtolower(string &out, const string& in) { for (string::size_type i = 0; i < in.size(); i++) - out.append(1, char(tolower(in[i]))); + out.append(1, char(tolower(in[i]))); } // Parse MIME field value. 
Should look like: @@ -262,43 +262,43 @@ bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed) // Get the field value start = find_next_token(value, start, lex); if (start == string::npos || lex.what != Lexical::token) - return false; + return false; parsed.value = lex.value; map rawparams; // Look for parameters for (;;) { - string paramname, paramvalue; - lex.reset(); - start = find_next_token(value, start, lex); - if (start == value.size()) - break; - if (start == string::npos) { - //fprintf(stderr, "Find_next_token error(1)\n"); - return false; - } - if (lex.what == Lexical::separator && lex.value[0] == ';') - continue; - if (lex.what != Lexical::token) - return false; - stringtolower(paramname, lex.value); + string paramname, paramvalue; + lex.reset(); + start = find_next_token(value, start, lex); + if (start == value.size()) + break; + if (start == string::npos) { + //fprintf(stderr, "Find_next_token error(1)\n"); + return false; + } + if (lex.what == Lexical::separator && lex.value[0] == ';') + continue; + if (lex.what != Lexical::token) + return false; + stringtolower(paramname, lex.value); - start = find_next_token(value, start, lex); - if (start == string::npos || lex.what != Lexical::separator || - lex.value[0] != '=') { - //fprintf(stderr, "Find_next_token error (2)\n"); - return false; - } + start = find_next_token(value, start, lex); + if (start == string::npos || lex.what != Lexical::separator || + lex.value[0] != '=') { + //fprintf(stderr, "Find_next_token error (2)\n"); + return false; + } - start = find_next_token(value, start, lex); - if (start == string::npos || lex.what != Lexical::token) { - //fprintf(stderr, "Parameter has no value!"); - return false; - } - paramvalue = lex.value; - rawparams[paramname] = paramvalue; - //fprintf(stderr, "RAW: name [%s], value [%s]\n", paramname.c_str(), - // paramvalue.c_str()); + start = find_next_token(value, start, lex); + if (start == string::npos || lex.what != Lexical::token) { + 
//fprintf(stderr, "Parameter has no value!"); + return false; + } + paramvalue = lex.value; + rawparams[paramname] = paramvalue; + //fprintf(stderr, "RAW: name [%s], value [%s]\n", paramname.c_str(), + // paramvalue.c_str()); } // fprintf(stderr, "Number of raw params %d\n", rawparams.size()); @@ -309,38 +309,38 @@ bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed) map chunks; for (map::const_iterator it = rawparams.begin(); - it != rawparams.end(); it++) { - string nm = it->first; - // fprintf(stderr, "NM: [%s]\n", nm.c_str()); - if (nm.empty()) // ?? - continue; + it != rawparams.end(); it++) { + string nm = it->first; + // fprintf(stderr, "NM: [%s]\n", nm.c_str()); + if (nm.empty()) // ?? + continue; - Chunk chunk; - if (nm[nm.length()-1] == '*') { - nm.erase(nm.length() - 1); - chunk.decode = true; - } else - chunk.decode = false; - // fprintf(stderr, "NM1: [%s]\n", nm.c_str()); + Chunk chunk; + if (nm[nm.length()-1] == '*') { + nm.erase(nm.length() - 1); + chunk.decode = true; + } else + chunk.decode = false; + // fprintf(stderr, "NM1: [%s]\n", nm.c_str()); - chunk.value = it->second; + chunk.value = it->second; - // Look for another asterisk in nm. If none, assign index 0 - string::size_type aster; - int idx = 0; - if ((aster = nm.rfind("*")) != string::npos) { - string num = nm.substr(aster+1); - //fprintf(stderr, "NUM: [%s]\n", num.c_str()); - nm.erase(aster); - idx = atoi(num.c_str()); - } - Chunks empty; - if (chunks.find(nm) == chunks.end()) - chunks[nm] = empty; - chunks[nm].chunks.resize(idx+1); - chunks[nm].chunks[idx] = chunk; - //fprintf(stderr, "CHNKS: nm [%s], idx %d, decode %d, value [%s]\n", - // nm.c_str(), idx, int(chunk.decode), chunk.value.c_str()); + // Look for another asterisk in nm. 
If none, assign index 0 + string::size_type aster; + int idx = 0; + if ((aster = nm.rfind("*")) != string::npos) { + string num = nm.substr(aster+1); + //fprintf(stderr, "NUM: [%s]\n", num.c_str()); + nm.erase(aster); + idx = atoi(num.c_str()); + } + Chunks empty; + if (chunks.find(nm) == chunks.end()) + chunks[nm] = empty; + chunks[nm].chunks.resize(idx+1); + chunks[nm].chunks[idx] = chunk; + //fprintf(stderr, "CHNKS: nm [%s], idx %d, decode %d, value [%s]\n", + // nm.c_str(), idx, int(chunk.decode), chunk.value.c_str()); } // For each parameter name, concatenate its chunks and possibly @@ -349,29 +349,29 @@ bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed) // which is not right because there might be uncoded chunks // according to the rfc. for (map::const_iterator it = chunks.begin(); - it != chunks.end(); it++) { - if (it->second.chunks.empty()) - continue; - string nm = it->first; - // Create the name entry - if (parsed.params.find(nm) == parsed.params.end()) - parsed.params[nm].clear(); - // Concatenate all chunks and decode the whole if the first one needs - // to. Yes, this is not quite right. - string value; - for (vector::const_iterator vi = it->second.chunks.begin(); - vi != it->second.chunks.end(); vi++) { - value += vi->value; - } - if (it->second.chunks[0].decode) { - string charset; - rfc2231_decode(value, parsed.params[nm], charset); - } else { - // rfc2047 MUST NOT but IS used by some agents - rfc2047_decode(value, parsed.params[nm]); - } - //fprintf(stderr, "FINAL: nm [%s], value [%s]\n", - //nm.c_str(), parsed.params[nm].c_str()); + it != chunks.end(); it++) { + if (it->second.chunks.empty()) + continue; + string nm = it->first; + // Create the name entry + if (parsed.params.find(nm) == parsed.params.end()) + parsed.params[nm].clear(); + // Concatenate all chunks and decode the whole if the first one needs + // to. Yes, this is not quite right. 
+ string value; + for (vector::const_iterator vi = it->second.chunks.begin(); + vi != it->second.chunks.end(); vi++) { + value += vi->value; + } + if (it->second.chunks[0].decode) { + string charset; + rfc2231_decode(value, parsed.params[nm], charset); + } else { + // rfc2047 MUST NOT but IS used by some agents + rfc2047_decode(value, parsed.params[nm]); + } + //fprintf(stderr, "FINAL: nm [%s], value [%s]\n", + //nm.c_str(), parsed.params[nm].c_str()); } return true; @@ -385,80 +385,80 @@ bool qp_decode(const string& in, string &out, char esc) out.reserve(in.length()); string::size_type ii; for (ii = 0; ii < in.length(); ii++) { - if (in[ii] == esc) { - ii++; // Skip '=' or '%' - if(ii >= in.length() - 1) { // Need at least 2 more chars - break; - } else if (in[ii] == '\r' && in[ii+1] == '\n') { // Soft nl, skip - ii++; - } else if (in[ii] != '\n' && in[ii] != '\r') { // decode - char c = in[ii]; - char co; - if(c >= 'A' && c <= 'F') { - co = char((c - 'A' + 10) * 16); - } else if (c >= 'a' && c <= 'f') { - co = char((c - 'a' + 10) * 16); - } else if (c >= '0' && c <= '9') { - co = char((c - '0') * 16); - } else { - return false; - } - if(++ii >= in.length()) - break; - c = in[ii]; - if (c >= 'A' && c <= 'F') { - co += char(c - 'A' + 10); - } else if (c >= 'a' && c <= 'f') { - co += char(c - 'a' + 10); - } else if (c >= '0' && c <= '9') { - co += char(c - '0'); - } else { - return false; - } - out += co; - } - } else { - out += in[ii]; - } + if (in[ii] == esc) { + ii++; // Skip '=' or '%' + if(ii >= in.length() - 1) { // Need at least 2 more chars + break; + } else if (in[ii] == '\r' && in[ii+1] == '\n') { // Soft nl, skip + ii++; + } else if (in[ii] != '\n' && in[ii] != '\r') { // decode + char c = in[ii]; + char co; + if(c >= 'A' && c <= 'F') { + co = char((c - 'A' + 10) * 16); + } else if (c >= 'a' && c <= 'f') { + co = char((c - 'a' + 10) * 16); + } else if (c >= '0' && c <= '9') { + co = char((c - '0') * 16); + } else { + return false; + } + if(++ii >= 
in.length()) + break; + c = in[ii]; + if (c >= 'A' && c <= 'F') { + co += char(c - 'A' + 10); + } else if (c >= 'a' && c <= 'f') { + co += char(c - 'a' + 10); + } else if (c >= '0' && c <= '9') { + co += char(c - '0'); + } else { + return false; + } + out += co; + } + } else { + out += in[ii]; + } } return true; } // Decode an word encoded as quoted printable or base 64 static bool rfc2047_decodeParsed(const std::string& charset, - const std::string& encoding, - const std::string& value, - std::string &utf8) + const std::string& encoding, + const std::string& value, + std::string &utf8) { DPRINT((stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n", - charset.c_str(), encoding.c_str(), value.c_str())); + charset.c_str(), encoding.c_str(), value.c_str())); utf8.clear(); string decoded; if (!stringlowercmp("b", encoding)) { - if (!base64_decode(value, decoded)) - return false; - DPRINT((stderr, "FromB64: [%s]\n", decoded.c_str())); + if (!base64_decode(value, decoded)) + return false; + DPRINT((stderr, "FromB64: [%s]\n", decoded.c_str())); } else if (!stringlowercmp("q", encoding)) { - if (!qp_decode(value, decoded)) - return false; - // Need to translate _ to ' ' here - string temp; - for (string::size_type pos = 0; pos < decoded.length(); pos++) - if (decoded[pos] == '_') - temp += ' '; - else - temp += decoded[pos]; - decoded = temp; - DPRINT((stderr, "FromQP: [%s]\n", decoded.c_str())); + if (!qp_decode(value, decoded)) + return false; + // Need to translate _ to ' ' here + string temp; + for (string::size_type pos = 0; pos < decoded.length(); pos++) + if (decoded[pos] == '_') + temp += ' '; + else + temp += decoded[pos]; + decoded = temp; + DPRINT((stderr, "FromQP: [%s]\n", decoded.c_str())); } else { - DPRINT((stderr, "Bad encoding [%s]\n", encoding.c_str())); - return false; + DPRINT((stderr, "Bad encoding [%s]\n", encoding.c_str())); + return false; } if (!transcode(decoded, utf8, charset, "UTF-8")) { - DPRINT((stderr, "Transcode failed\n")); - return 
false; + DPRINT((stderr, "Transcode failed\n")); + return false; } return true; } @@ -470,8 +470,8 @@ static bool rfc2047_decodeParsed(const std::string& charset, // - We should turn off decoding while inside quoted strings // typedef enum {rfc2047ready, rfc2047open_eq, - rfc2047charset, rfc2047encoding, - rfc2047value, rfc2047close_q} Rfc2047States; + rfc2047charset, rfc2047encoding, + rfc2047value, rfc2047close_q} Rfc2047States; bool rfc2047_decode(const std::string& in, std::string &out) { @@ -483,106 +483,106 @@ bool rfc2047_decode(const std::string& in, std::string &out) out.clear(); for (string::size_type ii = 0; ii < in.length(); ii++) { - char ch = in[ii]; - switch (state) { - case rfc2047ready: - { - DPRINT((stderr, "STATE: ready, ch %c\n", ch)); - switch (ch) { - // Whitespace: stay ready - case ' ': case ' ': value += ch;break; - // '=' -> forward to next state - case '=': state = rfc2047open_eq; break; - DPRINT((stderr, "STATE: open_eq\n")); - // Other: go back to sleep - default: value += ch; state = rfc2047ready; - } - } - break; - case rfc2047open_eq: - { - DPRINT((stderr, "STATE: open_eq, ch %c\n", ch)); - switch (ch) { - case '?': - { - // Transcode current (unencoded part) value: - // we sometimes find 8-bit chars in - // there. Interpret as Iso8859. 
- if (value.length() > 0) { - transcode(value, utf8, "ISO-8859-1", "UTF-8"); - out += utf8; - value.clear(); - } - state = rfc2047charset; - } - break; - default: state = rfc2047ready; out += '='; out += ch;break; - } - } - break; - case rfc2047charset: - { - DPRINT((stderr, "STATE: charset, ch %c\n", ch)); - switch (ch) { - case '?': state = rfc2047encoding; break; - default: charset += ch; break; - } - } - break; - case rfc2047encoding: - { - DPRINT((stderr, "STATE: encoding, ch %c\n", ch)); - switch (ch) { - case '?': state = rfc2047value; break; - default: encoding += ch; break; - } - } - break; - case rfc2047value: - { - DPRINT((stderr, "STATE: value, ch %c\n", ch)); - switch (ch) { - case '?': state = rfc2047close_q; break; - default: value += ch;break; - } - } - break; - case rfc2047close_q: - { - DPRINT((stderr, "STATE: close_q, ch %c\n", ch)); - switch (ch) { - case '=': - { - DPRINT((stderr, "End of encoded area. Charset %s, Encoding %s\n", charset.c_str(), encoding.c_str())); - string utf8; - state = rfc2047ready; - if (!rfc2047_decodeParsed(charset, encoding, value, - utf8)) { - return false; - } - out += utf8; - charset.clear(); - encoding.clear(); - value.clear(); - } - break; - default: state = rfc2047value; value += '?';value += ch;break; - } - } - break; - default: // ?? + char ch = in[ii]; + switch (state) { + case rfc2047ready: + { + DPRINT((stderr, "STATE: ready, ch %c\n", ch)); + switch (ch) { + // Whitespace: stay ready + case ' ': case '\t': value += ch;break; + // '=' -> forward to next state + case '=': state = rfc2047open_eq; break; + DPRINT((stderr, "STATE: open_eq\n")); + // Other: go back to sleep + default: value += ch; state = rfc2047ready; + } + } + break; + case rfc2047open_eq: + { + DPRINT((stderr, "STATE: open_eq, ch %c\n", ch)); + switch (ch) { + case '?': + { + // Transcode current (unencoded part) value: + // we sometimes find 8-bit chars in + // there. Interpret as Iso8859. 
+ if (value.length() > 0) { + transcode(value, utf8, "ISO-8859-1", "UTF-8"); + out += utf8; + value.clear(); + } + state = rfc2047charset; + } + break; + default: state = rfc2047ready; out += '='; out += ch;break; + } + } + break; + case rfc2047charset: + { + DPRINT((stderr, "STATE: charset, ch %c\n", ch)); + switch (ch) { + case '?': state = rfc2047encoding; break; + default: charset += ch; break; + } + } + break; + case rfc2047encoding: + { + DPRINT((stderr, "STATE: encoding, ch %c\n", ch)); + switch (ch) { + case '?': state = rfc2047value; break; + default: encoding += ch; break; + } + } + break; + case rfc2047value: + { + DPRINT((stderr, "STATE: value, ch %c\n", ch)); + switch (ch) { + case '?': state = rfc2047close_q; break; + default: value += ch;break; + } + } + break; + case rfc2047close_q: + { + DPRINT((stderr, "STATE: close_q, ch %c\n", ch)); + switch (ch) { + case '=': + { + DPRINT((stderr, "End of encoded area. Charset %s, Encoding %s\n", charset.c_str(), encoding.c_str())); + string utf8; + state = rfc2047ready; + if (!rfc2047_decodeParsed(charset, encoding, value, + utf8)) { + return false; + } + out += utf8; + charset.clear(); + encoding.clear(); + value.clear(); + } + break; + default: state = rfc2047value; value += '?';value += ch;break; + } + } + break; + default: // ?? DPRINT((stderr, "STATE: default ?? 
ch %c\n", ch)); - return false; - } + return false; + } } if (value.length() > 0) { - transcode(value, utf8, "ISO-8859-1", "UTF-8"); - out += utf8; - value.clear(); + transcode(value, utf8, "ISO-8859-1", "UTF-8"); + out += utf8; + value.clear(); } if (state != rfc2047ready) - return false; + return false; return true; } @@ -604,16 +604,16 @@ time_t rfc2822DateToUxTime(const string& dt) vector toks; string::size_type idx; if ((idx = dt.find_first_of(",")) != string::npos) { - if (idx == dt.length() - 1) { - DATEDEB((stderr, "Bad rfc822 date format (short1): [%s]\n", - dt.c_str())); - return (time_t)-1; - } - string date = dt.substr(idx+1); - stringToTokens(date, toks, " \t:"); + if (idx == dt.length() - 1) { + DATEDEB((stderr, "Bad rfc822 date format (short1): [%s]\n", + dt.c_str())); + return (time_t)-1; + } + string date = dt.substr(idx+1); + stringToTokens(date, toks, " \t:"); } else { // No comma. Enter strangeland - stringToTokens(dt, toks, " \t:"); + stringToTokens(dt, toks, " \t:"); // Test for date like: Sun Nov 19 06:18:41 2006 // 0 1 2 3 4 5 6 // and change to: 19 Nov 2006 06:18:41 @@ -629,20 +629,20 @@ time_t rfc2822DateToUxTime(const string& dt) #if DEBUGDATE for (list::iterator it = toks.begin(); it != toks.end(); it++) { - DATEDEB((stderr, "[%s] ", it->c_str())); + DATEDEB((stderr, "[%s] ", it->c_str())); } DATEDEB((stderr, "\n")); #endif if (toks.size() < 6) { - DATEDEB((stderr, "Bad rfc822 date format (toks cnt): [%s]\n", - dt.c_str())); - return (time_t)-1; + DATEDEB((stderr, "Bad rfc822 date format (toks cnt): [%s]\n", + dt.c_str())); + return (time_t)-1; } if (toks.size() == 6) { - // Probably no timezone, sometimes happens - toks.push_back("+0000"); + // Probably no timezone, sometimes happens + toks.push_back("+0000"); } struct tm tm; @@ -660,20 +660,20 @@ time_t rfc2822DateToUxTime(const string& dt) // Month. Only Jan-Dec are legal. January, February do happen // though. 
Convert to 0-11 if (*it == "Jan" || *it == "January") tm.tm_mon = 0; else if - (*it == "Feb" || *it == "February") tm.tm_mon = 1; else if - (*it == "Mar" || *it == "March") tm.tm_mon = 2; else if - (*it == "Apr" || *it == "April") tm.tm_mon = 3; else if - (*it == "May") tm.tm_mon = 4; else if - (*it == "Jun" || *it == "June") tm.tm_mon = 5; else if - (*it == "Jul" || *it == "July") tm.tm_mon = 6; else if - (*it == "Aug" || *it == "August") tm.tm_mon = 7; else if - (*it == "Sep" || *it == "September") tm.tm_mon = 8; else if - (*it == "Oct" || *it == "October") tm.tm_mon = 9; else if - (*it == "Nov" || *it == "November") tm.tm_mon = 10; else if - (*it == "Dec" || *it == "December") tm.tm_mon = 11; else { - DATEDEB((stderr, "Bad rfc822 date format (month): [%s]\n", - dt.c_str())); - return (time_t)-1; + (*it == "Feb" || *it == "February") tm.tm_mon = 1; else if + (*it == "Mar" || *it == "March") tm.tm_mon = 2; else if + (*it == "Apr" || *it == "April") tm.tm_mon = 3; else if + (*it == "May") tm.tm_mon = 4; else if + (*it == "Jun" || *it == "June") tm.tm_mon = 5; else if + (*it == "Jul" || *it == "July") tm.tm_mon = 6; else if + (*it == "Aug" || *it == "August") tm.tm_mon = 7; else if + (*it == "Sep" || *it == "September") tm.tm_mon = 8; else if + (*it == "Oct" || *it == "October") tm.tm_mon = 9; else if + (*it == "Nov" || *it == "November") tm.tm_mon = 10; else if + (*it == "Dec" || *it == "December") tm.tm_mon = 11; else { + DATEDEB((stderr, "Bad rfc822 date format (month): [%s]\n", + dt.c_str())); + return (time_t)-1; } it++; @@ -687,67 +687,67 @@ time_t rfc2822DateToUxTime(const string& dt) tm.tm_year += 1900; } if (tm.tm_year > 1900) - tm.tm_year -= 1900; + tm.tm_year -= 1900; it++; // Hour minute second need no adjustments tm.tm_hour = atoi(it->c_str()); it++; tm.tm_min = atoi(it->c_str()); it++; - tm.tm_sec = atoi(it->c_str()); it++; + tm.tm_sec = atoi(it->c_str()); it++; // Timezone is supposed to be either +-XYZT or a zone name int zonesecs = 0; if 
(it->length() < 1) { - DATEDEB((stderr, "Bad rfc822 date format (zlen): [%s]\n", dt.c_str())); - return (time_t)-1; + DATEDEB((stderr, "Bad rfc822 date format (zlen): [%s]\n", dt.c_str())); + return (time_t)-1; } if (it->at(0) == '-' || it->at(0) == '+') { - // Note that +xy:zt (instead of +xyzt) sometimes happen, we - // may want to process it one day - if (it->length() < 5) { - DATEDEB((stderr, "Bad rfc822 date format (zlen1): [%s]\n", - dt.c_str())); - goto nozone; - } - zonesecs = 3600*((it->at(1)-'0') * 10 + it->at(2)-'0')+ - (it->at(3)-'0')*10 + it->at(4)-'0'; - zonesecs = it->at(0) == '+' ? -1 * zonesecs : zonesecs; + // Note that +xy:zt (instead of +xyzt) sometimes happen, we + // may want to process it one day + if (it->length() < 5) { + DATEDEB((stderr, "Bad rfc822 date format (zlen1): [%s]\n", + dt.c_str())); + goto nozone; + } + zonesecs = 3600*((it->at(1)-'0') * 10 + it->at(2)-'0')+ + (it->at(3)-'0')*10 + it->at(4)-'0'; + zonesecs = it->at(0) == '+' ? -1 * zonesecs : zonesecs; } else { - int hours; - if (*it == "A") hours= 1; else if (*it == "B") hours= 2; - else if (*it == "C") hours= 3; else if (*it == "D") hours= 4; - else if (*it == "E") hours= 5; else if (*it == "F") hours= 6; - else if (*it == "G") hours= 7; else if (*it == "H") hours= 8; - else if (*it == "I") hours= 9; else if (*it == "K") hours= 10; - else if (*it == "L") hours= 11; else if (*it == "M") hours= 12; - else if (*it == "N") hours= -1; else if (*it == "O") hours= -2; - else if (*it == "P") hours= -3; else if (*it == "Q") hours= -4; - else if (*it == "R") hours= -5; else if (*it == "S") hours= -6; - else if (*it == "T") hours= -7; else if (*it == "U") hours= -8; - else if (*it == "V") hours= -9; else if (*it == "W") hours= -10; - else if (*it == "X") hours= -11; else if (*it == "Y") hours= -12; - else if (*it == "Z") hours= 0; else if (*it == "UT") hours= 0; - else if (*it == "GMT") hours= 0; else if (*it == "EST") hours= 5; - else if (*it == "EDT") hours= 4; else if (*it == "CST") 
hours= 6; - else if (*it == "CDT") hours= 5; else if (*it == "MST") hours= 7; - else if (*it == "MDT") hours= 6; else if (*it == "PST") hours= 8; - else if (*it == "PDT") hours= 7; - // Non standard names - // Standard Time (or Irish Summer Time?) is actually +5.5 - else if (*it == "CET") hours= -1; else if (*it == "JST") hours= -9; - else if (*it == "IST") hours= -5; else if (*it == "WET") hours= 0; - else if (*it == "MET") hours= -1; - else { - DATEDEB((stderr, "Bad rfc822 date format (zname): [%s]\n", - dt.c_str())); - // Forget tz - goto nozone; - } - zonesecs = 3600 * hours; + int hours; + if (*it == "A") hours= 1; else if (*it == "B") hours= 2; + else if (*it == "C") hours= 3; else if (*it == "D") hours= 4; + else if (*it == "E") hours= 5; else if (*it == "F") hours= 6; + else if (*it == "G") hours= 7; else if (*it == "H") hours= 8; + else if (*it == "I") hours= 9; else if (*it == "K") hours= 10; + else if (*it == "L") hours= 11; else if (*it == "M") hours= 12; + else if (*it == "N") hours= -1; else if (*it == "O") hours= -2; + else if (*it == "P") hours= -3; else if (*it == "Q") hours= -4; + else if (*it == "R") hours= -5; else if (*it == "S") hours= -6; + else if (*it == "T") hours= -7; else if (*it == "U") hours= -8; + else if (*it == "V") hours= -9; else if (*it == "W") hours= -10; + else if (*it == "X") hours= -11; else if (*it == "Y") hours= -12; + else if (*it == "Z") hours= 0; else if (*it == "UT") hours= 0; + else if (*it == "GMT") hours= 0; else if (*it == "EST") hours= 5; + else if (*it == "EDT") hours= 4; else if (*it == "CST") hours= 6; + else if (*it == "CDT") hours= 5; else if (*it == "MST") hours= 7; + else if (*it == "MDT") hours= 6; else if (*it == "PST") hours= 8; + else if (*it == "PDT") hours= 7; + // Non standard names + // Standard Time (or Irish Summer Time?) 
is actually +5.5 + else if (*it == "CET") hours= -1; else if (*it == "JST") hours= -9; + else if (*it == "IST") hours= -5; else if (*it == "WET") hours= 0; + else if (*it == "MET") hours= -1; + else { + DATEDEB((stderr, "Bad rfc822 date format (zname): [%s]\n", + dt.c_str())); + // Forget tz + goto nozone; + } + zonesecs = 3600 * hours; } DATEDEB((stderr, "Tz: [%s] -> %d\n", it->c_str(), zonesecs)); - nozone: +nozone: // Compute the UTC Unix time value #ifndef sun @@ -792,14 +792,14 @@ extern time_t rfc2822DateToUxTime(const string& date); static const char *thisprog; static char usage [] = -"-p: header value and parameter test\n" -"-q: qp decoding\n" -"-b: base64\n" -"-7: rfc2047\n" -"-1: rfc2331\n" -"-t: date time\n" -" \n\n" -; + "-p: header value and parameter test\n" + "-q: qp decoding\n" + "-b: base64\n" + "-7: rfc2047\n" + "-1: rfc2331\n" + "-t: date time\n" + " \n\n" + ; static void Usage(void) { @@ -809,8 +809,8 @@ Usage(void) static int op_flags; #define OPT_MOINS 0x1 -#define OPT_p 0x2 -#define OPT_q 0x4 +#define OPT_p 0x2 +#define OPT_q 0x4 #define OPT_b 0x8 #define OPT_7 0x10 #define OPT_1 0x20 @@ -818,159 +818,168 @@ static int op_flags; int main(int argc, const char **argv) { - int count = 10; + int count = 10; - thisprog = argv[0]; - argc--; argv++; + thisprog = argv[0]; + argc--; argv++; - while (argc > 0 && **argv == '-') { - (*argv)++; - if (!(**argv)) - /* Cas du "adb - core" */ - Usage(); - while (**argv) - switch (*(*argv)++) { - case 'p': op_flags |= OPT_p; break; - case 'q': op_flags |= OPT_q; break; - case 'b': op_flags |= OPT_b; break; - case '1': op_flags |= OPT_1; break; - case '7': op_flags |= OPT_7; break; - case 't': op_flags |= OPT_t; break; - default: Usage(); break; - } - b1: argc--; argv++; - } + while (argc > 0 && **argv == '-') { + (*argv)++; + if (!(**argv)) + /* Cas du "adb - core" */ + Usage(); + while (**argv) + switch (*(*argv)++) { + case 'p': op_flags |= OPT_p; break; + case 'q': op_flags |= OPT_q; break; + case 'b': 
op_flags |= OPT_b; break; + case '1': op_flags |= OPT_1; break; + case '7': op_flags |= OPT_7; break; + case 't': op_flags |= OPT_t; break; + default: Usage(); break; + } + b1: argc--; argv++; + } - if (argc != 0) - Usage(); + if (argc != 0) + Usage(); - if (op_flags & OPT_p) { - // Mime header value and parameters extraction - const char *tr[] = { - "text/html;charset = UTF-8 ; otherparam=garb; \n" - "QUOTEDPARAM=\"quoted value\"", + if (op_flags & OPT_p) { + // Mime header value and parameters extraction + const char *tr[] = { + "text/html;charset = UTF-8 ; otherparam=garb; \n" + "QUOTEDPARAM=\"quoted value\"", - "text/plain; charset=ASCII\r\n name=\"809D3016_5691DPS_5.2.LIC\"", + "text/plain; charset=ASCII\r\n name=\"809D3016_5691DPS_5.2.LIC\"", - "application/x-stuff;" - "title*0*=us-ascii'en'This%20is%20even%20more%20;" - "title*1*=%2A%2A%2Afun%2A%2A%2A%20;" - "title*2=\"isn't it!\"" - }; + "application/x-stuff;" + "title*0*=us-ascii'en'This%20is%20even%20more%20;" + "title*1*=%2A%2A%2Afun%2A%2A%2A%20;" + "title*2=\"isn't it!\"", + + // The following are all invalid, trying to crash the parser... + "", + // This does not parse because of whitespace in the value. 
+ " complete garbage;", + // This parses, but only the first word gets into the value + " some value", + " word ;", ";", "=", "; = ", "a;=\"toto tutu\"=", ";;;;a=b", + }; - for (unsigned int i = 0; i < sizeof(tr) / sizeof(char *); i++) { - MimeHeaderValue parsed; - if (!parseMimeHeaderValue(tr[i], parsed)) { - fprintf(stderr, "PARSE ERROR for [%s]\n", tr[i]); - } - printf("Field value: [%s]\n", parsed.value.c_str()); - map::iterator it; - for (it = parsed.params.begin();it != parsed.params.end();it++) { - if (it == parsed.params.begin()) - printf("Parameters:\n"); - printf(" [%s] = [%s]\n", it->first.c_str(), it->second.c_str()); - } - } + for (unsigned int i = 0; i < sizeof(tr) / sizeof(char *); i++) { + MimeHeaderValue parsed; + if (!parseMimeHeaderValue(tr[i], parsed)) { + fprintf(stderr, "PARSE ERROR for [%s]\n", tr[i]); + continue; + } + printf("Field value: [%s]\n", parsed.value.c_str()); + map::iterator it; + for (it = parsed.params.begin();it != parsed.params.end();it++) { + if (it == parsed.params.begin()) + printf("Parameters:\n"); + printf(" [%s] = [%s]\n", it->first.c_str(), it->second.c_str()); + } + } - } else if (op_flags & OPT_q) { - // Quoted printable stuff - const char *qp = - "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme " - "agrave is: '=E0' probable skipped decode error: =\n" - "Actual decode error =xx this wont show"; + } else if (op_flags & OPT_q) { + // Quoted printable stuff + const char *qp = + "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme " + "agrave is: '=E0' probable skipped decode error: =\n" + "Actual decode error =xx this wont show"; - string out; - if (!qp_decode(string(qp), out)) { - fprintf(stderr, "qp_decode returned error\n"); - } - printf("Decoded: '%s'\n", out.c_str()); - } else if (op_flags & OPT_b) { - // Base64 - //'C'est à boire qu'il nous faut éviter l'excès.' - //'Deuxième ligne' - //'Troisième ligne' - //'Et la fin (pas de nl). 
' - const char *b64 = - "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n" - "IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n"; + string out; + if (!qp_decode(string(qp), out)) { + fprintf(stderr, "qp_decode returned error\n"); + } + printf("Decoded: '%s'\n", out.c_str()); + } else if (op_flags & OPT_b) { + // Base64 + //'C'est à boire qu'il nous faut éviter l'excès.' + //'Deuxième ligne' + //'Troisième ligne' + //'Et la fin (pas de nl). ' + const char *b64 = + "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n" + "IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n"; - string out; - if (!base64_decode(string(b64), out)) { - fprintf(stderr, "base64_decode returned error\n"); - exit(1); - } - printf("Decoded: [%s]\n", out.c_str()); + string out; + if (!base64_decode(string(b64), out)) { + fprintf(stderr, "base64_decode returned error\n"); + exit(1); + } + printf("Decoded: [%s]\n", out.c_str()); #if 0 - string coded, decoded; - const char *fname = "/tmp/recoll_decodefail"; - if (!file_to_string(fname, coded)) { - fprintf(stderr, "Cant read %s\n", fname); - exit(1); - } + string coded, decoded; + const char *fname = "/tmp/recoll_decodefail"; + if (!file_to_string(fname, coded)) { + fprintf(stderr, "Cant read %s\n", fname); + exit(1); + } - if (!base64_decode(coded, decoded)) { - fprintf(stderr, "base64_decode returned error\n"); - exit(1); - } - printf("Decoded: [%s]\n", decoded.c_str()); + if (!base64_decode(coded, decoded)) { + fprintf(stderr, "base64_decode returned error\n"); + exit(1); + } + printf("Decoded: [%s]\n", decoded.c_str()); #endif - } else if (op_flags & (OPT_7|OPT_1)) { - // rfc2047 - char line [1024]; - string out; - bool res; - while (fgets(line, 1023, stdin)) { - int l = strlen(line); - if (l == 0) - continue; - line[l-1] = 0; - fprintf(stderr, "Line: [%s]\n", line); - string charset; - if (op_flags & OPT_7) { - res = rfc2047_decode(line, out); - } else { - 
res = rfc2231_decode(line, out, charset); - } - if (res) - fprintf(stderr, "Out: [%s] cs %s\n", out.c_str(), charset.c_str()); - else - fprintf(stderr, "Decoding failed\n"); - } - } else if (op_flags & OPT_t) { - time_t t; - - const char *dates[] = { - " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)", - " Mon, 3 Jul 2006 09:51:58 +0200", - " Wed, 13 Sep 2006 08:19:48 GMT-07:00", - " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)", - " Sat, 23 Dec 89 19:27:12 EST", - " 13 Jan 90 08:23:29 GMT"}; + } else if (op_flags & (OPT_7|OPT_1)) { + // rfc2047 + char line [1024]; + string out; + bool res; + while (fgets(line, 1023, stdin)) { + int l = strlen(line); + if (l == 0) + continue; + line[l-1] = 0; + fprintf(stderr, "Line: [%s]\n", line); + string charset; + if (op_flags & OPT_7) { + res = rfc2047_decode(line, out); + } else { + res = rfc2231_decode(line, out, charset); + } + if (res) + fprintf(stderr, "Out: [%s] cs %s\n", out.c_str(), charset.c_str()); + else + fprintf(stderr, "Decoding failed\n"); + } + } else if (op_flags & OPT_t) { + time_t t; + + const char *dates[] = { + " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)", + " Mon, 3 Jul 2006 09:51:58 +0200", + " Wed, 13 Sep 2006 08:19:48 GMT-07:00", + " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)", + " Sat, 23 Dec 89 19:27:12 EST", + " 13 Jan 90 08:23:29 GMT"}; - for (unsigned int i = 0; i [%s]\n", dates[i], datebuf); - } - printf("Enter date:\n"); - char line [1024]; - while (fgets(line, 1023, stdin)) { - int l = strlen(line); - if (l == 0) continue; - line[l-1] = 0; - t = rfc2822DateToUxTime(line); - struct tm *tm = localtime(&t); - char datebuf[100]; - strftime(datebuf, 99, " %Y-%m-%d %H:%M:%S %z", tm); - printf("[%s] -> [%s]\n", line, datebuf); - } + for (unsigned int i = 0; i [%s]\n", dates[i], datebuf); + } + printf("Enter date:\n"); + char line [1024]; + while (fgets(line, 1023, stdin)) { + int l = strlen(line); + if (l == 0) continue; + line[l-1] = 0; + t = rfc2822DateToUxTime(line); + struct tm *tm = localtime(&t); + char datebuf[100]; + 
strftime(datebuf, 99, " %Y-%m-%d %H:%M:%S %z", tm); + printf("[%s] -> [%s]\n", line, datebuf); + } - } - exit(0); + } + exit(0); } #endif // TEST_MIMEPARSE diff --git a/src/utils/mimeparse.h b/src/utils/mimeparse.h index de9e17a1..eb0f9866 100644 --- a/src/utils/mimeparse.h +++ b/src/utils/mimeparse.h @@ -17,32 +17,32 @@ #ifndef _MIME_H_INCLUDED_ #define _MIME_H_INCLUDED_ /* -Mime definitions RFC to 4-9-2006: + Mime definitions RFC to 4-9-2006: -2045 Multipurpose Internet Mail Extensions (MIME) Part One: Format of - Internet Message Bodies. N. Freed, N. Borenstein. November 1996. - (Format: TXT=72932 bytes) (Obsoletes RFC1521, RFC1522, RFC1590) - (Updated by RFC2184, RFC2231) (Status: DRAFT STANDARD) + 2045 Multipurpose Internet Mail Extensions (MIME) Part One: Format of + Internet Message Bodies. N. Freed, N. Borenstein. November 1996. + (Format: TXT=72932 bytes) (Obsoletes RFC1521, RFC1522, RFC1590) + (Updated by RFC2184, RFC2231) (Status: DRAFT STANDARD) -2046 Multipurpose Internet Mail Extensions (MIME) Part Two: Media - Types. N. Freed, N. Borenstein. November 1996. (Format: TXT=105854 - bytes) (Obsoletes RFC1521, RFC1522, RFC1590) (Updated by RFC2646, - RFC3798) (Status: DRAFT STANDARD) + 2046 Multipurpose Internet Mail Extensions (MIME) Part Two: Media + Types. N. Freed, N. Borenstein. November 1996. (Format: TXT=105854 + bytes) (Obsoletes RFC1521, RFC1522, RFC1590) (Updated by RFC2646, + RFC3798) (Status: DRAFT STANDARD) -2047 MIME (Multipurpose Internet Mail Extensions) Part Three: Message - Header Extensions for Non-ASCII Text. K. Moore. November 1996. - (Format: TXT=33262 bytes) (Obsoletes RFC1521, RFC1522, RFC1590) - (Updated by RFC2184, RFC2231) (Status: DRAFT STANDARD) + 2047 MIME (Multipurpose Internet Mail Extensions) Part Three: Message + Header Extensions for Non-ASCII Text. K. Moore. November 1996. 
+ (Format: TXT=33262 bytes) (Obsoletes RFC1521, RFC1522, RFC1590) + (Updated by RFC2184, RFC2231) (Status: DRAFT STANDARD) -2183 Communicating Presentation Information in Internet Messages: The - Content-Disposition Header Field. R. Troost, S. Dorner, K. Moore, - Ed.. August 1997. (Format: TXT=23150 bytes) (Updates RFC1806) - (Updated by RFC2184, RFC2231) (Status: PROPOSED STANDARD) + 2183 Communicating Presentation Information in Internet Messages: The + Content-Disposition Header Field. R. Troost, S. Dorner, K. Moore, + Ed.. August 1997. (Format: TXT=23150 bytes) (Updates RFC1806) + (Updated by RFC2184, RFC2231) (Status: PROPOSED STANDARD) -2231 MIME Parameter Value and Encoded Word Extensions: Character Sets, - Languages, and Continuations. N. Freed, K. Moore. November 1997. - (Format: TXT=19280 bytes) (Obsoletes RFC2184) (Updates RFC2045, - RFC2047, RFC2183) (Status: PROPOSED STANDARD) + 2231 MIME Parameter Value and Encoded Word Extensions: Character Sets, + Languages, and Continuations. N. Freed, K. Moore. November 1997. + (Format: TXT=19280 bytes) (Obsoletes RFC2184) (Updates RFC2045, + RFC2047, RFC2183) (Status: PROPOSED STANDARD) */ @@ -53,15 +53,11 @@ Mime definitions RFC to 4-9-2006: #include "base64.h" -#ifndef NO_NAMESPACES -using std::string; -#endif - /** A class to represent a MIME header value with parameters */ class MimeHeaderValue { - public: - string value; - std::map<string,string> params; +public: + std::string value; + std::map<std::string, std::string> params; }; /** @@ -70,14 +66,17 @@ class MimeHeaderValue { * @param in the input string should be like: value; pn1=pv1; pn2=pv2. * Example: text/plain; charset="iso-8859-1" */ -extern bool parseMimeHeaderValue(const string& in, MimeHeaderValue& psd); +extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd); /** - * Quoted printable decoding. Doubles up as rfc2231 decoder, hence the esc - * RFC2045 Quoted printable uses '=' , rfc2331 uses '%'. The two encodings are + * Quoted Printable decoding. 
+ * + * Doubles up as rfc2231 decoder, with the help of the @param esc + * parameter. + * RFC2045 Quoted Printable uses '=', RFC2231 uses '%'. The two encodings are * otherwise similar. */ -extern bool qp_decode(const string& in, string &out, char esc = '='); +extern bool qp_decode(const std::string& in, std::string &out, char esc = '='); /** Decode an Internet mail field value encoded according to rfc2047 * @@ -90,14 +89,14 @@ extern bool qp_decode(const string& in, string &out, char esc = '='); * @param in input string, ascii with rfc2047 markup * @return out output string encoded in utf-8 */ -extern bool rfc2047_decode(const string& in, string &out); +extern bool rfc2047_decode(const std::string& in, std::string &out); -/** Decode RFC2822 date to unix time (gmt secs from 1970 +/** Decode RFC2822 date to unix time (gmt secs from 1970) * * @param dt date string (the part after Date: ) * @return unix time */ -time_t rfc2822DateToUxTime(const string& dt); +time_t rfc2822DateToUxTime(const std::string& dt); #endif /* _MIME_H_INCLUDED_ */