decode encoded mail headers, plus use message date instead of file mtime

This commit is contained in:
dockes 2005-10-15 12:18:04 +00:00
parent 8493933aef
commit 763b5f58c7
4 changed files with 189 additions and 18 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.11 2005-04-06 10:20:11 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.12 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -172,10 +172,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
if (fis == FileInterner::FIError)
break;
// Set up common fields:
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_ctime));
doc.mtime = ascdate;
// Set the date if this was not done in the document handler
// (ie: date from Date: mail header).
if (doc.mtime.empty()) {
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_ctime));
doc.mtime = ascdate;
}
// Internal access path for multi-document files
doc.ipath = ipath;
// Do database-specific work to update document data

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.5 2005-04-06 10:20:11 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.6 2005-10-15 12:18:04 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
@ -172,30 +172,42 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
return MimeHandler::MHError;
}
// Handle some headers. We should process rfc2047 encoding here
// Also there should be no 8bit chars, but there sometimes are. So
// we transcode as if from iso-8859-1, which is better than
// getting utf8 conversion errors later on
// Handle some headers.
Binc::HeaderItem hi;
string transcoded;
if (doc.h.getFirstHeader("Subject", hi)) {
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
rfc2047_decode(hi.getValue(), transcoded);
docout.title = transcoded;
}
if (doc.h.getFirstHeader("From", hi)) {
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
rfc2047_decode(hi.getValue(), transcoded);
docout.text += string("From: ") + transcoded + string("\n");
}
if (doc.h.getFirstHeader("To", hi)) {
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
rfc2047_decode(hi.getValue(), transcoded);
docout.text += string("To: ") + transcoded + string("\n");
}
if (doc.h.getFirstHeader("Date", hi)) {
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
rfc2047_decode(hi.getValue(), transcoded);
// Try to set the mtime from the date field.
string date = transcoded;
string::size_type pos;
// Possibly get rid of the day
if ((pos = date.find(",")) != string::npos)
date = date.substr(pos+1);
struct tm tm;
if (strptime(date.c_str(), " %d %b %Y %H:%M:%S %z ", &tm)) {
char ascuxtime[100];
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
docout.mtime = ascuxtime;
} else {
LOGDEB(("strptime failed for [%s]\n", date.c_str()));
}
docout.text += string("Date: ") + transcoded + string("\n");
}
if (doc.h.getFirstHeader("Subject", hi)) {
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
rfc2047_decode(hi.getValue(), transcoded);
docout.text += string("Subject: ") + transcoded + string("\n");
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.4 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_MIMEPARSE
@ -348,6 +348,148 @@ bool base64_decode(const string& in, string& out)
return true;
}
#include "transcode.h"
#include "smallut.h"
// Decode a parsed encoded word
static bool rfc2047_decodeParsed(const std::string& charset,
const std::string& encoding,
const std::string& value,
std::string &utf8)
{
// fprintf(stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
// charset.c_str(), encoding.c_str(), value.c_str());
utf8 = "";
string decoded;
if (!stringlowercmp("b", encoding)) {
if (!base64_decode(value, decoded))
return false;
// fprintf(stderr, "FromB64: [%s]\n", decoded.c_str());
} else if (!stringlowercmp("q", encoding)) {
if (!qp_decode(value, decoded))
return false;
// Need to translate _ to ' ' here
string temp;
for (string::size_type pos = 0; pos < decoded.length(); pos++)
if (decoded[pos] == '_')
temp += ' ';
else
temp += decoded[pos];
decoded = temp;
// fprintf(stderr, "FromQP: [%s]\n", decoded.c_str());
} else {
// fprintf(stderr, "Bad encoding [%s]\n", encoding.c_str());
return false;
}
if (!transcode(decoded, utf8, charset, "UTF-8")) {
// fprintf(stderr, "Transcode failed\n");
return false;
}
return true;
}
// Parse a mail header encoded value
typedef enum {rfc2047base, rfc2047open_eq, rfc2047charset, rfc2047encoding,
rfc2047value, rfc2047close_q} Rfc2047States;
bool rfc2047_decode(const std::string& in, std::string &out)
{
Rfc2047States state = rfc2047base;
string encoding, charset, value, utf8;
out = "";
for (unsigned int ii = 0; ii < in.length(); ii++) {
char ch = in[ii];
switch (state) {
case rfc2047base:
{
switch (ch) {
case '=': state = rfc2047open_eq; break;
default: value += ch;
}
}
break;
case rfc2047open_eq:
{
switch (ch) {
case '?':
{
// Transcode current (unencoded part) value:
// we sometimes find 8-bit chars in
// there. Interpret as Iso8859.
if (value.length() > 0) {
transcode(value, utf8, "ISO8859-1", "UTF-8");
out += utf8;
value = "";
}
state = rfc2047charset;
}
break;
default: state = rfc2047base; out += '='; out += ch;break;
}
}
break;
case rfc2047charset:
{
switch (ch) {
case '?': state = rfc2047encoding; break;
default: charset += ch; break;
}
}
break;
case rfc2047encoding:
{
switch (ch) {
case '?': state = rfc2047value; break;
default: encoding += ch; break;
}
}
break;
case rfc2047value:
{
switch (ch) {
case '?': state = rfc2047close_q; break;
default: value += ch;break;
}
}
break;
case rfc2047close_q:
{
switch (ch) {
case '=':
{
string utf8;
state = rfc2047base;
if (!rfc2047_decodeParsed(charset, encoding, value,
utf8)) {
return false;
}
out += utf8;
charset = encoding = value = "";
}
break;
default: state = rfc2047value; value += '?';value += ch;break;
}
}
break;
default: // ??
return false;
}
}
if (value.length() > 0) {
transcode(value, utf8, "ISO8859-1", "UTF-8");
out += utf8;
value = "";
}
if (state != rfc2047base)
return false;
return true;
}
#else
#include <string>
@ -382,7 +524,7 @@ main(int argc, const char **argv)
fprintf(stderr, "qp_decode returned error\n");
}
printf("Decoded: '%s'\n", out.c_str());
#else
#elif 0
//'C'est à boire qu'il nous faut éviter l'excès.'
//'Deuxième ligne'
//'Troisième ligne'
@ -396,6 +538,18 @@ main(int argc, const char **argv)
fprintf(stderr, "base64_decode returned error\n");
}
printf("Decoded: '%s'\n", out.c_str());
#elif 1
char line [1024];
string out;
while (fgets(line, 1023, stdin)) {
int l = strlen(line);
if (l == 0)
continue;
line[l-1] = 0;
fprintf(stderr, "Line: [%s]\n", line);
rfc2047_decode(line, out);
fprintf(stderr, "Out: [%s]\n", out.c_str());
}
#endif
}

View File

@ -1,6 +1,6 @@
#ifndef _MIME_H_INCLUDED_
#define _MIME_H_INCLUDED_
/* @(#$Id: mimeparse.h,v 1.2 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mimeparse.h,v 1.3 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <map>
@ -15,5 +15,6 @@ extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd);
bool qp_decode(const std::string& in, std::string &out);
bool base64_decode(const std::string& in, std::string &out);
bool rfc2047_decode(const std::string& in, std::string &out);
#endif /* _MIME_H_INCLUDED_ */