decode encoded mail headers, plus use message date instead of file mtime
This commit is contained in:
parent
8493933aef
commit
763b5f58c7
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.11 2005-04-06 10:20:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.12 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -172,10 +172,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
if (fis == FileInterner::FIError)
|
||||
break;
|
||||
|
||||
// Set up common fields:
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||
doc.mtime = ascdate;
|
||||
// Set the date if this was not done in the document handler
|
||||
// (ie: date from Date: mail header).
|
||||
if (doc.mtime.empty()) {
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||
doc.mtime = ascdate;
|
||||
}
|
||||
// Internal access path for multi-document files
|
||||
doc.ipath = ipath;
|
||||
|
||||
// Do database-specific work to update document data
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.5 2005-04-06 10:20:11 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.6 2005-10-15 12:18:04 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
@ -172,30 +172,42 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
|
||||
// Handle some headers. We should process rfc2047 encoding here
|
||||
// Also there should be no 8bit chars, but there sometimes are. So
|
||||
// we transcode as if from iso-8859-1, which is better than
|
||||
// getting utf8 conversion errors later on
|
||||
// Handle some headers.
|
||||
Binc::HeaderItem hi;
|
||||
string transcoded;
|
||||
if (doc.h.getFirstHeader("Subject", hi)) {
|
||||
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
docout.title = transcoded;
|
||||
}
|
||||
if (doc.h.getFirstHeader("From", hi)) {
|
||||
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
docout.text += string("From: ") + transcoded + string("\n");
|
||||
}
|
||||
if (doc.h.getFirstHeader("To", hi)) {
|
||||
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
docout.text += string("To: ") + transcoded + string("\n");
|
||||
}
|
||||
if (doc.h.getFirstHeader("Date", hi)) {
|
||||
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
// Try to set the mtime from the date field.
|
||||
string date = transcoded;
|
||||
string::size_type pos;
|
||||
// Possibly get rid of the day
|
||||
if ((pos = date.find(",")) != string::npos)
|
||||
date = date.substr(pos+1);
|
||||
struct tm tm;
|
||||
if (strptime(date.c_str(), " %d %b %Y %H:%M:%S %z ", &tm)) {
|
||||
char ascuxtime[100];
|
||||
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
||||
docout.mtime = ascuxtime;
|
||||
} else {
|
||||
LOGDEB(("strptime failed for [%s]\n", date.c_str()));
|
||||
}
|
||||
|
||||
docout.text += string("Date: ") + transcoded + string("\n");
|
||||
}
|
||||
if (doc.h.getFirstHeader("Subject", hi)) {
|
||||
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
docout.text += string("Subject: ") + transcoded + string("\n");
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.4 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#ifndef TEST_MIMEPARSE
|
||||
@ -348,6 +348,148 @@ bool base64_decode(const string& in, string& out)
|
||||
return true;
|
||||
}
|
||||
|
||||
#include "transcode.h"
|
||||
#include "smallut.h"
|
||||
|
||||
// Decode a parsed encoded word
|
||||
static bool rfc2047_decodeParsed(const std::string& charset,
|
||||
const std::string& encoding,
|
||||
const std::string& value,
|
||||
std::string &utf8)
|
||||
{
|
||||
// fprintf(stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
|
||||
// charset.c_str(), encoding.c_str(), value.c_str());
|
||||
utf8 = "";
|
||||
|
||||
string decoded;
|
||||
if (!stringlowercmp("b", encoding)) {
|
||||
if (!base64_decode(value, decoded))
|
||||
return false;
|
||||
// fprintf(stderr, "FromB64: [%s]\n", decoded.c_str());
|
||||
} else if (!stringlowercmp("q", encoding)) {
|
||||
if (!qp_decode(value, decoded))
|
||||
return false;
|
||||
// Need to translate _ to ' ' here
|
||||
string temp;
|
||||
for (string::size_type pos = 0; pos < decoded.length(); pos++)
|
||||
if (decoded[pos] == '_')
|
||||
temp += ' ';
|
||||
else
|
||||
temp += decoded[pos];
|
||||
decoded = temp;
|
||||
// fprintf(stderr, "FromQP: [%s]\n", decoded.c_str());
|
||||
} else {
|
||||
// fprintf(stderr, "Bad encoding [%s]\n", encoding.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!transcode(decoded, utf8, charset, "UTF-8")) {
|
||||
// fprintf(stderr, "Transcode failed\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Parse a mail header encoded value
|
||||
typedef enum {rfc2047base, rfc2047open_eq, rfc2047charset, rfc2047encoding,
|
||||
rfc2047value, rfc2047close_q} Rfc2047States;
|
||||
|
||||
bool rfc2047_decode(const std::string& in, std::string &out)
|
||||
{
|
||||
Rfc2047States state = rfc2047base;
|
||||
string encoding, charset, value, utf8;
|
||||
|
||||
out = "";
|
||||
|
||||
for (unsigned int ii = 0; ii < in.length(); ii++) {
|
||||
char ch = in[ii];
|
||||
switch (state) {
|
||||
case rfc2047base:
|
||||
{
|
||||
switch (ch) {
|
||||
case '=': state = rfc2047open_eq; break;
|
||||
default: value += ch;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case rfc2047open_eq:
|
||||
{
|
||||
switch (ch) {
|
||||
case '?':
|
||||
{
|
||||
// Transcode current (unencoded part) value:
|
||||
// we sometimes find 8-bit chars in
|
||||
// there. Interpret as Iso8859.
|
||||
if (value.length() > 0) {
|
||||
transcode(value, utf8, "ISO8859-1", "UTF-8");
|
||||
out += utf8;
|
||||
value = "";
|
||||
}
|
||||
state = rfc2047charset;
|
||||
}
|
||||
break;
|
||||
default: state = rfc2047base; out += '='; out += ch;break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case rfc2047charset:
|
||||
{
|
||||
switch (ch) {
|
||||
case '?': state = rfc2047encoding; break;
|
||||
default: charset += ch; break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case rfc2047encoding:
|
||||
{
|
||||
switch (ch) {
|
||||
case '?': state = rfc2047value; break;
|
||||
default: encoding += ch; break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case rfc2047value:
|
||||
{
|
||||
switch (ch) {
|
||||
case '?': state = rfc2047close_q; break;
|
||||
default: value += ch;break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case rfc2047close_q:
|
||||
{
|
||||
switch (ch) {
|
||||
case '=':
|
||||
{
|
||||
string utf8;
|
||||
state = rfc2047base;
|
||||
if (!rfc2047_decodeParsed(charset, encoding, value,
|
||||
utf8)) {
|
||||
return false;
|
||||
}
|
||||
out += utf8;
|
||||
charset = encoding = value = "";
|
||||
}
|
||||
break;
|
||||
default: state = rfc2047value; value += '?';value += ch;break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default: // ??
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (value.length() > 0) {
|
||||
transcode(value, utf8, "ISO8859-1", "UTF-8");
|
||||
out += utf8;
|
||||
value = "";
|
||||
}
|
||||
if (state != rfc2047base)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include <string>
|
||||
@ -382,7 +524,7 @@ main(int argc, const char **argv)
|
||||
fprintf(stderr, "qp_decode returned error\n");
|
||||
}
|
||||
printf("Decoded: '%s'\n", out.c_str());
|
||||
#else
|
||||
#elif 0
|
||||
//'C'est à boire qu'il nous faut éviter l'excès.'
|
||||
//'Deuxième ligne'
|
||||
//'Troisième ligne'
|
||||
@ -396,6 +538,18 @@ main(int argc, const char **argv)
|
||||
fprintf(stderr, "base64_decode returned error\n");
|
||||
}
|
||||
printf("Decoded: '%s'\n", out.c_str());
|
||||
#elif 1
|
||||
char line [1024];
|
||||
string out;
|
||||
while (fgets(line, 1023, stdin)) {
|
||||
int l = strlen(line);
|
||||
if (l == 0)
|
||||
continue;
|
||||
line[l-1] = 0;
|
||||
fprintf(stderr, "Line: [%s]\n", line);
|
||||
rfc2047_decode(line, out);
|
||||
fprintf(stderr, "Out: [%s]\n", out.c_str());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _MIME_H_INCLUDED_
|
||||
#define _MIME_H_INCLUDED_
|
||||
/* @(#$Id: mimeparse.h,v 1.2 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mimeparse.h,v 1.3 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
@ -15,5 +15,6 @@ extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd);
|
||||
|
||||
bool qp_decode(const std::string& in, std::string &out);
|
||||
bool base64_decode(const std::string& in, std::string &out);
|
||||
bool rfc2047_decode(const std::string& in, std::string &out);
|
||||
|
||||
#endif /* _MIME_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user