diff --git a/src/internfile/Makefile b/src/internfile/Makefile index 07a950a4..abbf7e1f 100644 --- a/src/internfile/Makefile +++ b/src/internfile/Makefile @@ -1,9 +1,9 @@ -# @(#$Id: Makefile,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes +# @(#$Id: Makefile,v 1.3 2007-10-03 14:53:37 dockes Exp $ (C) 2005 J.F.Dockes depth = .. include $(depth)/mk/sysconf # Only test executables get build in here -PROGS = internfile +PROGS = mh_mbox internfile all: $(BIGLIB) $(PROGS) @@ -18,6 +18,16 @@ internfile : $(INTERNFILE_OBJS) trinternfile.o : internfile.cpp $(CXX) $(ALL_CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \ internfile.cpp + +MH_MBOX_OBJS= trmh_mbox.o $(BIGLIB) $(MIMELIB) +mh_mbox : $(MH_MBOX_OBJS) + $(CXX) $(ALL_CXXFLAGS) -o mh_mbox $(MH_MBOX_OBJS) \ + $(LIBICONV) $(LIBSYS) +trmh_mbox.o : mh_mbox.cpp + $(CXX) $(ALL_CXXFLAGS) -DTEST_MH_MBOX -c -o trmh_mbox.o \ + mh_mbox.cpp + + clean: rm -f *.o $(PROGS) diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp index 9607a3f6..13ae060d 100644 --- a/src/internfile/mh_mbox.cpp +++ b/src/internfile/mh_mbox.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.2 2007-10-03 14:53:37 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -17,7 +17,7 @@ static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ - +#ifndef TEST_MH_MBOX #include #include #include @@ -63,7 +63,70 @@ bool MimeHandlerMbox::set_document_file(const string &fn) return true; } -static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$"; +#define LL 1024 +typedef char line_type[LL+10]; +static inline void stripendnl(line_type& line, int& ll) +{ + ll = strlen(line); + while (ll > 0) { + if (line[ll-1] == '\n' || line[ll-1] == '\r') { + line[ll-1] = 0; + ll--; + } else + break; + } +} + +// The mbox format uses lines beginning with 'From ' as separators. +// Mailers are supposed to quote any other lines beginning with +// 'From ', turning it into '>From '. This should make it easy to detect +// message boundaries by matching a '^From ' regular expression. +// Unfortunately this quoting is quite often incorrect in the real world. +// +// The rest of the format for the line is somewhat variable, but there will +// be a 4-digit year somewhere... +// The canonical format is the following, with a 24-character date: +// From toto@tutu.com Sat Sep 30 16:44:06 2000 +// This resulted in the pattern used by versions up to 1.9.0: +// "^From .* [1-2][0-9][0-9][0-9]$" +// +// Some mailers add a time zone to the date; this is non-"standard", +// but it happens, as in: +// From toto@truc.com Sat Sep 30 16:44:06 2000 -0400 +// +// This is taken into account in the new regexp, which also matches more +// of the date format, to catch a few actual issues like +// From http://www.itu.int/newsroom/press/releases/1998/NP-2.html: +// Note that this *should* have been quoted. +// +// http://www.qmail.org/man/man5/mbox.html seems to indicate that the +// fact that From_ is normally preceded by a blank line should not be +// used, but we do it anyway (for now).
+// The same source indicates that arbitrary data can follow the date field +// +// A variety of pathological From_ lines: +// Bad date format: +// From uucp Wed May 22 11:28 GMT 1996 +// Added timezone at the end (ok, part of the "any data" after the date) +// From qian2@fas.harvard.edu Sat Sep 30 16:44:06 2000 -0400 +// Emacs VM botch? Adds tz between hour and year +// From dockes Wed Feb 23 10:31:20 +0100 2005 +// From dockes Fri Dec 1 20:36:39 +0100 2006 +// The modified regexp gives the exact same results on the ietf mail archive +// and on my own. +static const char *frompat = +#if 0 //1.9.0 + "^From .* [1-2][0-9][0-9][0-9]$"; +#endif +#if 1 +"^From[ ]+[^ ]+[ ]+" // From toto@tutu +"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Date +"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional +"([^ ]+[ ]+)?" // Optional tz +"[12][0-9][0-9][0-9]" // Year, unanchored, more data may follow + ; +#endif + // "([ ]+[-+][0-9]{4})?$" static regex_t fromregex; static bool regcompiled; @@ -81,14 +144,15 @@ bool MimeHandlerMbox::next_document() if (m_ipath != "") { sscanf(m_ipath.c_str(), "%d", &mtarg); } else if (m_forPreview) { - // Can't preview an mbox + // Can't preview an mbox. + LOGDEB(("MimeHandlerMbox::next_document: can't preview folders!\n")); return false; } - LOGDEB(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n", + LOGDEB0(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n", m_fn.c_str(), m_msgnum, mtarg)); if (!regcompiled) { - regcomp(&fromregex, frompat, REG_NOSUB); + regcomp(&fromregex, frompat, REG_NOSUB|REG_EXTENDED); regcompiled = true; } @@ -113,18 +177,27 @@ bool MimeHandlerMbox::next_document() do { // Look for next 'From ' Line, start of message. Set start to // line after this - char line[501]; + line_type line; for (;;) { - if (!fgets(line, 500, fp)) { + if (!fgets(line, LL, fp)) { // Eof hit while looking for 'From ' -> file done.
We'd need // another return code here + LOGDEB2(("MimeHandlerMbox:next: hit eof while looking for " + "start From_ line\n")); return false; } - if (line[0] == '\n' || line[0] == '\r') { + m_lineno++; + int ll; + stripendnl(line, ll); + LOGDEB2(("Start: hadempty %d ll %d Line: [%s]\n", + hademptyline, ll, line)); + if (ll <= 0) { hademptyline = true; continue; } if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { + LOGDEB0(("MimeHandlerMbox: From_ at line %d: [%s]\n", + m_lineno, line)); start = ftello(fp); m_msgnum++; break; @@ -135,32 +208,117 @@ bool MimeHandlerMbox::next_document() // Look for next 'From ' line or eof, end of message. for (;;) { end = ftello(fp); - if (!fgets(line, 500, fp)) { + if (!fgets(line, LL, fp)) { if (ferror(fp) || feof(fp)) iseof = true; break; } + m_lineno++; + int ll; + stripendnl(line, ll); + LOGDEB2(("End: hadempty %d ll %d Line: [%s]\n", + hademptyline, ll, line)); if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { + // Rewind to start of "From " line + fseek(fp, end, SEEK_SET); + m_lineno--; break; } if (mtarg <= 0 || m_msgnum == mtarg) { + line[ll] = '\n'; + line[ll+1] = 0; msgtxt += line; } - if (line[0] == '\n' || line[0] == '\r') { + if (ll <= 0) { hademptyline = true; } else { hademptyline = false; } } - fseek(fp, end, SEEK_SET); + } while (mtarg > 0 && m_msgnum < mtarg); - LOGDEB2(("Message text: [%s]\n", msgtxt.c_str())); + LOGDEB1(("Message text: [%s]\n", msgtxt.c_str())); char buf[20]; sprintf(buf, "%d", m_msgnum); m_metaData["ipath"] = buf; m_metaData["mimetype"] = "message/rfc822"; - if (iseof) + if (iseof) { + LOGDEB2(("MimeHandlerMbox::next: eof hit\n")); m_havedoc = false; + } return msgtxt.empty() ? 
false : true; } + +#else // Test driver -> + +#include +#include +#include +#include +#include + +#include +#include +using namespace std; + +#include "rclinit.h" +#include "mh_mbox.h" + +static char *thisprog; + +static char usage [] = +" \n\n" +; +static void +Usage(void) +{ + fprintf(stderr, "%s: usage:\n%s", thisprog, usage); + exit(1); +} + +int main(int argc, char **argv) +{ + thisprog = argv[0]; + argc--; argv++; + + while (argc > 0 && **argv == '-') { + (*argv)++; + if (!(**argv)) + /* Case of a lone "-" argument, as in "adb - core" */ + Usage(); + while (**argv) + switch (*(*argv)++) { + default: Usage(); break; + } + argc--; argv++; + } + + if (argc != 1) + Usage(); + string filename = *argv++;argc--; + string reason; + RclConfig *conf = recollinit(RclInitFlags(0), 0, 0, reason, 0); + if (conf == 0) { + cerr << "init failed " << reason << endl; + exit(1); + } + MimeHandlerMbox mh("text/x-mail"); + if (!mh.set_document_file(filename)) { + cerr << "set_document_file failed" << endl; + exit(1); + } + int docnt = 0; + while (mh.has_documents()) { + if (!mh.next_document()) { + cerr << "next_document failed" << endl; + exit(1); + } + docnt++; + } + cout << docnt << " documents found in " << filename << endl; + exit(0); +} + + +#endif // TEST_MH_MBOX diff --git a/src/internfile/mh_mbox.h b/src/internfile/mh_mbox.h index 392dbd1e..a5e8c719 100644 --- a/src/internfile/mh_mbox.h +++ b/src/internfile/mh_mbox.h @@ -16,7 +16,7 @@ */ #ifndef _MBOX_H_INCLUDED_ #define _MBOX_H_INCLUDED_ -/* @(#$Id: mh_mbox.h,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mbox.h,v 1.2 2007-10-03 14:53:37 dockes Exp $ (C) 2004 J.F.Dockes */ #include using std::string; @@ -31,7 +31,7 @@ using std::string; class MimeHandlerMbox : public RecollFilter { public: MimeHandlerMbox(const string& mime) - : RecollFilter(mime), m_vfp(0), m_msgnum(0) + : RecollFilter(mime), m_vfp(0), m_msgnum(0), m_lineno(0) {} virtual ~MimeHandlerMbox(); virtual bool set_document_file(const string &file_path); @@
-46,6 +46,7 @@ class MimeHandlerMbox : public RecollFilter { void *m_vfp; // File pointer for folder int m_msgnum; // Current message number in folder. Starts at 1 string m_ipath; + int m_lineno; // debug }; #endif /* _MBOX_H_INCLUDED_ */