From f10e14658f60e098a7a4cf8db235d01eab7c2806 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 22 Nov 2010 16:25:25 +0100 Subject: [PATCH] add quirks flag to handle bad thunderbird mbox formats --- src/internfile/mh_mbox.cpp | 65 ++++++++++++++++++++++------------- src/internfile/mh_mbox.h | 2 ++ src/sampleconf/recoll.conf.in | 7 +++- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp index ace0a497..fb663ee1 100644 --- a/src/internfile/mh_mbox.cpp +++ b/src/internfile/mh_mbox.cpp @@ -225,6 +225,7 @@ private: const size_t MboxCache::o_b1size = 1024; static class MboxCache mcache; +static const string keyquirks("mhmboxquirks"); MimeHandlerMbox::~MimeHandlerMbox() { @@ -265,6 +266,18 @@ bool MimeHandlerMbox::set_document_file(const string &fn) fseek((FILE*)m_vfp, 0, SEEK_SET); m_havedoc = true; m_offsets.clear(); + m_quirks = 0; + + // Check for location-based quirks: + RclConfig *config = RclConfig::getMainConfig(); + string quirks; + if (config && config->getConfParam(keyquirks, quirks)) { + if (quirks == "tbird") { + LOGDEB(("MimeHandlerMbox: setting quirks TBIRD\n")); + m_quirks |= MBOXQUIRK_TBIRD; + } + } + return true; } @@ -325,10 +338,6 @@ static inline void stripendnl(line_type& line, int& ll) // emacs-vm, Recoll is not alone // Update: 2009-11-27: word after From may be quoted string: From "john bull" static const char *frompat = -#if 0 //1.9.0 - "^From .* [1-2][0-9][0-9][0-9]$"; -#endif -#if 1 "^From[ ]+([^ ]+|\"[^\"]+\")[ ]+" // 'From (toto@tutu|"john bull") ' "[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26 "[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional @@ -340,8 +349,7 @@ static const char *frompat = "[12][0-9][0-9][0-9][ ]+" // Year "[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" 
// Time, secs optional ; -#endif - // "([ ]+[-+][0-9]{4})?$" + static regex_t fromregex; static bool regcompiled; @@ -425,15 +433,21 @@ bool MimeHandlerMbox::next_document() hademptyline = true; continue; } - if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { - LOGDEB0(("MimeHandlerMbox: msgnum %d, From_ at line %d: [%s]\n", - m_msgnum, m_lineno, line)); - start = ftello(fp); - m_offsets.push_back(off_From); - m_msgnum++; - break; + // Non empty line. If the previous one was empty, check regex + if (hademptyline) { + // Tbird sometimes omits the empty line, so avoid resetting + // state (initially true) and hope for the best + if (!(m_quirks & MBOXQUIRK_TBIRD)) + hademptyline = false; + if (!regexec(&fromregex, line, 0, 0, 0)) { + LOGDEB0(("MimeHandlerMbox: msgnum %d, " + "From_ at line %d: [%s]\n", m_msgnum, m_lineno, line)); + start = ftello(fp); + m_offsets.push_back(off_From); + m_msgnum++; + break; + } } - hademptyline = false; } // Look for next 'From ' line or eof, end of message. 
@@ -449,22 +463,25 @@ bool MimeHandlerMbox::next_document() stripendnl(line, ll); LOGDEB2(("End: hadempty %d ll %d Line: [%s]\n", hademptyline, ll, line)); - if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { - // Rewind to start of "From " line - fseek(fp, end, SEEK_SET); - m_lineno--; - break; + if (hademptyline) { + if (ll > 0) { + if (!(m_quirks & MBOXQUIRK_TBIRD)) + hademptyline = false; + if (!regexec(&fromregex, line, 0, 0, 0)) { + // Rewind to start of "From " line + fseek(fp, end, SEEK_SET); + m_lineno--; + break; + } + } + } else if (ll <= 0) { + hademptyline = true; } if (mtarg <= 0 || m_msgnum == mtarg) { line[ll] = '\n'; line[ll+1] = 0; msgtxt += line; } - if (ll <= 0) { - hademptyline = true; - } else { - hademptyline = false; - } } } while (mtarg > 0 && m_msgnum < mtarg); diff --git a/src/internfile/mh_mbox.h b/src/internfile/mh_mbox.h index 8c29e6b3..71c3b22a 100644 --- a/src/internfile/mh_mbox.h +++ b/src/internfile/mh_mbox.h @@ -52,6 +52,8 @@ class MimeHandlerMbox : public RecollFilter { int m_lineno; // debug mbhoff_type m_fsize; vector m_offsets; + enum Quirks {MBOXQUIRK_TBIRD=1}; + int m_quirks; }; #endif /* _MBOX_H_INCLUDED_ */ diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index 73bf1fcd..65e3965b 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -17,7 +17,8 @@ topdirs = ~ # ".*" in there (as was the case with an older sample config) # These are simple names, not paths (must contain no / ) skippedNames = #* bin CVS Cache cache* caughtspam tmp .thumbnails .svn \ - *~ recollrc .beagle .git .hg .bzr loop.ps + *~ .beagle .git .hg .bzr loop.ps .xsession-errors \ + .recoll* xapiandb recollrc recoll.conf # Wildcard expressions for paths we shouldn't go into. 
The database and # configuration directories will be added in there, else the default value @@ -170,3 +171,7 @@ webcachemaxmbs = 40 # used for application selection inside mimeview #[/some/app/directory] #localfields = rclaptg = someapp; otherfield = somevalue + +# Enable Thunderbird mbox format quirks where appropriate +[~/.thunderbird] +mhmboxquirks = tbird