add quirks flag to handle bad thunderbird mbox formats

This commit is contained in:
Jean-Francois Dockes 2010-11-22 16:25:25 +01:00
parent 6c03417195
commit f10e14658f
3 changed files with 49 additions and 25 deletions

View File

@ -225,6 +225,7 @@ private:
const size_t MboxCache::o_b1size = 1024;
static class MboxCache mcache;
static const string keyquirks("mhmboxquirks");
MimeHandlerMbox::~MimeHandlerMbox()
{
@ -265,6 +266,18 @@ bool MimeHandlerMbox::set_document_file(const string &fn)
fseek((FILE*)m_vfp, 0, SEEK_SET);
m_havedoc = true;
m_offsets.clear();
m_quirks = 0;
// Check for location-based quirks:
RclConfig *config = RclConfig::getMainConfig();
string quirks;
if (config && config->getConfParam(keyquirks, quirks)) {
if (quirks == "tbird") {
LOGDEB(("MimeHandlerMbox: setting quirks TBIRD\n"));
m_quirks |= MBOXQUIRK_TBIRD;
}
}
return true;
}
@ -325,10 +338,6 @@ static inline void stripendnl(line_type& line, int& ll)
// emacs-vm, Recoll is not alone
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
static const char *frompat =
#if 0 //1.9.0
"^From .* [1-2][0-9][0-9][0-9]$";
#endif
#if 1
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+" // 'From (toto@tutu|"john bull") '
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional
@ -340,8 +349,7 @@ static const char *frompat =
"[12][0-9][0-9][0-9][ ]+" // Year
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" // Time, secs optional
;
#endif
// "([ ]+[-+][0-9]{4})?$"
static regex_t fromregex;
static bool regcompiled;
@ -425,15 +433,21 @@ bool MimeHandlerMbox::next_document()
hademptyline = true;
continue;
}
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
LOGDEB0(("MimeHandlerMbox: msgnum %d, From_ at line %d: [%s]\n",
m_msgnum, m_lineno, line));
start = ftello(fp);
m_offsets.push_back(off_From);
m_msgnum++;
break;
// Non empty line. If the previous one was empty, check regex
if (hademptyline) {
// Tbird sometimes omits the empty line, so avoid resetting
// state (initially true) and hope for the best
if (!(m_quirks & MBOXQUIRK_TBIRD))
hademptyline = false;
if (!regexec(&fromregex, line, 0, 0, 0)) {
LOGDEB0(("MimeHandlerMbox: msgnum %d, "
"From_ at line %d: [%s]\n", m_msgnum, m_lineno, line));
start = ftello(fp);
m_offsets.push_back(off_From);
m_msgnum++;
break;
}
}
hademptyline = false;
}
// Look for next 'From ' line or eof, end of message.
@ -449,22 +463,25 @@ bool MimeHandlerMbox::next_document()
stripendnl(line, ll);
LOGDEB2(("End: hadempty %d ll %d Line: [%s]\n",
hademptyline, ll, line));
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
// Rewind to start of "From " line
fseek(fp, end, SEEK_SET);
m_lineno--;
break;
if (hademptyline) {
if (ll > 0) {
if (!(m_quirks & MBOXQUIRK_TBIRD))
hademptyline = false;
if (!regexec(&fromregex, line, 0, 0, 0)) {
// Rewind to start of "From " line
fseek(fp, end, SEEK_SET);
m_lineno--;
break;
}
}
} else if (ll <= 0) {
hademptyline = true;
}
if (mtarg <= 0 || m_msgnum == mtarg) {
line[ll] = '\n';
line[ll+1] = 0;
msgtxt += line;
}
if (ll <= 0) {
hademptyline = true;
} else {
hademptyline = false;
}
}
} while (mtarg > 0 && m_msgnum < mtarg);

View File

@ -52,6 +52,8 @@ class MimeHandlerMbox : public RecollFilter {
int m_lineno; // debug
mbhoff_type m_fsize;
vector<mbhoff_type> m_offsets;
enum Quirks {MBOXQUIRK_TBIRD=1};
int m_quirks;
};
#endif /* _MBOX_H_INCLUDED_ */

View File

@ -17,7 +17,8 @@ topdirs = ~
# ".*" in there (as was the case with an older sample config)
# These are simple names, not paths (must contain no / )
skippedNames = #* bin CVS Cache cache* caughtspam tmp .thumbnails .svn \
*~ recollrc .beagle .git .hg .bzr loop.ps
*~ .beagle .git .hg .bzr loop.ps .xsession-errors \
.recoll* xapiandb recollrc recoll.conf
# Wildcard expressions for paths we shouldn't go into. The database and
# configuration directories will be added in there, else the default value
@ -170,3 +171,7 @@ webcachemaxmbs = 40
# used for application selection inside mimeview
#[/some/app/directory]
#localfields = rclaptg = someapp; otherfield = somevalue
# Enable thunderbird mbox format quirks where appropriate
[~/.thunderbird]
mhmboxquirks = tbird