recoll/src/bincimapmime/mime-parsefull.cc

/* -*- Mode: c++; -*- */
/*  --------------------------------------------------------------------
 *  Filename:
 *    mime-parsefull.cc
 *
 *  Description:
 *    Implementation of main mime parser components
 *  --------------------------------------------------------------------
 *  Copyright 2002-2004 Andreas Aardal Hanssen
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
 *  --------------------------------------------------------------------
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "mime.h"
#include "mime-utils.h"
#include "mime-inputsource.h"
#include "convert.h"
#include <string>
#include <vector>
#include <map>
#include <exception>
#include <iostream>

#include <string.h>
#include <ctype.h>
#include <stdio.h>
#include <errno.h>

// Input source used by the parsing routines in this file. It is null
// until one of the MimeDocument::parseFull() overloads below creates
// it; those overloads also replace or reset it before each parse.
Binc::MimeInputSource *mimeSource = 0;

#ifndef NO_NAMESPACES
using namespace ::std;
#endif /* NO_NAMESPACES */

//------------------------------------------------------------------------
// Parse the complete document read from file descriptor fd and record
// its total size. Does nothing when the document was already parsed.
void Binc::MimeDocument::parseFull(int fd) const
{
  if (allIsParsed)
    return;
  allIsParsed = true;

  // Reuse the current input source when it already reads from fd;
  // otherwise replace it with a fresh one bound to fd.
  if (mimeSource && mimeSource->getFileDescriptor() == fd) {
    mimeSource->reset();
  } else {
    delete mimeSource;
    mimeSource = new MimeInputSource(fd);
  }

  // Start from a clean state.
  multipart = false;
  messagerfc822 = false;
  size = 0;
  bodylength = 0;
  bodystartoffsetcrlf = 0;
  headerlength = 0;
  headerstartoffsetcrlf = 0;

  // The top-level part is not enclosed by any boundary.
  string noBoundary;
  int boundaryBytes = 0;
  MimePart::parseFull(noBoundary, boundaryBytes);

  // Consume any trailing junk so that the offset equals the full size.
  char discarded;
  while (mimeSource->getChar(&discarded)) {
  }

  size = mimeSource->getOffset();
}

// Parse the complete document read from stream s and record its total
// size. Does nothing when the document was already parsed.
void Binc::MimeDocument::parseFull(istream& s) const
{
  if (allIsParsed)
    return;
  allIsParsed = true;

  // A stream-backed source is always created anew.
  delete mimeSource;
  mimeSource = new MimeInputSourceStream(s);

  // Start from a clean state.
  multipart = false;
  messagerfc822 = false;
  size = 0;
  bodylength = 0;
  bodystartoffsetcrlf = 0;
  headerlength = 0;
  headerstartoffsetcrlf = 0;

  // The top-level part is not enclosed by any boundary.
  string noBoundary;
  int boundaryBytes = 0;
  MimePart::parseFull(noBoundary, boundaryBytes);

  // Consume any trailing junk so that the offset equals the full size.
  char discarded;
  while (mimeSource->getChar(&discarded)) {
  }

  size = mimeSource->getOffset();
}

//------------------------------------------------------------------------
// Reads one header field (name, ':' and value, including continuation
// lines that start with a space or tab) from mimeSource and stores it
// in *header.
//
// *nlines is incremented for every '\n' consumed while reading the
// value, and decremented again if such a character is pushed back.
//
// Returns true when another header line is expected to follow, and
// false when the header section is over: a '\r' was met before any
// ':', a blank line was reached, or the input ended.
static bool parseOneHeaderLine(Binc::Header *header, unsigned int *nlines)
{
  using namespace ::Binc;
  char c;
  bool eof = false;
  // Sliding window over the last four characters read while scanning
  // the value; cqueue[3] is the newest one.
  char cqueue[4];
  string name;
  string content;

  // Phase 1: collect the header name up to the first ':'. If the
  // input ends here the loop just exits and the value loop below is
  // entered with whatever name was collected.
  while (mimeSource->getChar(&c)) {
    // If we encounter a \r before we got to the first ':', then
    // rewind back to the start of the line and assume we're at the
    // start of the body.
    if (c == '\r') {
      // Push back the characters of the name plus the '\r' itself.
      for (int i = 0; i < (int) name.length() + 1; ++i)
	mimeSource->ungetChar();
      return false;
    }

    // A colon marks the end of the header name
    if (c == ':') break;

    // Otherwise add to the header name
    name += c;
  }

  cqueue[0] = '\0';
  cqueue[1] = '\0';
  cqueue[2] = '\0';
  cqueue[3] = '\0';

  // Phase 2: collect the header value.
  // Read until the end of the header.
  bool endOfHeaders = false;
  while (!endOfHeaders) {
    if (!mimeSource->getChar(&c)) {
      eof = true;
      break;
    }

    if (c == '\n') ++*nlines;

    // Shift the window left by one and append the new character.
    for (int i = 0; i < 3; ++i)
      cqueue[i] = cqueue[i + 1];
    cqueue[3] = c;

    // Two consecutive CRLF sequences terminate the header section.
    if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
      endOfHeaders = true;
      break;
    }

    // If the last character was a newline, and the first now is not
    // whitespace, then rewind one character and store the current
    // key,value pair.
    if (cqueue[2] == '\n' && c != ' ' && c != '\t') {
      // Drop the last two characters of the value, i.e. the line
      // terminator read just before c (expected to be CRLF).
      if (content.length() > 2)
	content.resize(content.length() - 2);

      trim(content);
      header->add(name, content);

      // Anything but '\r' is the first character of the next header
      // line: give it back and tell the caller to continue.
      if (c != '\r') {
	mimeSource->ungetChar();
	if (c == '\n') --*nlines;
	return true;
      }

      // c is '\r', taken as the start of the blank line that ends
      // the headers: consume one more character (presumably the
      // matching '\n'; the result is not checked) and stop.
      mimeSource->getChar(&c);
      return false;
    }

    content += c;
  }

  // Reached on end of input or on the "\r\n\r\n" match: store what was
  // collected so far. Unlike above, the value is not trimmed here.
  if (name != "") {
    if (content.length() > 2)
      content.resize(content.length() - 2);
    header->add(name, content);
  }

  return !(eof || endOfHeaders);
}

//------------------------------------------------------------------------
// Reads header lines into *header, one at a time, until
// parseOneHeaderLine() reports that no further line follows.
// *nlines is passed through to parseOneHeaderLine().
static void parseHeader(Binc::Header *header, unsigned int *nlines)
{
  bool moreLines;
  do {
    moreLines = parseOneHeaderLine(header, nlines);
  } while (moreLines);
}

//------------------------------------------------------------------------
// Inspects the first "content-type" header of *header, if any, to
// classify the part:
//  - *multipart is set and *subtype receives the lower-cased subtype
//    when the main type is "multipart";
//  - *messagerfc822 is set when the type is "message/rfc822";
//  - *boundary receives the value of a "boundary" parameter, with
//    surrounding spaces and double quotes removed.
// Outputs are left untouched when the corresponding item is absent.
static void analyzeHeader(Binc::Header *header, bool *multipart,
                          bool *messagerfc822, string *subtype,
                          string *boundary)
{
  using namespace ::Binc;

  HeaderItem ctype;
  if (!header->getFirstHeader("content-type", ctype))
    return;

  // Break the header value into its ';'-separated fields.
  vector<string> fields;
  split(ctype.getValue(), ";", fields);

  if (!fields.empty()) {
    // The first field is expected to be "type/subtype".
    string mediaType = fields[0];
    trim(mediaType);
    vector<string> halves;
    split(mediaType, "/", halves);

    // Missing pieces default to text/plain.
    string major = halves.empty() ? "text" : halves[0];
    string minor = (halves.size() > 1) ? halves[1] : "plain";
    lowercase(major);

    if (major == "multipart") {
      *multipart = true;
      lowercase(minor);
      *subtype = minor;
    } else if (major == "message") {
      lowercase(minor);
      if (minor == "rfc822")
        *messagerfc822 = true;
    }
  }

  // Look through every field for a "boundary=..." parameter.
  for (vector<string>::size_type n = 0; n < fields.size(); ++n) {
    string param = fields[n];
    trim(param);

    const string::size_type eq = param.find('=');
    if (eq == string::npos)
      continue;

    string pname = param.substr(0, eq);
    string pvalue = param.substr(eq + 1);

    lowercase(pname);
    trim(pname);

    if (pname == "boundary") {
      trim(pvalue, " \"");
      *boundary = pvalue;
    }
  }
}

// Parses the body of a message/rfc822 part, i.e. a complete enclosed
// MIME document, by running the parser recursively with the boundary
// of the enclosing multipart (toboundary).
//
// The enclosed document is appended to *members, its line count is
// added to *nbodylines, *bodylength receives the number of bytes
// consumed minus the size of the terminating boundary, and
// *foundendofpart is set when the recursive parse returns non-zero.
static void parseMessageRFC822(vector<Binc::MimePart> *members,
                               bool *foundendofpart,
                               unsigned int *bodylength,
                               unsigned int *nbodylines,
                               const string &toboundary)
{
  using namespace ::Binc;

  MimePart embedded;

  const unsigned int startOffset = mimeSource->getOffset();

  // parseFull reports, through its second argument, how many bytes of
  // terminating boundary have to be excluded from the body.
  int boundaryBytes = 0;
  const bool sawEnd = embedded.parseFull(toboundary, boundaryBytes) != 0;
  if (sawEnd)
    *foundendofpart = true;

  // Compute the body length, clamping to zero instead of wrapping.
  const unsigned int endOffset = mimeSource->getOffset();
  unsigned int length = 0;
  if (endOffset >= startOffset) {
    length = endOffset - startOffset;
    const unsigned int strip = static_cast<unsigned int>(boundaryBytes);
    length = (length >= strip) ? length - strip : 0;
  }
  *bodylength = length;

  *nbodylines += embedded.getNofLines();

  members->push_back(embedded);
}

// Consumes input from mimeSource up to and including the first
// occurrence of delimiter. Everything skipped is ignored (it is
// usually text meant for non-MIME clients). With an empty delimiter
// the input is simply read to its end.
//
// Each '\n' read is counted in *nlines, and *eof is set when the input
// ends first. Returns true only if the delimiter was found.
static bool skipUntilBoundary(const string &delimiter,
                              unsigned int *nlines, bool *eof)
{
  const int window = delimiter.length();
  const char *wanted = delimiter.c_str();

  // Circular buffer holding the most recent `window` characters; it
  // is only allocated when there is something to search for.
  char *ring = 0;
  if (delimiter != "") {
    ring = new char[window];
    memset(ring, 0, window);
  }

  int fed = 0;
  bool matched = false;
  char ch;
  while (!matched) {
    if (!mimeSource->getChar(&ch)) {
      *eof = true;
      break;
    }

    if (ch == '\n')
      ++*nlines;

    // Nothing to match against: keep reading until the input ends.
    if (!ring)
      continue;

    ring[fed++ % window] = ch;
    matched = compareStringToQueue(wanted, ring, fed, window);
  }

  delete [] ring;
  ring = 0;

  return matched;
}


static void parseMultipart(const string &boundary,
			   const string &toboundary,
			   bool *eof,
			   unsigned int *nlines,
			   int *boundarysize,
			   bool *foundendofpart,
			   unsigned int *bodylength,
			   vector<Binc::MimePart> *members)
{
  using namespace ::Binc;
  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();

  // multipart parsing starts with skipping to the first
  // boundary. then we call parse() for all parts. the last parse()
  // command will return a code indicating that it found the last
  // boundary of this multipart. Note that the first boundary does
  // not have to start with CRLF.
  string delimiter = "--" + boundary;

  skipUntilBoundary(delimiter, nlines, eof);

  if (!eof)
    *boundarysize = delimiter.size();

  // Read two more characters. This may be CRLF, it may be "--" and
  // it may be any other two characters.
  char a;
  if (!mimeSource->getChar(&a))
    *eof = true;

  if (a == '\n')
    ++*nlines;

  char b;
  if (!mimeSource->getChar(&b))
    *eof = true;

  if (b == '\n')
    ++*nlines;

  // If we find two dashes after the boundary, then this is the end
  // of boundary marker.
  if (!*eof) {
    if (a == '-' && b == '-') {
      *foundendofpart = true;
      *boundarysize += 2;

      if (!mimeSource->getChar(&a))
	*eof = true;

      if (a == '\n')
	++*nlines;

      if (!mimeSource->getChar(&b))
	*eof = true;

      if (b == '\n')
	++*nlines;
    }

    if (a == '\r' && b == '\n') {
      // This exception is to handle a special case where the
      // delimiter of one part is not followed by CRLF, but
      // immediately followed by a CRLF prefixed delimiter.
      if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
	*eof = true;
      else if (a == '-' && b == '-') {
	mimeSource->ungetChar();
	mimeSource->ungetChar();
	mimeSource->ungetChar();
	mimeSource->ungetChar();
      } else {
	mimeSource->ungetChar();
	mimeSource->ungetChar();
      }

      *boundarysize += 2;
    } else {
      mimeSource->ungetChar();
      mimeSource->ungetChar();
    }
  }

  // read all mime parts.
  if (!*foundendofpart && !*eof) {
    bool quit = false;
    do {
      MimePart m;

      // If parseFull returns != 0, then it encountered the multipart's
      // final boundary.
      int bsize = 0;
      if (m.parseFull(boundary, bsize)) {
	quit = true;
	*boundarysize = bsize;
      }

      members->push_back(m);

    } while (!quit);
  }

  if (!*foundendofpart && !*eof) {
    // multipart parsing starts with skipping to the first
    // boundary. then we call parse() for all parts. the last parse()
    // command will return a code indicating that it found the last
    // boundary of this multipart. Note that the first boundary does
    // not have to start with CRLF.
    string delimiter = "\r\n--" + toboundary;

    skipUntilBoundary(delimiter, nlines, eof);

    if (!*eof)
      *boundarysize = delimiter.size();

    // Read two more characters. This may be CRLF, it may be "--" and
    // it may be any other two characters.
    char a = '\0';
    if (!mimeSource->getChar(&a))
      *eof = true;

    if (a == '\n')
      ++*nlines;

    char b = '\0';
    if (!mimeSource->getChar(&b))
      *eof = true;

    if (b == '\n')
      ++*nlines;

    // If we find two dashes after the boundary, then this is the end
    // of boundary marker.
    if (!*eof) {
      if (a == '-' && b == '-') {
	*foundendofpart = true;
	*boundarysize += 2;

	if (!mimeSource->getChar(&a))
	  *eof = true;

	if (a == '\n')
	  ++*nlines;

	if (!mimeSource->getChar(&b))
	  *eof = true;

	if (b == '\n')
	  ++*nlines;
      }

      if (a == '\r' && b == '\n') {
	// This exception is to handle a special case where the
	// delimiter of one part is not followed by CRLF, but
	// immediately followed by a CRLF prefixed delimiter.
	if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
	  *eof = true;
	else if (a == '-' && b == '-') {
	  mimeSource->ungetChar();
	  mimeSource->ungetChar();
	  mimeSource->ungetChar();
	  mimeSource->ungetChar();
	} else {
	  mimeSource->ungetChar();
	  mimeSource->ungetChar();
	}

	*boundarysize += 2;
      } else {
	mimeSource->ungetChar();
	mimeSource->ungetChar();
      }
    }
  }

  // make sure bodylength doesn't overflow
  *bodylength = mimeSource->getOffset();
  if (*bodylength >= bodystartoffsetcrlf) {
    *bodylength -= bodystartoffsetcrlf;
    if (*bodylength >= (unsigned int) *boundarysize) {
      *bodylength -= (unsigned int) *boundarysize;
    } else {
      *bodylength = 0;
    }
  } else {
    *bodylength = 0;
  }
}

// Parses the body of a part that is neither multipart nor
// message/rfc822.
//
//  toboundary      boundary of the enclosing multipart; when empty the
//                  body extends to the end of the input
//  boundarysize    receives the number of bytes of terminating
//                  boundary to exclude from the body length (0 when no
//                  boundary was found)
//  nbodylines      incremented for each line feed in the body
//  nlines          incremented for each line feed consumed here
//  eof             set when the input ends while examining what
//                  follows the boundary
//  foundendofpart  set when the boundary is a closing one ("--" suffix)
//  bodylength      receives the number of bytes consumed, minus
//                  *boundarysize (clamped at zero)
static void parseSinglePart(const string &toboundary,
                            int *boundarysize,
                            unsigned int *nbodylines,
                            unsigned int *nlines,
                            bool *eof, bool *foundendofpart,
                            unsigned int *bodylength)
{
  using namespace ::Binc;
  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();

  // If toboundary is empty, then we read until the end of the
  // file. Otherwise we will read until we encounter toboundary.
  string _toboundary;
  if (toboundary != "") {
    _toboundary = "\r\n--";
    _toboundary += toboundary;
  }

  // Circular buffer holding the most recent characters read; only
  // needed when there is a boundary to look for.
  char *boundaryqueue = 0;
  int endpos = _toboundary.length();
  if (toboundary != "") {
    boundaryqueue = new char[endpos];
    memset(boundaryqueue, 0, endpos);
  }
  int boundarypos = 0;

  *boundarysize = 0;

  const char *_toboundaryStr = _toboundary.c_str();
  bool toboundaryIsEmpty = (toboundary == "");
  char c;
  while (mimeSource->getChar(&c)) {
    if (c == '\n') { ++*nbodylines; ++*nlines; }

    if (toboundaryIsEmpty)
      continue;

    // find boundary
    boundaryqueue[boundarypos++ % endpos] = c;

    if (compareStringToQueue(_toboundaryStr, boundaryqueue,
                             boundarypos, endpos)) {
      *boundarysize = _toboundary.length();
      break;
    }
  }

  delete [] boundaryqueue;

  if (toboundary != "") {
    // Read two more characters. They may be CRLF, "--" or anything
    // else. Both are initialised so that a failed read never leaves
    // an indeterminate value to be compared below.
    char a = '\0';
    if (!mimeSource->getChar(&a))
      *eof = true;

    if (a == '\n')
      ++*nlines;

    char b = '\0';
    if (!mimeSource->getChar(&b))
      *eof = true;

    if (b == '\n')
      ++*nlines;

    // When the input has ended there is nothing to examine, and
    // nothing that was read here to push back.
    if (!*eof) {
      // Two dashes after the boundary mark the closing boundary.
      if (a == '-' && b == '-') {
        *boundarysize += 2;
        *foundendofpart = true;
        if (!mimeSource->getChar(&a))
          *eof = true;

        if (a == '\n')
          ++*nlines;

        if (!mimeSource->getChar(&b))
          *eof = true;

        if (b == '\n')
          ++*nlines;
      }

      if (a == '\r' && b == '\n') {
        // This exception is to handle a special case where the
        // delimiter of one part is not followed by CRLF, but
        // immediately followed by a CRLF prefixed delimiter.
        if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) {
          *eof = true;
        } else if (a == '-' && b == '-') {
          // Give back the two dashes and the CRLF before them.
          mimeSource->ungetChar();
          mimeSource->ungetChar();
          mimeSource->ungetChar();
          mimeSource->ungetChar();
        } else {
          mimeSource->ungetChar();
          mimeSource->ungetChar();
        }

        *boundarysize += 2;
      } else {
        mimeSource->ungetChar();
        mimeSource->ungetChar();
      }
    }
  }

  // make sure bodylength doesn't overflow
  *bodylength = mimeSource->getOffset();
  if (*bodylength >= bodystartoffsetcrlf) {
    *bodylength -= bodystartoffsetcrlf;
    if (*bodylength >= (unsigned int) *boundarysize) {
      *bodylength -= (unsigned int) *boundarysize;
    } else {
      *bodylength = 0;
    }
  } else {
    *bodylength = 0;
  }

}

//------------------------------------------------------------------------
// Parses one MIME part (header and body) from mimeSource.
//
// toboundary is the boundary of the enclosing multipart, or empty for
// the top-level part. boundarysize is passed on to the body parser
// for multipart and single parts, which report through it the size of
// the terminating boundary.
//
// Returns 1 when the input ended or a closing boundary was found,
// 0 otherwise.
int Binc::MimePart::parseFull(const string &toboundary,
                              int &boundarysize) const
{
  // Remember where this part begins, then consume its header.
  headerstartoffsetcrlf = mimeSource->getOffset();
  parseHeader(&h, &nlines);

  // Headerlength includes the separating CRLF. Body starts after the
  // CRLF.
  headerlength = mimeSource->getOffset() - headerstartoffsetcrlf;
  bodystartoffsetcrlf = mimeSource->getOffset();
  bodylength = 0;

  // Classify the part from its header fields.
  analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary);

  bool hitEof = false;
  bool partEnded = false;

  // message/rfc822 takes precedence over multipart; anything else is
  // handled as a plain single part.
  if (!messagerfc822 && !multipart) {
    parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines,
                    &hitEof, &partEnded, &bodylength);
  } else if (messagerfc822) {
    parseMessageRFC822(&members, &partEnded, &bodylength,
                       &nbodylines, toboundary);
  } else {
    parseMultipart(boundary, toboundary, &hitEof, &nlines, &boundarysize,
                   &partEnded, &bodylength,
                   &members);
  }

  if (hitEof || partEnded)
    return 1;
  return 0;
}