mail ckpt
This commit is contained in:
parent
63a29c7ced
commit
d392d317bb
3
src/bincimapmime/00README.recoll
Normal file
3
src/bincimapmime/00README.recoll
Normal file
@ -0,0 +1,3 @@
|
||||
Most of the code in this directory was taken from the Binc IMAP project
|
||||
(http://www.bincimap.org/), version 1.3.3
|
||||
|
||||
@ -72,3 +72,27 @@ void Binc::MimePart::printBody(int fd, IODevice &output,
|
||||
output << (char)c;
|
||||
}
|
||||
}
|
||||
|
||||
void Binc::MimePart::getBody(int fd, string &s,
|
||||
unsigned int startoffset,
|
||||
unsigned int length) const
|
||||
{
|
||||
if (!mimeSource || mimeSource->getFileDescriptor() != fd) {
|
||||
delete mimeSource;
|
||||
mimeSource = new MimeInputSource(fd);
|
||||
}
|
||||
|
||||
mimeSource->reset();
|
||||
mimeSource->seek(bodystartoffsetcrlf + startoffset);
|
||||
|
||||
if (startoffset + length > bodylength)
|
||||
length = bodylength - startoffset;
|
||||
|
||||
char c = '\0';
|
||||
for (unsigned int i = 0; i < length; ++i) {
|
||||
if (!mimeSource->getChar(&c))
|
||||
break;
|
||||
|
||||
s += (char)c;
|
||||
}
|
||||
}
|
||||
|
||||
@ -107,6 +107,7 @@ namespace Binc {
|
||||
inline unsigned int getBodyStartOffset(void) const { return bodystartoffsetcrlf; }
|
||||
|
||||
void printBody(int fd, Binc::IODevice &output, unsigned int startoffset, unsigned int length) const;
|
||||
void getBody(int fd, std::string& s, unsigned int startoffset, unsigned int length) const;
|
||||
void printHeader(int fd, Binc::IODevice &output, std::vector<std::string> headers, bool includeheaders, unsigned int startoffset, unsigned int length, std::string &storage) const;
|
||||
void printDoc(int fd, Binc::IODevice &output, unsigned int startoffset, unsigned int length) const;
|
||||
virtual void clear(void) const;
|
||||
|
||||
87
src/bincimapmime/trbinc.cc
Normal file
87
src/bincimapmime/trbinc.cc
Normal file
@ -0,0 +1,87 @@
|
||||
#ifndef lint
|
||||
static char rcsid [] = "@(#$Id: trbinc.cc,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 1994 CDKIT";
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "mime.h"
|
||||
|
||||
static char *thisprog;
|
||||
|
||||
static char usage [] =
|
||||
"trbinc <mboxfile> \n\n"
|
||||
;
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static int op_flags;
|
||||
#define OPT_MOINS 0x1
|
||||
#define OPT_s 0x2
|
||||
#define OPT_b 0x4
|
||||
|
||||
#define DEFCOUNT 10
|
||||
|
||||
// Names of the message headers of interest.
const char *hnames[] = {
    "Subject",
    "Content-type"
};
// Number of entries in hnames.
int nh = sizeof(hnames) / sizeof(hnames[0]);
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int count = DEFCOUNT;
|
||||
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
while (argc > 0 && **argv == '-') {
|
||||
(*argv)++;
|
||||
if (!(**argv))
|
||||
/* Cas du "adb - core" */
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'b': op_flags |= OPT_b; if (argc < 2) Usage();
|
||||
if ((sscanf(*(++argv), "%d", &count)) != 1)
|
||||
Usage();
|
||||
argc--;
|
||||
goto b1;
|
||||
default: Usage(); break;
|
||||
}
|
||||
b1: argc--; argv++;
|
||||
}
|
||||
|
||||
if (argc != 1)
|
||||
Usage();
|
||||
|
||||
char *mfile = *argv++;argc--;
|
||||
int fd;
|
||||
if ((fd = open(mfile, 0)) < 0) {
|
||||
perror("Opening");
|
||||
exit(1);
|
||||
}
|
||||
Binc::MimeDocument doc;
|
||||
doc.parseFull(fd);
|
||||
|
||||
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
|
||||
fprintf(stderr, "Parse error\n");
|
||||
exit(1);
|
||||
}
|
||||
close(fd);
|
||||
Binc::HeaderItem hi;
|
||||
for (int i = 0; i < nh ; i++) {
|
||||
if (!doc.h.getFirstHeader(hnames[i], hi)) {
|
||||
fprintf(stderr, "No %s\n", hnames[i]);
|
||||
exit(1);
|
||||
}
|
||||
printf("%s: %s\n", hnames[i], hi.getValue().c_str());
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
@ -1,7 +1,6 @@
|
||||
include ../mk/sysconf
|
||||
BIGLIB = ../lib/librcl.a
|
||||
|
||||
|
||||
PROGS = unacpp textsplit
|
||||
all: $(PROGS)
|
||||
|
||||
|
||||
@ -1,11 +1,12 @@
|
||||
include ../mk/sysconf
|
||||
|
||||
BIGLIB = ../lib/librcl.a
|
||||
MIMELIB = ../bincimapmime/libmime.a
|
||||
|
||||
PROGS = recollindex csguess
|
||||
all: $(PROGS)
|
||||
|
||||
RECOLLINDEX_OBJS= recollindex.o $(BIGLIB)
|
||||
RECOLLINDEX_OBJS= recollindex.o $(BIGLIB) $(MIMELIB)
|
||||
recollindex : $(RECOLLINDEX_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o recollindex $(RECOLLINDEX_OBJS) \
|
||||
$(LIBXAPIAN) $(LIBICONV) $(LIBSYS)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.8 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -146,18 +146,25 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
Rcl::Doc doc;
|
||||
if (!internfile(fn, config, doc, tmpdir))
|
||||
return FsTreeWalker::FtwOk;
|
||||
FileInterner interner(fn, config, tmpdir);
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
while (fis == FileInterner::FIAgain) {
|
||||
Rcl::Doc doc;
|
||||
string ipath;
|
||||
fis = interner.internfile(doc, ipath);
|
||||
if (fis == FileInterner::FIError)
|
||||
break;
|
||||
|
||||
// Set up common fields:
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||
doc.mtime = ascdate;
|
||||
// Set up common fields:
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||
doc.mtime = ascdate;
|
||||
doc.ipath = ipath;
|
||||
|
||||
// Do database-specific work to update document data
|
||||
if (!db.add(fn, doc))
|
||||
return FsTreeWalker::FtwError;
|
||||
// Do database-specific work to update document data
|
||||
if (!db.add(fn, doc))
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
@ -1,14 +1,47 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.5 2005-02-09 13:34:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include <string>
|
||||
using std::string;
|
||||
#include <list>
|
||||
using std::list;
|
||||
|
||||
#include "mimetype.h"
|
||||
#include "debuglog.h"
|
||||
#include "execmd.h"
|
||||
#include "conftree.h"
|
||||
|
||||
// Guess the MIME type of a file from its contents by running "file -i <fn>".
//
// The command output is expected to look like
//     <fn>: <type>/<subtype>[; charset=...]
// NOTE(review): output format assumed from file(1); confirm for the versions
// that must be supported.
//
// Returns the MIME type, or an empty string if the command fails or its
// output does not contain something that looks like a MIME type.
static string mimetypefromdata(const string &fn)
{
    list<string> args;
    args.push_back("-i");
    args.push_back(fn);

    ExecCmd ex;
    string result;
    string cmd = "file";
    int status = ex.doexec(cmd, args, 0, &result);
    if (status) {
        LOGERR(("mimetypefromdata: doexec: status 0x%x\n", status));
        return "";
    }
    // LOGDEB(("mimetypefromdata: %s [%s]\n", result.c_str(), fn.c_str()));

    // When the output starts with "<fn>:", skip that prefix before splitting
    // into words: a file name containing whitespace would otherwise shift
    // the tokens and a piece of the name would be returned as the type.
    list<string> res;
    string mime;
    if (result.length() > fn.length() &&
        result.compare(0, fn.length(), fn) == 0 &&
        result[fn.length()] == ':') {
        string rest = result.substr(fn.length() + 1);
        ConfTree::stringToStrings(rest, res);
        if (res.empty())
            return "";
        mime = res.front();
    } else {
        // Unexpected prefix: fall back to taking the second word.
        ConfTree::stringToStrings(result, res);
        if (res.size() <= 1)
            return "";
        list<string>::iterator it = res.begin();
        ++it;
        mime = *it;
    }

    // Drop the separator(s) glued to the end of the type ("text/plain;").
    // Only separators are removed, so that types ending with a digit or
    // other legal character ("application/x-bzip2") are left intact.
    string::size_type last = mime.find_last_not_of(";,");
    if (last == string::npos)
        return "";
    mime.erase(last + 1);

    // A MIME type always has the form type/subtype; anything else is an
    // error message or a plain-text description.
    if (mime.find('/') == string::npos)
        return "";

    return mime;
}
|
||||
|
||||
string mimetype(const string &fn, ConfTree *mtypes)
|
||||
{
|
||||
@ -38,8 +71,9 @@ string mimetype(const string &fn, ConfTree *mtypes)
|
||||
|
||||
// If the file name has a suffix and we find it in the map, we're done
|
||||
string::size_type dot = fn.find_last_of(".");
|
||||
string suff;
|
||||
if (dot != string::npos) {
|
||||
string suff = fn.substr(dot);
|
||||
suff = fn.substr(dot);
|
||||
for (unsigned int i = 0; i < suff.length(); i++)
|
||||
suff[i] = tolower(suff[i]);
|
||||
|
||||
@ -48,7 +82,9 @@ string mimetype(const string &fn, ConfTree *mtypes)
|
||||
return mtype;
|
||||
}
|
||||
|
||||
// Look at file data ? One day maybe
|
||||
// Look at file data ? Only when no suffix
|
||||
if (suff.empty())
|
||||
return mimetypefromdata(fn);
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.3 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.4 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
@ -70,30 +70,27 @@ static bool uncompressfile(RclConfig *conf, const string& ifn,
|
||||
return true;
|
||||
}
|
||||
|
||||
static void tmpcleanup(const string& tdir, const string& tfile)
|
||||
void FileInterner::tmpcleanup()
|
||||
{
|
||||
if (tdir.empty() || tfile.empty())
|
||||
return;
|
||||
if (unlink(tfile.c_str()) < 0) {
|
||||
LOGERR(("tmpcleanup: unlink(%s) errno %d\n", tfile.c_str(),
|
||||
errno));
|
||||
LOGERR(("FileInterner::tmpcleanup: unlink(%s) errno %d\n",
|
||||
tfile.c_str(), errno));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
bool internfile(const std::string &ifn, RclConfig *config, Rcl::Doc& doc,
|
||||
const string& tdir)
|
||||
// Handler==0 on return says we're in error
|
||||
FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
|
||||
const string& td)
|
||||
: fn(f), config(cnf), tdir(td), handler(0)
|
||||
{
|
||||
string fn = ifn;
|
||||
string tfile;
|
||||
MimeHandler *handler = 0;
|
||||
bool ret = false;
|
||||
|
||||
string mime = mimetype(fn, config->getMimeMap());
|
||||
mime = mimetype(fn, config->getMimeMap());
|
||||
if (mime.empty()) {
|
||||
// No mime type: not listed in our map.
|
||||
LOGDEB(("internfile: (no mime) [%s]\n", fn.c_str()));
|
||||
return false;
|
||||
LOGDEB(("FileInterner::FileInterner: (no mime) [%s]\n", fn.c_str()));
|
||||
return;
|
||||
}
|
||||
|
||||
// First check for a compressed file. If so, create a temporary
|
||||
@ -101,8 +98,9 @@ bool internfile(const std::string &ifn, RclConfig *config, Rcl::Doc& doc,
|
||||
// rest with the temp file.
|
||||
list<string>ucmd;
|
||||
if (getUncompressor(mime, config->getMimeConf(), ucmd)) {
|
||||
if (!uncompressfile(config, fn, ucmd, tdir, tfile))
|
||||
return false;
|
||||
if (!uncompressfile(config, fn, ucmd, tdir, tfile)) {
|
||||
return;
|
||||
}
|
||||
LOGDEB(("internfile: after ucomp: tdir %s, tfile %s\n",
|
||||
tdir.c_str(), tfile.c_str()));
|
||||
fn = tfile;
|
||||
@ -110,33 +108,43 @@ bool internfile(const std::string &ifn, RclConfig *config, Rcl::Doc& doc,
|
||||
if (mime.empty()) {
|
||||
// No mime type ?? pass on.
|
||||
LOGDEB(("internfile: (no mime) [%s]\n", fn.c_str()));
|
||||
goto out;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Look for appropriate handler
|
||||
handler = getMimeHandler(mime, config->getMimeConf());
|
||||
if (!handler) {
|
||||
// No handler for this type, for now :(
|
||||
LOGDEB(("internfile: %s : no handler\n", mime.c_str()));
|
||||
goto out;
|
||||
LOGDEB(("FileInterner::FileInterner: %s: no handler\n", mime.c_str()));
|
||||
return;
|
||||
}
|
||||
|
||||
LOGDEB(("internfile: %s [%s]\n", mime.c_str(), fn.c_str()));
|
||||
LOGDEB(("FileInterner::FileInterner: %s [%s]\n",mime.c_str(), fn.c_str()));
|
||||
}
|
||||
|
||||
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
|
||||
{
|
||||
if (!handler)
|
||||
return FIError;
|
||||
|
||||
// Turn file into a document. The document has fields for title, body
|
||||
// etc., all text converted to utf8
|
||||
if (!handler->worker(config, fn, mime, doc)) {
|
||||
goto out;
|
||||
MimeHandler::Status mhs = handler->worker(config, fn, mime, doc, ipath);
|
||||
FileInterner::Status ret = FIError;
|
||||
switch (mhs) {
|
||||
case MimeHandler::MHError: break;
|
||||
case MimeHandler::MHDone: ret = FIDone;break;
|
||||
case MimeHandler::MHAgain: ret = FIAgain;break;
|
||||
}
|
||||
doc.mimetype = mime;
|
||||
|
||||
// Clean up. We delete the temp file and its father directory
|
||||
ret = true;
|
||||
out:
|
||||
delete handler;
|
||||
tmpcleanup(tdir, tfile);
|
||||
doc.mimetype = mime;
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Release the MIME handler, if any, and remove the temporary file that may
// have been created for this object.
FileInterner::~FileInterner()
{
    if (handler) {
        delete handler;
        handler = 0;
    }
    tmpcleanup();
}
|
||||
|
||||
@ -1,14 +1,31 @@
|
||||
#ifndef _INTERNFILE_H_INCLUDED_
|
||||
#define _INTERNFILE_H_INCLUDED_
|
||||
/* @(#$Id: internfile.h,v 1.2 2005-02-09 12:07:29 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: internfile.h,v 1.3 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "rcldb.h"
|
||||
|
||||
class MimeHandler;
|
||||
|
||||
/// Turn external file into internal representation, according to mime type etc
|
||||
extern bool internfile(const std::string &fn, RclConfig *config,
|
||||
Rcl::Doc& doc, const string& tdir);
|
||||
class FileInterner {
|
||||
string fn;
|
||||
RclConfig *config;
|
||||
const string &tdir;
|
||||
MimeHandler *handler;
|
||||
string tfile;
|
||||
string mime;
|
||||
|
||||
void tmpcleanup();
|
||||
|
||||
public:
|
||||
FileInterner(const std::string &f, RclConfig *cnf, const string& td);
|
||||
~FileInterner();
|
||||
|
||||
enum Status {FIError, FIDone, FIAgain};
|
||||
Status internfile(Rcl::Doc& doc, string &ipath);
|
||||
};
|
||||
|
||||
#endif /* _INTERNFILE_H_INCLUDED_ */
|
||||
|
||||
@ -38,21 +38,23 @@
|
||||
using namespace std;
|
||||
|
||||
|
||||
bool MimeHandlerHtml::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
MimeHandler::Status
|
||||
MimeHandlerHtml::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string&)
|
||||
{
|
||||
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
||||
string otext;
|
||||
if (!file_to_string(fn, otext)) {
|
||||
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
||||
return false;
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
return worker1(conf, fn, otext, mtype, docout);
|
||||
}
|
||||
|
||||
bool MimeHandlerHtml::worker1(RclConfig *conf, const string &,
|
||||
const string& htext,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
MimeHandler::Status
|
||||
MimeHandlerHtml::worker1(RclConfig *conf, const string &,
|
||||
const string& htext,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
{
|
||||
// Character set handling:
|
||||
|
||||
@ -111,5 +113,5 @@ bool MimeHandlerHtml::worker1(RclConfig *conf, const string &,
|
||||
out.keywords = pres.keywords;
|
||||
out.abstract = pres.sample;
|
||||
docout = out;
|
||||
return true;
|
||||
return MimeHandler::MHDone;
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _HTML_H_INCLUDED_
|
||||
#define _HTML_H_INCLUDED_
|
||||
/* @(#$Id: mh_html.h,v 1.2 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mh_html.h,v 1.3 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
#include "mimehandler.h"
|
||||
|
||||
// Code to turn an html document into an internal one. There are 2
|
||||
@ -11,9 +11,9 @@
|
||||
// carry titles, abstracts, whatever)
|
||||
class MimeHandlerHtml : public MimeHandler {
|
||||
public:
|
||||
virtual bool worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
virtual bool worker1(RclConfig *conf, const string &fn,
|
||||
virtual MimeHandler::Status worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string&);
|
||||
virtual MimeHandler::Status worker1(RclConfig *conf, const string &fn,
|
||||
const string& htext,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
};
|
||||
|
||||
178
src/internfile/mh_mail.cpp
Normal file
178
src/internfile/mh_mail.cpp
Normal file
@ -0,0 +1,178 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <map>
|
||||
using std::map;
|
||||
|
||||
#include "mimehandler.h"
|
||||
#include "debuglog.h"
|
||||
#include "csguess.h"
|
||||
#include "readfile.h"
|
||||
#include "transcode.h"
|
||||
#include "mimeparse.h"
|
||||
#include "indextext.h"
|
||||
#include "mail.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "mimeparse.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// We are called for two different file types: mbox-type folders
|
||||
// holding multiple messages, and maildir-type files with one rfc822
|
||||
// message
|
||||
MimeHandler::Status
|
||||
MimeHandlerMail::worker(RclConfig *cnf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string&)
|
||||
{
|
||||
LOGDEB(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
|
||||
conf = cnf;
|
||||
|
||||
if (!stringlowercmp("message/rfc822", mtype)) {
|
||||
return processone(fn, docout);
|
||||
} else if (!stringlowercmp("text/x-mail", mtype)) {
|
||||
return MimeHandler::MHError;
|
||||
} else
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
|
||||
|
||||
#include "mime.h"
|
||||
|
||||
// Names of the message headers of interest.
const char *hnames[] = {
    "Subject",
    "Content-type"
};
// Number of entries in hnames.
int nh = sizeof(hnames) / sizeof(hnames[0]);
|
||||
|
||||
void walkmime(string &out, Binc::MimePart& doc, int fd, int depth);
|
||||
|
||||
// Transform a single message into a document. The subject becomes the
|
||||
// title, and any simple body part with a content-type of text or html
|
||||
// and content-disposition inline gets concatenated as text.
|
||||
MimeHandler::Status
|
||||
MimeHandlerMail::processone(const string &fn, Rcl::Doc &docout)
|
||||
{
|
||||
int fd;
|
||||
if ((fd = open(fn.c_str(), 0)) < 0) {
|
||||
LOGERR(("MimeHandlerMail::processone: open(%s) errno %d\n",
|
||||
fn.c_str(), errno));
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
Binc::MimeDocument doc;
|
||||
doc.parseFull(fd);
|
||||
|
||||
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
|
||||
LOGERR(("MimeHandlerMail::processone: parse error for %s\n",
|
||||
fn.c_str()));
|
||||
close(fd);
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
LOGDEB(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",
|
||||
doc.isMultipart(), doc.getSubType().c_str()));
|
||||
walkmime(docout.text, doc, fd, 0);
|
||||
close(fd);
|
||||
LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
|
||||
// Recursively walk a MIME part and append the text it holds to 'out'.
//
// @param out   accumulates the extracted text. Each accepted body is
//              appended after a "\r\n" separator.
// @param doc   the part to process
// @param fd    descriptor for the message file, used to read the bodies
// @param depth current nesting level. Parts nested more than 5 levels deep
//              are ignored.
//
// Multipart handling: for multipart/mixed every member is processed. For
// multipart/alternative the text/plain member is processed if there is one,
// else the text/html one. Other multipart subtypes are ignored.
//
// Simple parts are only used if their content type is text/plain or
// text/html and their disposition is inline. The body is decoded according
// to its content transfer encoding, then transcoded to UTF-8.
void walkmime(string &out, Binc::MimePart& doc, int fd, int depth)
{
    if (depth > 5) {
        LOGINFO(("walkmime: max depth exceeded\n"));
        return;
    }

    if (doc.isMultipart()) {
        LOGDEB(("walkmime: ismultipart %d subtype '%s'\n",
                doc.isMultipart(), doc.getSubType().c_str()));
        // We only handle alternative and mixed for now.
        std::vector<Binc::MimePart>::iterator it;
        if (!stringicmp("mixed", doc.getSubType())) {
            for (it = doc.members.begin(); it != doc.members.end(); ++it) {
                walkmime(out, *it, fd, depth+1);
            }
        } else if (!stringicmp("alternative", doc.getSubType())) {
            std::vector<Binc::MimePart>::iterator ittxt, ithtml;
            ittxt = ithtml = doc.members.end();
            for (it = doc.members.begin(); it != doc.members.end(); ++it) {
                // Get and parse the member's own content-type header (not
                // the one for the enclosing multipart).
                Binc::HeaderItem hi;
                if (!it->h.getFirstHeader("Content-Type", hi))
                    continue;
                string mctt = hi.getValue();
                LOGDEB(("walkmime:content-type: %s\n", mctt.c_str()));
                MimeHeaderValue member_type;
                parseMimeHeaderValue(mctt, member_type);
                if (!stringlowercmp("text/plain", member_type.value))
                    ittxt = it;
                else if (!stringlowercmp("text/html", member_type.value))
                    ithtml = it;
            }
            // Prefer the plain text version, else use the html one.
            if (ittxt != doc.members.end())
                walkmime(out, *ittxt, fd, depth+1);
            else if (ithtml != doc.members.end())
                walkmime(out, *ithtml, fd, depth+1);
        }
        return;
    }

    // Simple part. If content-type is text or html and content-disposition
    // is inline, decode and add to text.

    // Get and parse content-type header. The default is text/plain.
    Binc::HeaderItem hi;
    string ctt = "text/plain";
    if (doc.h.getFirstHeader("Content-Type", hi)) {
        ctt = hi.getValue();
    }
    LOGDEB(("walkmime:content-type: %s\n", ctt.c_str()));
    MimeHeaderValue content_type;
    parseMimeHeaderValue(ctt, content_type);
    if (stringlowercmp("text/plain", content_type.value) &&
        stringlowercmp("text/html", content_type.value)) {
        return;
    }
    string charset = "us-ascii";
    map<string,string>::const_iterator cit;
    cit = content_type.params.find(string("charset"));
    if (cit != content_type.params.end())
        charset = cit->second;

    // Content disposition. The default is inline.
    string ctd = "inline";
    if (doc.h.getFirstHeader("Content-Disposition", hi)) {
        ctd = hi.getValue();
    }
    MimeHeaderValue content_disposition;
    parseMimeHeaderValue(ctd, content_disposition);
    if (stringlowercmp("inline", content_disposition.value)) {
        return;
    }

    // Content transfer encoding. The default is 7bit (no decoding needed).
    string cte = "7bit";
    if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
        cte = hi.getValue();
    }
    // Normalize the value like the other headers. Keep the raw value if
    // the parser yields nothing for it.
    MimeHeaderValue transfer_encoding;
    parseMimeHeaderValue(cte, transfer_encoding);
    if (!transfer_encoding.value.empty())
        cte = transfer_encoding.value;

    LOGDEB(("walkmime: final: body start offset %d, length %d\n",
            doc.getBodyStartOffset(), doc.getBodyLength()));
    string body;
    doc.getBody(fd, body, 0, doc.bodylength);

    // Decode according to the content transfer encoding. The comparison
    // functions return 0 for a match.
    if (!stringlowercmp("quoted-printable", cte)) {
        string decoded;
        qp_decode(body, decoded);
        body = decoded;
    } else if (!stringlowercmp("base64", cte)) {
        string decoded;
        base64_decode(body, decoded);
        body = decoded;
    }

    // Transcode to utf-8. Use the data as is if this fails.
    string transcoded;
    if (!transcode(body, transcoded, charset, "UTF-8")) {
        LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
                charset.c_str()));
        transcoded = body;
    }

    out += string("\r\n") + transcoded;
}
|
||||
|
||||
|
||||
16
src/internfile/mh_mail.h
Normal file
16
src/internfile/mh_mail.h
Normal file
@ -0,0 +1,16 @@
|
||||
#ifndef _MAIL_H_INCLUDED_
|
||||
#define _MAIL_H_INCLUDED_
|
||||
/* @(#$Id: mh_mail.h,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
#include "mimehandler.h"
|
||||
|
||||
// Code to turn a mail folder file into internal documents
|
||||
class MimeHandlerMail : public MimeHandler {
|
||||
RclConfig *conf;
|
||||
MimeHandler::Status processone(const string &fn, Rcl::Doc &docout);
|
||||
public:
|
||||
MimeHandlerMail() : conf(0) {}
|
||||
virtual MimeHandler::Status
|
||||
worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string& ipath);
|
||||
};
|
||||
#endif /* _MAIL_H_INCLUDED_ */
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.8 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
@ -13,23 +13,24 @@ using namespace std;
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "html.h"
|
||||
#include "mail.h"
|
||||
#include "execmd.h"
|
||||
#include "pathut.h"
|
||||
|
||||
class MimeHandlerText : public MimeHandler {
|
||||
public:
|
||||
bool worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
MimeHandler::Status worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string&);
|
||||
|
||||
};
|
||||
|
||||
// Process a plain text file
|
||||
bool MimeHandlerText::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
MimeHandler::Status MimeHandlerText::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string&)
|
||||
{
|
||||
string otext;
|
||||
if (!file_to_string(fn, otext))
|
||||
return false;
|
||||
return MimeHandler::MHError;
|
||||
|
||||
// Try to guess charset, then convert to utf-8, and fill document
|
||||
// fields The charset guesser really doesnt work well in general
|
||||
@ -46,36 +47,38 @@ bool MimeHandlerText::worker(RclConfig *conf, const string &fn,
|
||||
cerr << "textPlainToDoc: transcode failed: charset '" << charset
|
||||
<< "' to UTF-8: "<< utf8 << endl;
|
||||
otext.erase();
|
||||
return 0;
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
|
||||
Rcl::Doc out;
|
||||
out.origcharset = charset;
|
||||
out.text = utf8;
|
||||
docout = out;
|
||||
return true;
|
||||
return MimeHandler::MHDone;
|
||||
}
|
||||
|
||||
class MimeHandlerExec : public MimeHandler {
|
||||
public:
|
||||
list<string> params;
|
||||
virtual ~MimeHandlerExec() {}
|
||||
virtual bool worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
virtual MimeHandler::Status worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout,
|
||||
string&);
|
||||
|
||||
};
|
||||
|
||||
|
||||
// Execute an external program to translate a file from its native format
|
||||
// to html. Then call the html parser to do the actual indexing
|
||||
bool MimeHandlerExec::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
MimeHandler::Status
|
||||
MimeHandlerExec::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout, string&)
|
||||
{
|
||||
if (params.empty()) {
|
||||
// Hu ho
|
||||
LOGERR(("MimeHandlerExec::worker: empty params for mime %s\n",
|
||||
mtype.c_str()));
|
||||
return false;
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
// Command name
|
||||
string cmd = find_filter(conf, params.front());
|
||||
@ -92,7 +95,7 @@ bool MimeHandlerExec::worker(RclConfig *conf, const string &fn,
|
||||
if (status) {
|
||||
LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
|
||||
status, cmd.c_str()));
|
||||
return false;
|
||||
return MimeHandler::MHError;
|
||||
}
|
||||
|
||||
// Process/index the html
|
||||
@ -106,6 +109,10 @@ static MimeHandler *mhfact(const string &mime)
|
||||
return new MimeHandlerText;
|
||||
else if (!stringlowercmp("text/html", mime))
|
||||
return new MimeHandlerHtml;
|
||||
else if (!stringlowercmp("text/x-mail", mime))
|
||||
return new MimeHandlerMail;
|
||||
else if (!stringlowercmp("message/rfc822", mime))
|
||||
return new MimeHandlerMail;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -117,7 +124,7 @@ MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
|
||||
// Return handler definition for mime type
|
||||
string hs;
|
||||
if (!mhandlers->get(mtype, hs, "index")) {
|
||||
LOGDEB(("getMimeHandler: no handler for %s\n", mtype.c_str()));
|
||||
LOGDEB(("getMimeHandler: no handler for '%s'\n", mtype.c_str()));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _MIMEHANDLER_H_INCLUDED_
|
||||
#define _MIMEHANDLER_H_INCLUDED_
|
||||
/* @(#$Id: mimehandler.h,v 1.5 2005-02-04 09:39:44 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mimehandler.h,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -10,13 +10,34 @@
|
||||
|
||||
|
||||
/**
|
||||
* Document interner class. We sometimes have data to pass to an interner
|
||||
* Document interner class.
|
||||
*/
|
||||
class MimeHandler {
|
||||
public:
|
||||
virtual ~MimeHandler() {}
|
||||
virtual bool worker(RclConfig *, const std::string &filename,
|
||||
const std::string &mimetype, Rcl::Doc& outdoc) = 0;
|
||||
|
||||
/**
|
||||
* Transform external data into internal utf8 document
|
||||
*
|
||||
* @param conf the global configuration
|
||||
* @param filename File from which the data comes
|
||||
* @param mimetype its mime type (from the mimemap configuration file)
|
||||
* @param outdoc The output document
|
||||
* @param ipath the access path for the document inside the file.
|
||||
* For mono-document file types, this will always be empty.
|
||||
* It is used, for example for mbox files which may contain
|
||||
* multiple emails. If this is not empty in input, then the
|
||||
* caller is requesting a single document (ie: for display).
|
||||
* If this is empty (during indexing), it will be filled in
|
||||
* by the function, and all the file's documents will be
|
||||
* returned by successive calls.
|
||||
* @return the return value indicates if there are more documents to be
|
||||
* fetched from the same file.
|
||||
*/
|
||||
enum Status {MHError, MHDone, MHAgain};
|
||||
virtual Status worker(RclConfig * conf, const std::string &filename,
|
||||
const std::string &mimetype, Rcl::Doc& outdoc,
|
||||
string& ipath) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -30,6 +51,11 @@ extern MimeHandler *getMimeHandler(const std::string &mtyp, ConfTree *mhdlers);
|
||||
*/
|
||||
extern std::string getMimeViewer(const std::string &mtyp, ConfTree *mhandlers);
|
||||
|
||||
/**
|
||||
* Return command to uncompress the given type. The returned command has
|
||||
* substitutable places for input file name and temp dir name, and will
|
||||
* return output name
|
||||
*/
|
||||
bool getUncompressor(const std::string &mtype, ConfTree *mhandlers,
|
||||
std::list<std::string>& cmd);
|
||||
|
||||
|
||||
@ -150,7 +150,8 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
lowercase_term(hequiv);
|
||||
if (hequiv == "content-type") {
|
||||
string value = i->second;
|
||||
MimeHeaderValue p = parseMimeHeaderValue(value);
|
||||
MimeHeaderValue p;
|
||||
parseMimeHeaderValue(value, p);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find("charset")) !=
|
||||
p.params.end()) {
|
||||
|
||||
@ -8,14 +8,15 @@ all: $(LIBS)
|
||||
|
||||
OBJS = conftree.o csguess.o debuglog.o \
|
||||
execmd.o wipedir.o \
|
||||
fstreewalk.o html.o htmlparse.o indexer.o internfile.o \
|
||||
fstreewalk.o html.o mail.o htmlparse.o indexer.o internfile.o \
|
||||
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
|
||||
rclconfig.o rcldb.o readfile.o smallut.o \
|
||||
textsplit.o transcode.o \
|
||||
unacpp.o unac.o
|
||||
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
|
||||
../utils/execmd.cpp ../utils/wipedir.cpp \
|
||||
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
|
||||
../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \
|
||||
../common/htmlparse.cpp \
|
||||
../index/indexer.cpp ../common/internfile.cpp \
|
||||
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
|
||||
../common/myhtmlparse.cpp ../utils/pathut.cpp \
|
||||
@ -51,6 +52,8 @@ indexer.o : ../index/indexer.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
internfile.o : ../common/internfile.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mail.o : ../common/mail.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mimehandler.o : ../common/mimehandler.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mimeparse.o : ../utils/mimeparse.cpp
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
CXXFLAGS = -pthread -Wall -g -I. -I../index -I../utils -I../common \
|
||||
-I../unac -I/usr/local/include
|
||||
-I../unac -I../bincimapmime -I/usr/local/include
|
||||
|
||||
LIBXAPIAN = -L/usr/local/lib -lxapian
|
||||
LIBICONV = -L/usr/local/lib -liconv
|
||||
|
||||
@ -23,7 +23,7 @@ unix {
|
||||
UI_DIR = .ui
|
||||
MOC_DIR = .moc
|
||||
OBJECTS_DIR = .obj
|
||||
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
|
||||
LIBS += ../lib/librcl.a ../bincimapmime/libmime.a -L/usr/local/lib -lxapian -liconv
|
||||
INCLUDEPATH += ../common ../index ../query ../unac ../utils
|
||||
}
|
||||
|
||||
|
||||
@ -197,7 +197,8 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
// for preview:
|
||||
string fn = urltolocalpath(doc.url);
|
||||
Rcl::Doc fdoc;
|
||||
if (!internfile(fn, rclconfig, fdoc, tmpdir)) {
|
||||
FileInterner interner(fn, rclconfig, tmpdir);
|
||||
if (interner.internfile(fdoc, doc.ipath) != FileInterner::FIDone) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QString("Can't turn doc into internal rep ") +
|
||||
doc.mimetype.c_str());
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.12 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.13 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -31,8 +31,9 @@ namespace Rcl {
|
||||
*/
|
||||
class Doc {
|
||||
public:
|
||||
// This fields potentially go into the document data record
|
||||
// These fields potentially go into the document data record
|
||||
string url;
|
||||
string ipath;
|
||||
string mimetype;
|
||||
string mtime; // Modification time as decimal ascii
|
||||
string origcharset;
|
||||
@ -41,8 +42,10 @@ class Doc {
|
||||
string abstract;
|
||||
|
||||
string text;
|
||||
|
||||
void erase() {
|
||||
url.erase();
|
||||
ipath.erase();
|
||||
mimetype.erase();
|
||||
mtime.erase();
|
||||
origcharset.erase();
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# @(#$Id: mimeconf,v 1.4 2005-03-17 14:02:05 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# @(#$Id: mimeconf,v 1.5 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
|
||||
# Recoll : associations of mime types to processing filters.
|
||||
# There are different sections for decompression, 'interning' for indexing
|
||||
@ -27,6 +27,9 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
|
||||
[index]
|
||||
text/plain = internal
|
||||
text/html = internal
|
||||
text/x-mail = internal
|
||||
message/rfc822 = internal
|
||||
|
||||
application/pdf = exec rclpdf
|
||||
application/postscript = exec rclps
|
||||
application/msword = exec rcldoc
|
||||
@ -46,7 +49,7 @@ application/vnd.sun.xml.writer.template = exec rclsoff
|
||||
# External viewers, launched when you double-click a result entry
|
||||
[view]
|
||||
text/plain = xemacs %f
|
||||
text/html = firefox -a firefox -remote "openFile(%u)"
|
||||
text/html = firefox -remote "openFile(%u)"
|
||||
application/pdf = xpdf %f
|
||||
application/postscript = gv %f
|
||||
application/msword = openoffice-1.1.3-swriter %f
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.2 2005-03-17 14:02:06 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#ifndef TEST_MIMEPARSE
|
||||
@ -7,71 +7,345 @@ static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.2 2005-03-17 14:02:06 dockes Ex
|
||||
#include <string>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "mimeparse.h"
|
||||
|
||||
using namespace std;
|
||||
#define WHITE " \t\n"
|
||||
|
||||
static void stripw_lc(string &in)
|
||||
// Parsing a header value. Only content-type has parameters, but
|
||||
// others are compatible with content-type syntax; their parameters
|
||||
// are not used. So we can parse all like content-type:
|
||||
// headertype: value [; paramname=paramvalue] ...
|
||||
// Value and paramvalues can be quoted strings, and there can be
|
||||
// comments in there
|
||||
|
||||
|
||||
|
||||
// The lexical token returned by find_next_token: describes one lexeme
// (a token or a separator character) extracted from a header value.
|
||||
class Lexical {
|
||||
public:
|
||||
// Kind of lexeme: nothing yet, a token (bare or quoted), or a separator
enum kind {none, token, separator};
|
||||
kind what;
|
||||
// Token text (quotes not included) or the separator character
string value;
|
||||
// Error descriptions are appended here when the lexer fails
string error;
|
||||
// Opening quote character for a quoted token, 0 otherwise
char quote;
|
||||
// Starts out empty: no lexeme and no quote
Lexical() : what(none), quote(0) {}
|
||||
// Back to the freshly constructed state; also drops any error text
void reset() {what = none; value.erase(); error.erase();quote = 0;}
|
||||
};
|
||||
|
||||
// Skip mime comment. This must be called with in[start] == '('
// Nested comments are handled by counting parentheses, and a backslash
// escapes the character that follows it.
// Returns the index of the ')' that closes the comment (not the index
// after it). On error (backslash at end of string, unclosed comment) a
// message is appended to lex.error and string::npos, converted to int,
// is returned.
// NOTE(review): the statements below that use pos/pos1, assign to 'in' or
// lower-case it look like leftovers of the removed stripw_lc() that the
// diff view shows interleaved here; confirm before relying on them.
|
||||
int skip_comment(const string &in, unsigned int start, Lexical &lex)
|
||||
{
|
||||
// fprintf(stderr, "In: '%s'\n", in.c_str());
|
||||
string::size_type pos, pos1;
|
||||
pos = in.find_first_not_of(WHITE);
|
||||
if (pos == string::npos) {
|
||||
// All white
|
||||
in = "";
|
||||
return;
|
||||
// Current nesting depth of '(' ... ')' pairs
int commentlevel = 0;
|
||||
for (; start < in.size(); start++) {
|
||||
if (in[start] == '\\') {
|
||||
// Skip escaped char.
|
||||
if (start+1 < in.size()) {
|
||||
start++;
|
||||
continue;
|
||||
} else {
|
||||
lex.error.append("\\ at end of string ");
|
||||
return string::npos;
|
||||
}
|
||||
}
|
||||
if (in[start] == '(')
|
||||
commentlevel++;
|
||||
if (in[start] == ')') {
|
||||
// Depth back to 0: this ')' closes the outermost comment
if (--commentlevel == 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
in.replace(0, pos, "");
|
||||
pos1 = in.find_last_not_of(WHITE);
|
||||
if (pos1 != in.length() -1)
|
||||
in = in.replace(pos1+1, string::npos, "");
|
||||
string::iterator i;
|
||||
for (i = in.begin(); i != in.end(); i++)
|
||||
*i = tolower(*i);
|
||||
// Ran off the end of the string without finding the closing ')'
if (start == in.size()) {
|
||||
lex.error.append("Unclosed comment ");
|
||||
return string::npos;
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
MimeHeaderValue parseMimeHeaderValue(const string &ein)
|
||||
// Skip initial whitespace and (possibly nested) comments.
|
||||
int skip_whitespace_and_comment(const string &in, unsigned int start,
|
||||
Lexical &lex)
|
||||
{
|
||||
string in = ein;
|
||||
MimeHeaderValue out;
|
||||
string::size_type pos;
|
||||
while (1) {
|
||||
if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos)
|
||||
return in.size();
|
||||
if (in[start] == '(') {
|
||||
if ((start = skip_comment(in, start, lex)) == string::npos)
|
||||
return string::npos;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
pos = in.find_first_not_of(WHITE);
|
||||
if (pos == string::npos)
|
||||
return out;
|
||||
in = in.substr(pos, string::npos);
|
||||
if ((pos = in.find_first_of(";")) == string::npos) {
|
||||
out.value = in;
|
||||
return out;
|
||||
}
|
||||
out.value = in.substr(0, pos);
|
||||
stripw_lc(out.value);
|
||||
in = in.substr(pos+1, string::npos);
|
||||
for (;;) {
|
||||
// Skip whitespace
|
||||
if ((pos = in.find_first_not_of(WHITE)) == string::npos)
|
||||
return out;
|
||||
in = in.substr(pos, string::npos);
|
||||
/// Find next token in mime header value string.
|
||||
/// @return the next starting position in string, string::npos for error
|
||||
/// (ie unbalanced quoting)
|
||||
/// @param in the input string
|
||||
/// @param start the starting position
|
||||
/// @param lex the returned token and its description
|
||||
/// @param delims separators we should look for
|
||||
int find_next_token(const string &in, unsigned int start,
|
||||
Lexical &lex, string delims = ";=")
|
||||
{
|
||||
char oquot, cquot;
|
||||
|
||||
if ((pos = in.find_first_of("=")) == string::npos)
|
||||
return out;
|
||||
string pname = in.substr(0, pos);
|
||||
stripw_lc(pname);
|
||||
in = in.substr(pos+1, string::npos);
|
||||
start = skip_whitespace_and_comment(in, start, lex);
|
||||
if (start == string::npos || start == in.size())
|
||||
return start;
|
||||
|
||||
pos = in.find_first_of(";");
|
||||
string pvalue = in.substr(0, pos);
|
||||
stripw_lc(pvalue);
|
||||
out.params[pname] = pvalue;
|
||||
if (pos == string::npos)
|
||||
return out;
|
||||
in = in.substr(pos+1, string::npos);
|
||||
// Begins with separator ? return it.
|
||||
unsigned int delimi = delims.find_first_of(in[start]);
|
||||
if (delimi != string::npos) {
|
||||
lex.what = Lexical::separator;
|
||||
lex.value = delims[delimi];
|
||||
return start+1;
|
||||
}
|
||||
|
||||
return out;
|
||||
// Check for start of quoted string
|
||||
oquot = in[start];
|
||||
switch (oquot) {
|
||||
case '<': cquot = '>';break;
|
||||
case '"': cquot = '"';break;
|
||||
default: cquot = 0; break;
|
||||
}
|
||||
|
||||
if (cquot != 0) {
|
||||
// Quoted string parsing
|
||||
unsigned int end;
|
||||
start++; // Skip quote character
|
||||
for (end = start;end < in.size() && in[end] != cquot; end++) {
|
||||
if (in[end] == '\\') {
|
||||
// Skip escaped char.
|
||||
if (end+1 < in.size()) {
|
||||
end++;
|
||||
} else {
|
||||
// backslash at end of string: error
|
||||
lex.error.append("\\ at end of string ");
|
||||
return string::npos;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (end == in.size()) {
|
||||
// Found end of string before closing quote character: error
|
||||
lex.error.append("Unclosed quoted string ");
|
||||
return string::npos;
|
||||
}
|
||||
lex.what = Lexical::token;
|
||||
lex.value = in.substr(start, end-start);
|
||||
lex.quote = oquot;
|
||||
return ++end;
|
||||
} else {
|
||||
unsigned int end = in.find_first_of(delims + " \t(", start);
|
||||
lex.what = Lexical::token;
|
||||
lex.quote = 0;
|
||||
if (end == string::npos) {
|
||||
end = in.size();
|
||||
lex.value = in.substr(start);
|
||||
} else {
|
||||
lex.value = in.substr(start, end-start);
|
||||
}
|
||||
return end;
|
||||
}
|
||||
}
|
||||
|
||||
// Append a lower-cased copy of 'in' to 'out'. 'out' is not cleared first.
// Each byte goes through unsigned char before tolower(): passing a negative
// char (any 8-bit byte where char is signed) is undefined behaviour.
void stringtolower(std::string &out, const std::string& in)
{
    out.reserve(out.size() + in.size());
    for (std::string::size_type i = 0; i < in.size(); i++)
        out.append(1, char(tolower(static_cast<unsigned char>(in[i]))));
}
|
||||
|
||||
bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
|
||||
{
|
||||
parsed.value.erase();
|
||||
parsed.params.clear();
|
||||
|
||||
Lexical lex;
|
||||
unsigned int start = 0;
|
||||
start = find_next_token(value, start, lex);
|
||||
if (start == string::npos || lex.what != Lexical::token)
|
||||
return false;
|
||||
|
||||
parsed.value = lex.value;
|
||||
|
||||
for (;;) {
|
||||
string paramname, paramvalue;
|
||||
lex.reset();
|
||||
start = find_next_token(value, start, lex);
|
||||
if (start == value.size())
|
||||
return true;
|
||||
if (start == string::npos)
|
||||
return false;
|
||||
if (lex.what == Lexical::separator && lex.value[0] == ';')
|
||||
continue;
|
||||
if (lex.what != Lexical::token)
|
||||
return false;
|
||||
stringtolower(paramname, lex.value);
|
||||
|
||||
start = find_next_token(value, start, lex);
|
||||
if (start == string::npos || lex.what != Lexical::separator ||
|
||||
lex.value[0] != '=')
|
||||
return false;
|
||||
|
||||
start = find_next_token(value, start, lex);
|
||||
if (start == string::npos || lex.what != Lexical::token)
|
||||
return false;
|
||||
paramvalue = lex.value;
|
||||
parsed.params[paramname] = paramvalue;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Value of a hexadecimal digit, or -1 if c is not one.
static int qp_hexdigit(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    return -1;
}

// Decode a string encoded with quoted-printable encoding.
// The decoded bytes are appended to 'out'.
// @return false if an '=' is followed by something which is neither a line
//         break nor two hexadecimal digits. An '=' with fewer than two
//         characters after it ends the decoding without error.
bool qp_decode(const std::string& in, std::string &out)
{
    out.reserve(out.size() + in.length());
    for (std::string::size_type ii = 0; ii < in.length(); ii++) {
        if (in[ii] != '=') {
            out += in[ii];
            continue;
        }

        ii++; // Skip '='
        if (ii >= in.length() - 1) {
            // Need at least 2 more chars: truncated input, stop here
            break;
        }
        if (in[ii] == '\r' && in[ii + 1] == '\n') {
            // Soft line break: skip the CR, the loop skips the LF
            ii++;
        } else if (in[ii] != '\n' && in[ii] != '\r') {
            // Both digits exist, this was checked just above
            const int hi = qp_hexdigit(in[ii]);
            const int lo = qp_hexdigit(in[ii + 1]);
            if (hi < 0 || lo < 0)
                return false;
            ii++;
            out += char(hi * 16 + lo);
        }
        // else: '=' followed by a lone CR or LF, also a soft line break;
        // the loop increment skips the line break character.
    }
    return true;
}
|
||||
|
||||
|
||||
// This is adapted from FreeBSD's code.
static const char Base64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char Pad64 = '=';

// Decode base64 data.
// @param in  the encoded text; whitespace is ignored wherever it appears
// @param out replaced with the decoded bytes
// @return false if the input holds a character outside the base64 alphabet,
//         ends in the middle of a 4-character group, has wrong padding, has
//         anything but whitespace after the padding, or has non-zero unused
//         bits in its last group. 'out' then holds what was decoded so far.
bool base64_decode(const std::string& in, std::string& out)
{
    out.erase();
    out.reserve(in.length() / 4 * 3 + 3);

    unsigned int pending = 0; // bits of the current group not yet output
    int state = 0;            // characters seen in the current group (0-3)
    bool sawpad = false;
    std::string::size_type ii = 0;

    for (; ii < in.length(); ii++) {
        const int ch = static_cast<unsigned char>(in[ii]);
        if (isspace(ch)) /* Skip whitespace anywhere. */
            continue;
        if (ch == Pad64) {
            sawpad = true;
            break;
        }
        // strchr() would match the table's terminating NUL: reject it here
        const char *pos = (ch != 0) ? strchr(Base64, ch) : 0;
        if (pos == 0) /* A non-base64 character. */
            return false;
        const unsigned int val = static_cast<unsigned int>(pos - Base64);

        // Bytes are appended as they get complete; indexing into a string
        // that was only reserve()d is not allowed.
        switch (state) {
        case 0:
            pending = val;
            state = 1;
            break;
        case 1:
            out += char((pending << 2) | (val >> 4));
            pending = val & 0x0f;
            state = 2;
            break;
        case 2:
            out += char((pending << 4) | (val >> 2));
            pending = val & 0x03;
            state = 3;
            break;
        default:
            out += char((pending << 6) | val);
            pending = 0;
            state = 0;
            break;
        }
    }

    if (!sawpad) {
        /*
         * We ended by seeing the end of the string. Make sure we
         * have no partial bytes lying around.
         */
        return state == 0;
    }

    /* We stopped on a pad char: '=' is invalid in 1st or 2nd position */
    if (state < 2)
        return false;
    ii++; /* Skip the pad char */
    if (state == 2) {
        /* One byte of info: a second '=' must follow, spaces allowed */
        while (ii < in.length() && isspace(static_cast<unsigned char>(in[ii])))
            ii++;
        if (ii >= in.length() || in[ii] != Pad64)
            return false;
        ii++;
    }
    /* Nothing but whitespace may follow the padding */
    for (; ii < in.length(); ii++) {
        if (!isspace(static_cast<unsigned char>(in[ii])))
            return false;
    }
    /*
     * The "extra" bits that slopped past the last full byte must be
     * zeros. If we don't check them, they become a subliminal channel.
     */
    return pending == 0;
}
|
||||
|
||||
#else
|
||||
@ -82,19 +356,47 @@ using namespace std;
|
||||
// Manual test driver: this part of the file sits in the #else branch of
// the "#ifndef TEST_MIMEPARSE" found near its top.
// Three tests are selected by hand with the #if 0 / #elif 0 / #else chain
// below (header value parsing, quoted-printable decoding, base64
// decoding); as written only the last one, base64, is compiled.
int
|
||||
main(int argc, const char **argv)
|
||||
{
|
||||
// Test 1: parse a content-type style header value and print the result
#if 0
|
||||
// const char *tr = "text/html; charset=utf-8; otherparam=garb";
|
||||
const char *tr = "text/html;charset = UTF-8 ; otherparam=garb; \n"
|
||||
"QUOTEDPARAM=\"quoted value\"";
|
||||
|
||||
MimeHeaderValue parsed;
|
||||
|
||||
// const char *tr = "text/html; charset=utf-8; otherparam=garb";
|
||||
const char *tr = "text/html;charset = UTF-8 ; otherparam=garb;";
|
||||
|
||||
parsed = parseMimeHeaderValue(tr);
|
||||
if (!parseMimeHeaderValue(tr, parsed)) {
|
||||
fprintf(stderr, "PARSE ERROR\n");
|
||||
}
|
||||
|
||||
// Print the main value, then each parameter as 'name' = 'value'
printf("'%s' \n", parsed.value.c_str());
|
||||
map<string, string>::iterator it;
|
||||
for (it = parsed.params.begin();it != parsed.params.end();it++) {
|
||||
printf(" '%s' = '%s'\n", it->first.c_str(), it->second.c_str());
|
||||
}
|
||||
// Test 2: quoted-printable decoding. The sample has a soft line break,
// bare newlines, and ends with an invalid "=xx" sequence.
#elif 0
|
||||
const char *qp = "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
|
||||
"agrave is: '=E0' probable skipped decode error: =\n"
|
||||
"Actual decode error =xx this wont show";
|
||||
|
||||
string out;
|
||||
if (!qp_decode(string(qp), out)) {
|
||||
fprintf(stderr, "qp_decode returned error\n");
|
||||
}
|
||||
printf("Decoded: '%s'\n", out.c_str());
|
||||
// Test 3: base64 decoding of a two-line sample.
// The quoted lines below are sample text in French, kept verbatim;
// presumably what the base64 data decodes to -- TODO confirm.
#else
|
||||
//'C'est à boire qu'il nous faut éviter l'excès.'
|
||||
//'Deuxième ligne'
|
||||
//'Troisième ligne'
|
||||
//'Et la fin (pas de nl). '
|
||||
const char *b64 =
|
||||
"Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
|
||||
"IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";
|
||||
|
||||
string out;
|
||||
if (!base64_decode(string(b64), out)) {
|
||||
fprintf(stderr, "base64_decode returned error\n");
|
||||
}
|
||||
printf("Decoded: '%s'\n", out.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // TEST_MIMEPARSE
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _MIME_H_INCLUDED_
|
||||
#define _MIME_H_INCLUDED_
|
||||
/* @(#$Id: mimeparse.h,v 1.1 2005-01-26 11:45:55 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mimeparse.h,v 1.2 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
@ -11,7 +11,9 @@ class MimeHeaderValue {
|
||||
std::string value;
|
||||
std::map<std::string, std::string> params;
|
||||
};
|
||||
extern MimeHeaderValue parseMimeHeaderValue(const std::string &in);
|
||||
extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd);
|
||||
|
||||
bool qp_decode(const std::string& in, std::string &out);
|
||||
bool base64_decode(const std::string& in, std::string &out);
|
||||
|
||||
#endif /* _MIME_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user