mail ckpt

This commit is contained in:
dockes 2005-03-25 09:40:28 +00:00
parent 63a29c7ced
commit d392d317bb
25 changed files with 872 additions and 145 deletions

View File

@ -0,0 +1,3 @@
Most of the code in this directory was taken from the Binc IMAP project
(http://www.bincimap.org/), version 1.3.3

View File

@ -72,3 +72,27 @@ void Binc::MimePart::printBody(int fd, IODevice &output,
output << (char)c;
}
}
/**
 * Read (part of) the body of this MIME part into a string.
 *
 * @param fd          descriptor of the file holding the message. The
 *                    mimeSource input object is re-created if it is not
 *                    already bound to this descriptor.
 * @param s           output string; the bytes read are appended to it.
 * @param startoffset offset of the first wanted byte, relative to the
 *                    start of the body.
 * @param length      maximum number of bytes wanted. It is clamped to what
 *                    remains of the body after startoffset.
 */
void Binc::MimePart::getBody(int fd, string &s,
                             unsigned int startoffset,
                             unsigned int length) const
{
    if (!mimeSource || mimeSource->getFileDescriptor() != fd) {
        delete mimeSource;
        mimeSource = new MimeInputSource(fd);
    }

    // Nothing to return when starting at or past the end of the body.
    // Testing this first also keeps the unsigned subtraction below from
    // wrapping around (which used to make us read up to end of file).
    if (startoffset >= bodylength)
        return;

    // Clamp without computing startoffset + length, which could overflow.
    const unsigned int remaining = bodylength - startoffset;
    if (length > remaining)
        length = remaining;

    mimeSource->reset();
    mimeSource->seek(bodystartoffsetcrlf + startoffset);

    char c = '\0';
    for (unsigned int i = 0; i < length; ++i) {
        // Stop early if the input runs out before 'length' bytes
        if (!mimeSource->getChar(&c))
            break;
        s += c;
    }
}

View File

@ -107,6 +107,7 @@ namespace Binc {
inline unsigned int getBodyStartOffset(void) const { return bodystartoffsetcrlf; }
void printBody(int fd, Binc::IODevice &output, unsigned int startoffset, unsigned int length) const;
void getBody(int fd, std::string& s, unsigned int startoffset, unsigned int length) const;
void printHeader(int fd, Binc::IODevice &output, std::vector<std::string> headers, bool includeheaders, unsigned int startoffset, unsigned int length, std::string &storage) const;
void printDoc(int fd, Binc::IODevice &output, unsigned int startoffset, unsigned int length) const;
virtual void clear(void) const;

View File

@ -0,0 +1,87 @@
#ifndef lint
static char rcsid [] = "@(#$Id: trbinc.cc,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 1994 CDKIT";
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include "mime.h"
static char *thisprog;
static char usage [] =
"trbinc <mboxfile> \n\n"
;
// Print the program name and the usage text on stderr, then terminate the
// process with exit status 1. This function does not return.
static void
Usage(void)
{
fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
exit(1);
}
static int op_flags;
#define OPT_MOINS 0x1
#define OPT_s 0x2
#define OPT_b 0x4
#define DEFCOUNT 10
const char *hnames[] = {"Subject", "Content-type"};
int nh = sizeof(hnames) / sizeof(char *);
// Test driver for the Binc mime parser: parse the file named on the
// command line and print the value of each header listed in hnames.
// Exit status is 0 on success, 1 on usage error, open/parse failure, or
// if one of the headers is missing.
int main(int argc, char **argv)
{
int count = DEFCOUNT;
thisprog = argv[0];
argc--; argv++;
// Option processing: -s sets OPT_s, -b <n> sets OPT_b and stores n in
// count. Neither the flags nor count are used further down in this function.
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* A lone "-" argument (as in "adb - core") is not accepted */
Usage();
while (**argv)
switch (*(*argv)++) {
case 's': op_flags |= OPT_s; break;
// -b takes its value from the next argument, so that argument is
// consumed here and the rest of the current one is skipped (goto b1)
case 'b': op_flags |= OPT_b; if (argc < 2) Usage();
if ((sscanf(*(++argv), "%d", &count)) != 1)
Usage();
argc--;
goto b1;
default: Usage(); break;
}
b1: argc--; argv++;
}
// Exactly one non-option argument is expected: the file to parse
if (argc != 1)
Usage();
char *mfile = *argv++;argc--;
int fd;
// Open flags value 0: read-only
if ((fd = open(mfile, 0)) < 0) {
perror("Opening");
exit(1);
}
Binc::MimeDocument doc;
doc.parseFull(fd);
// Fail only if neither the header nor the whole document got parsed
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
fprintf(stderr, "Parse error\n");
exit(1);
}
close(fd);
Binc::HeaderItem hi;
// Print the first occurrence of each header of interest. A missing
// header is treated as an error.
for (int i = 0; i < nh ; i++) {
if (!doc.h.getFirstHeader(hnames[i], hi)) {
fprintf(stderr, "No %s\n", hnames[i]);
exit(1);
}
printf("%s: %s\n", hnames[i], hi.getValue().c_str());
}
exit(0);
}

View File

@ -1,7 +1,6 @@
include ../mk/sysconf
BIGLIB = ../lib/librcl.a
PROGS = unacpp textsplit
all: $(PROGS)

View File

@ -1,11 +1,12 @@
include ../mk/sysconf
BIGLIB = ../lib/librcl.a
MIMELIB = ../bincimapmime/libmime.a
PROGS = recollindex csguess
all: $(PROGS)
RECOLLINDEX_OBJS= recollindex.o $(BIGLIB)
RECOLLINDEX_OBJS= recollindex.o $(BIGLIB) $(MIMELIB)
recollindex : $(RECOLLINDEX_OBJS)
$(CXX) $(CXXFLAGS) -o recollindex $(RECOLLINDEX_OBJS) \
$(LIBXAPIAN) $(LIBICONV) $(LIBSYS)

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.8 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -146,18 +146,25 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
return FsTreeWalker::FtwOk;
}
Rcl::Doc doc;
if (!internfile(fn, config, doc, tmpdir))
return FsTreeWalker::FtwOk;
FileInterner interner(fn, config, tmpdir);
FileInterner::Status fis = FileInterner::FIAgain;
while (fis == FileInterner::FIAgain) {
Rcl::Doc doc;
string ipath;
fis = interner.internfile(doc, ipath);
if (fis == FileInterner::FIError)
break;
// Set up common fields:
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_ctime));
doc.mtime = ascdate;
// Set up common fields:
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_ctime));
doc.mtime = ascdate;
doc.ipath = ipath;
// Do database-specific work to update document data
if (!db.add(fn, doc))
return FsTreeWalker::FtwError;
// Do database-specific work to update document data
if (!db.add(fn, doc))
return FsTreeWalker::FtwError;
}
return FsTreeWalker::FtwOk;
}

View File

@ -1,14 +1,47 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.5 2005-02-09 13:34:08 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <ctype.h>
#include <string>
using std::string;
#include <list>
using std::list;
#include "mimetype.h"
#include "debuglog.h"
#include "execmd.h"
#include "conftree.h"
/// Guess a mime type from the file contents by running "file -i" on it.
///
/// @param fn path of the file to examine
/// @return the mime type found in the command output, or an empty string
///   if the command failed or its output could not be interpreted.
static string mimetypefromdata(const string &fn)
{
    list<string> args;
    args.push_back("-i");
    args.push_back(fn);
    ExecCmd ex;
    string result;
    string cmd = "file";
    int status = ex.doexec(cmd, args, 0, &result);
    if (status) {
        LOGERR(("mimetypefromdata: doexec: status 0x%x\n", status));
        return "";
    }
    // LOGDEB(("mimetypefromdata: %s [%s]\n", result.c_str(), fn.c_str()));

    // The output is expected to look like "<fn>: <type>[; charset=...]".
    // Remove the echoed file name ourselves before splitting into words:
    // counting words would pick a piece of the name when it holds blanks.
    bool nameremoved = false;
    if (result.length() > fn.length() &&
        result.compare(0, fn.length(), fn) == 0 &&
        result[fn.length()] == ':') {
        result.erase(0, fn.length() + 1);
        nameremoved = true;
    }

    list<string> res;
    ConfTree::stringToStrings(result, res);
    list<string>::iterator it = res.begin();
    if (nameremoved) {
        // The type is now the first word
        if (res.empty())
            return "";
    } else {
        // Unexpected output format: fall back to assuming that the first
        // word is the file name and the second one the type.
        if (res.size() <= 1)
            return "";
        it++;
    }

    string mime = *it;
    // Get rid of a trailing separator (ie: "text/plain;")
    if (mime.length() > 0 && !isalpha(mime[mime.length() - 1]))
        mime.erase(mime.length() - 1);
    return mime;
}
string mimetype(const string &fn, ConfTree *mtypes)
{
@ -38,8 +71,9 @@ string mimetype(const string &fn, ConfTree *mtypes)
// If the file name has a suffix and we find it in the map, we're done
string::size_type dot = fn.find_last_of(".");
string suff;
if (dot != string::npos) {
string suff = fn.substr(dot);
suff = fn.substr(dot);
for (unsigned int i = 0; i < suff.length(); i++)
suff[i] = tolower(suff[i]);
@ -48,7 +82,9 @@ string mimetype(const string &fn, ConfTree *mtypes)
return mtype;
}
// Look at file data ? One day maybe
// Look at file data ? Only when no suffix
if (suff.empty())
return mimetypefromdata(fn);
return "";
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.3 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.4 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <unistd.h>
#include <sys/types.h>
@ -70,30 +70,27 @@ static bool uncompressfile(RclConfig *conf, const string& ifn,
return true;
}
static void tmpcleanup(const string& tdir, const string& tfile)
void FileInterner::tmpcleanup()
{
if (tdir.empty() || tfile.empty())
return;
if (unlink(tfile.c_str()) < 0) {
LOGERR(("tmpcleanup: unlink(%s) errno %d\n", tfile.c_str(),
errno));
LOGERR(("FileInterner::tmpcleanup: unlink(%s) errno %d\n",
tfile.c_str(), errno));
return;
}
}
bool internfile(const std::string &ifn, RclConfig *config, Rcl::Doc& doc,
const string& tdir)
// Handler==0 on return says we're in error
FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
const string& td)
: fn(f), config(cnf), tdir(td), handler(0)
{
string fn = ifn;
string tfile;
MimeHandler *handler = 0;
bool ret = false;
string mime = mimetype(fn, config->getMimeMap());
mime = mimetype(fn, config->getMimeMap());
if (mime.empty()) {
// No mime type: not listed in our map.
LOGDEB(("internfile: (no mime) [%s]\n", fn.c_str()));
return false;
LOGDEB(("FileInterner::FileInterner: (no mime) [%s]\n", fn.c_str()));
return;
}
// First check for a compressed file. If so, create a temporary
@ -101,8 +98,9 @@ bool internfile(const std::string &ifn, RclConfig *config, Rcl::Doc& doc,
// rest with the temp file.
list<string>ucmd;
if (getUncompressor(mime, config->getMimeConf(), ucmd)) {
if (!uncompressfile(config, fn, ucmd, tdir, tfile))
return false;
if (!uncompressfile(config, fn, ucmd, tdir, tfile)) {
return;
}
LOGDEB(("internfile: after ucomp: tdir %s, tfile %s\n",
tdir.c_str(), tfile.c_str()));
fn = tfile;
@ -110,33 +108,43 @@ bool internfile(const std::string &ifn, RclConfig *config, Rcl::Doc& doc,
if (mime.empty()) {
// No mime type ?? pass on.
LOGDEB(("internfile: (no mime) [%s]\n", fn.c_str()));
goto out;
return;
}
}
// Look for appropriate handler
handler = getMimeHandler(mime, config->getMimeConf());
if (!handler) {
// No handler for this type, for now :(
LOGDEB(("internfile: %s : no handler\n", mime.c_str()));
goto out;
LOGDEB(("FileInterner::FileInterner: %s: no handler\n", mime.c_str()));
return;
}
LOGDEB(("internfile: %s [%s]\n", mime.c_str(), fn.c_str()));
LOGDEB(("FileInterner::FileInterner: %s [%s]\n",mime.c_str(), fn.c_str()));
}
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
{
if (!handler)
return FIError;
// Turn file into a document. The document has fields for title, body
// etc., all text converted to utf8
if (!handler->worker(config, fn, mime, doc)) {
goto out;
MimeHandler::Status mhs = handler->worker(config, fn, mime, doc, ipath);
FileInterner::Status ret = FIError;
switch (mhs) {
case MimeHandler::MHError: break;
case MimeHandler::MHDone: ret = FIDone;break;
case MimeHandler::MHAgain: ret = FIAgain;break;
}
doc.mimetype = mime;
// Clean up. We delete the temp file and its father directory
ret = true;
out:
delete handler;
tmpcleanup(tdir, tfile);
doc.mimetype = mime;
return ret;
}
FileInterner::~FileInterner()
{
delete handler;
handler = 0;
tmpcleanup();
}

View File

@ -1,14 +1,31 @@
#ifndef _INTERNFILE_H_INCLUDED_
#define _INTERNFILE_H_INCLUDED_
/* @(#$Id: internfile.h,v 1.2 2005-02-09 12:07:29 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: internfile.h,v 1.3 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include "rclconfig.h"
#include "rcldb.h"
class MimeHandler;
/// Turn external file into internal representation, according to mime type etc
extern bool internfile(const std::string &fn, RclConfig *config,
Rcl::Doc& doc, const string& tdir);
class FileInterner {
string fn;
RclConfig *config;
const string &tdir;
MimeHandler *handler;
string tfile;
string mime;
void tmpcleanup();
public:
FileInterner(const std::string &f, RclConfig *cnf, const string& td);
~FileInterner();
enum Status {FIError, FIDone, FIAgain};
Status internfile(Rcl::Doc& doc, string &ipath);
};
#endif /* _INTERNFILE_H_INCLUDED_ */

View File

@ -38,21 +38,23 @@
using namespace std;
bool MimeHandlerHtml::worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout)
MimeHandler::Status
MimeHandlerHtml::worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&)
{
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
string otext;
if (!file_to_string(fn, otext)) {
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
return false;
return MimeHandler::MHError;
}
return worker1(conf, fn, otext, mtype, docout);
}
bool MimeHandlerHtml::worker1(RclConfig *conf, const string &,
const string& htext,
const string &mtype, Rcl::Doc &docout)
MimeHandler::Status
MimeHandlerHtml::worker1(RclConfig *conf, const string &,
const string& htext,
const string &mtype, Rcl::Doc &docout)
{
// Character set handling:
@ -111,5 +113,5 @@ bool MimeHandlerHtml::worker1(RclConfig *conf, const string &,
out.keywords = pres.keywords;
out.abstract = pres.sample;
docout = out;
return true;
return MimeHandler::MHDone;
}

View File

@ -1,6 +1,6 @@
#ifndef _HTML_H_INCLUDED_
#define _HTML_H_INCLUDED_
/* @(#$Id: mh_html.h,v 1.2 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_html.h,v 1.3 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
#include "mimehandler.h"
// Code to turn an html document into an internal one. There are 2
@ -11,9 +11,9 @@
// carry titles, abstracts, whatever)
class MimeHandlerHtml : public MimeHandler {
public:
virtual bool worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout);
virtual bool worker1(RclConfig *conf, const string &fn,
virtual MimeHandler::Status worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&);
virtual MimeHandler::Status worker1(RclConfig *conf, const string &fn,
const string& htext,
const string &mtype, Rcl::Doc &docout);
};

178
src/internfile/mh_mail.cpp Normal file
View File

@ -0,0 +1,178 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <fcntl.h>
#include <errno.h>
#include <map>
using std::map;
#include "mimehandler.h"
#include "debuglog.h"
#include "csguess.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
#include "indextext.h"
#include "mail.h"
#include "debuglog.h"
#include "smallut.h"
#include "mimeparse.h"
using namespace std;
// We are called for two different file types: mbox-type folders
// holding multiple messages, and maildir-type files with one rfc822
// message
// Mail handler entry point. Remembers the configuration, then dispatches
// on the mime type: a single message (message/rfc822) is handed to
// processone(). Anything else yields MHError, including mbox folders
// (text/x-mail), which are recognized but not processed yet.
MimeHandler::Status
MimeHandlerMail::worker(RclConfig *cnf, const string &fn,
                        const string &mtype, Rcl::Doc &docout, string&)
{
    LOGDEB(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
    conf = cnf;

    const bool singlemessage = stringlowercmp("message/rfc822", mtype) == 0;
    if (singlemessage)
        return processone(fn, docout);

    // text/x-mail and unknown types end up here alike
    return MimeHandler::MHError;
}
#include "mime.h"
const char *hnames[] = {"Subject", "Content-type"};
int nh = sizeof(hnames) / sizeof(char *);
void walkmime(string &out, Binc::MimePart& doc, int fd, int depth);
// Transform a single message into a document. The subject becomes the
// title, and any simple body part with a content-type of text or html
// and content-disposition inline gets concatenated as text.
// Process the single message held in file fn: parse its mime structure and
// accumulate the text extracted by walkmime() into docout.text.
//
// @param fn     name of the file holding one rfc822 message
// @param docout output document, its text field receives the body text
// @return MHDone when the message could be opened and parsed, else MHError
MimeHandler::Status
MimeHandlerMail::processone(const string &fn, Rcl::Doc &docout)
{
    int fd = open(fn.c_str(), O_RDONLY);
    if (fd < 0) {
        LOGERR(("MimeHandlerMail::processone: open(%s) errno %d\n",
                fn.c_str(), errno));
        return MimeHandler::MHError;
    }

    Binc::MimeDocument doc;
    doc.parseFull(fd);
    // Give up only if not even the header could be parsed
    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
        LOGERR(("MimeHandlerMail::processone: parse error for %s\n",
                fn.c_str()));
        close(fd);
        return MimeHandler::MHError;
    }
    LOGDEB(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",
            doc.isMultipart(), doc.getSubType().c_str()));

    // The descriptor must stay open while walking: bodies are read from it
    walkmime(docout.text, doc, fd, 0);
    close(fd);

    LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
    // Success. This used to return MHError, which made the caller throw
    // away every message.
    return MimeHandler::MHDone;
}
// Recursively walk a parsed mime tree and append the text of the
// acceptable leaf parts to the output string.
//
// Multipart nodes: only "mixed" and "alternative" are handled. For mixed,
// every member is processed. For alternative, a text/plain member is
// processed if there is one, else a text/html one. Other subtypes are
// skipped.
// Leaf nodes: the body is used if the content type is text/plain or
// text/html and the disposition is inline. It is decoded according to its
// content transfer encoding, then transcoded to UTF-8.
//
// @param out   accumulated text. Each part is appended after a "\r\n"
// @param doc   the current node in the mime tree
// @param fd    descriptor for the message file, from which bodies are read
// @param depth current recursion depth. Nothing is done beyond depth 5.
void walkmime(string &out, Binc::MimePart& doc, int fd, int depth)
{
    if (depth > 5) {
        LOGINFO(("walkmime: max depth exceeded\n"));
        return;
    }

    if (doc.isMultipart()) {
        LOGDEB(("walkmime: ismultipart %d subtype '%s'\n",
                doc.isMultipart(), doc.getSubType().c_str()));
        std::vector<Binc::MimePart>::iterator it;
        if (!stringicmp("mixed", doc.getSubType())) {
            for (it = doc.members.begin(); it != doc.members.end(); it++) {
                walkmime(out, *it, fd, depth+1);
            }
        } else if (!stringicmp("alternative", doc.getSubType())) {
            std::vector<Binc::MimePart>::iterator ittxt, ithtml;
            ittxt = ithtml = doc.members.end();
            for (it = doc.members.begin(); it != doc.members.end(); it++) {
                // Get and parse the member's own content-type header
                // (not the one from the enclosing multipart)
                Binc::HeaderItem hi;
                if (!it->h.getFirstHeader("Content-Type", hi))
                    continue;
                LOGDEB(("walkmime:content-type: %s\n", hi.getValue().c_str()));
                MimeHeaderValue content_type;
                parseMimeHeaderValue(hi.getValue(), content_type);
                // Remember the first candidate of each kind
                if (ittxt == doc.members.end() &&
                    !stringlowercmp("text/plain", content_type.value)) {
                    ittxt = it;
                } else if (ithtml == doc.members.end() &&
                           !stringlowercmp("text/html", content_type.value)) {
                    ithtml = it;
                }
            }
            // Plain text is preferred, html is the fallback
            if (ittxt != doc.members.end()) {
                walkmime(out, *ittxt, fd, depth+1);
            } else if (ithtml != doc.members.end()) {
                walkmime(out, *ithtml, fd, depth+1);
            }
        }
        return;
    }

    // Leaf part. If content-type is text or html and content-disposition
    // is inline, decode and add to text.

    // Get and parse content-type header. The default is text/plain.
    Binc::HeaderItem hi;
    string ctt = "text/plain";
    if (doc.h.getFirstHeader("Content-Type", hi)) {
        ctt = hi.getValue();
    }
    LOGDEB(("walkmime:content-type: %s\n", ctt.c_str()));
    MimeHeaderValue content_type;
    parseMimeHeaderValue(ctt, content_type);
    if (stringlowercmp("text/plain", content_type.value) &&
        stringlowercmp("text/html", content_type.value)) {
        return;
    }
    string charset = "us-ascii";
    map<string,string>::const_iterator it;
    it = content_type.params.find(string("charset"));
    if (it != content_type.params.end())
        charset = it->second;

    // Content disposition. The default is inline.
    string ctd = "inline";
    if (doc.h.getFirstHeader("Content-Disposition", hi)) {
        ctd = hi.getValue();
    }
    MimeHeaderValue content_disposition;
    parseMimeHeaderValue(ctd, content_disposition);
    if (stringlowercmp("inline", content_disposition.value)) {
        return;
    }

    // Content transfer encoding. The default is 7bit. Parsed like the
    // other headers so that surrounding blanks and comments are dropped.
    string cte = "7bit";
    if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
        cte = hi.getValue();
    }
    MimeHeaderValue transfer_encoding;
    parseMimeHeaderValue(cte, transfer_encoding);

    LOGDEB(("walkmime: final: body start offset %d, length %d\n",
            doc.getBodyStartOffset(), doc.getBodyLength()));
    string body;
    doc.getBody(fd, body, 0, doc.bodylength);

    // Decode content transfer encoding. stringlowercmp() returns 0 for a
    // match. Anything other than quoted-printable or base64 is used as is.
    if (!stringlowercmp("quoted-printable", transfer_encoding.value)) {
        string decoded;
        qp_decode(body, decoded);
        body = decoded;
    } else if (!stringlowercmp("base64", transfer_encoding.value)) {
        string decoded;
        base64_decode(body, decoded);
        body = decoded;
    }

    // Transcode to utf-8. If this fails, use the data as it is.
    string transcoded;
    if (!transcode(body, transcoded, charset, "UTF-8")) {
        LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
                charset.c_str()));
        transcoded = body;
    }

    out += string("\r\n") + transcoded;
}

16
src/internfile/mh_mail.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
#include "mimehandler.h"
// Code to turn a mail folder file into internal documents
// Handler for mail data. worker() is the MimeHandler entry point;
// processone() turns the single message held in a file into a document.
class MimeHandlerMail : public MimeHandler {
// Configuration pointer, null until set from the worker() argument
RclConfig *conf;
// Process the single message in file fn, filling docout
MimeHandler::Status processone(const string &fn, Rcl::Doc &docout);
public:
MimeHandlerMail() : conf(0) {}
// See MimeHandler::worker() for the description of the parameters
virtual MimeHandler::Status
worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string& ipath);
};
#endif /* _MAIL_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.8 2005-02-04 14:21:17 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
@ -13,23 +13,24 @@ using namespace std;
#include "debuglog.h"
#include "smallut.h"
#include "html.h"
#include "mail.h"
#include "execmd.h"
#include "pathut.h"
class MimeHandlerText : public MimeHandler {
public:
bool worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout);
MimeHandler::Status worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&);
};
// Process a plain text file
bool MimeHandlerText::worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout)
MimeHandler::Status MimeHandlerText::worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&)
{
string otext;
if (!file_to_string(fn, otext))
return false;
return MimeHandler::MHError;
// Try to guess charset, then convert to utf-8, and fill document
// fields The charset guesser really doesnt work well in general
@ -46,36 +47,38 @@ bool MimeHandlerText::worker(RclConfig *conf, const string &fn,
cerr << "textPlainToDoc: transcode failed: charset '" << charset
<< "' to UTF-8: "<< utf8 << endl;
otext.erase();
return 0;
return MimeHandler::MHError;
}
Rcl::Doc out;
out.origcharset = charset;
out.text = utf8;
docout = out;
return true;
return MimeHandler::MHDone;
}
class MimeHandlerExec : public MimeHandler {
public:
list<string> params;
virtual ~MimeHandlerExec() {}
virtual bool worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout);
virtual MimeHandler::Status worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout,
string&);
};
// Execute an external program to translate a file from its native format
// to html. Then call the html parser to do the actual indexing
bool MimeHandlerExec::worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout)
MimeHandler::Status
MimeHandlerExec::worker(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&)
{
if (params.empty()) {
// Hu ho
LOGERR(("MimeHandlerExec::worker: empty params for mime %s\n",
mtype.c_str()));
return false;
return MimeHandler::MHError;
}
// Command name
string cmd = find_filter(conf, params.front());
@ -92,7 +95,7 @@ bool MimeHandlerExec::worker(RclConfig *conf, const string &fn,
if (status) {
LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
status, cmd.c_str()));
return false;
return MimeHandler::MHError;
}
// Process/index the html
@ -106,6 +109,10 @@ static MimeHandler *mhfact(const string &mime)
return new MimeHandlerText;
else if (!stringlowercmp("text/html", mime))
return new MimeHandlerHtml;
else if (!stringlowercmp("text/x-mail", mime))
return new MimeHandlerMail;
else if (!stringlowercmp("message/rfc822", mime))
return new MimeHandlerMail;
return 0;
}
@ -117,7 +124,7 @@ MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
// Return handler definition for mime type
string hs;
if (!mhandlers->get(mtype, hs, "index")) {
LOGDEB(("getMimeHandler: no handler for %s\n", mtype.c_str()));
LOGDEB(("getMimeHandler: no handler for '%s'\n", mtype.c_str()));
return 0;
}

View File

@ -1,6 +1,6 @@
#ifndef _MIMEHANDLER_H_INCLUDED_
#define _MIMEHANDLER_H_INCLUDED_
/* @(#$Id: mimehandler.h,v 1.5 2005-02-04 09:39:44 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mimehandler.h,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -10,13 +10,34 @@
/**
* Document interner class. We sometimes have data to pass to an interner
* Document interner class.
*/
class MimeHandler {
public:
virtual ~MimeHandler() {}
virtual bool worker(RclConfig *, const std::string &filename,
const std::string &mimetype, Rcl::Doc& outdoc) = 0;
/**
* Transform external data into internal utf8 document
*
* @param conf the global configuration
* @param filename File from which the data comes from
* @param mimetype its mime type (from the mimemap configuration file)
* @param outdoc The output document
* @param ipath the access path for the document inside the file.
* For mono-document file types, this will always be empty.
* It is used, for example for mbox files which may contain
* multiple emails. If this is not empty in input, then the
* caller is requesting a single document (ie: for display).
* If this is empty (during indexation), it will be filled-up
* by the function, and all the file's documents will be
* returned by successive calls.
* @return the return value indicates if there are more documents to be
* fetched from the same file.
*/
enum Status {MHError, MHDone, MHAgain};
virtual Status worker(RclConfig * conf, const std::string &filename,
const std::string &mimetype, Rcl::Doc& outdoc,
string& ipath) = 0;
};
/**
@ -30,6 +51,11 @@ extern MimeHandler *getMimeHandler(const std::string &mtyp, ConfTree *mhdlers);
*/
extern std::string getMimeViewer(const std::string &mtyp, ConfTree *mhandlers);
/**
* Return command to uncompress the given type. The returned command has
* substitutable places for input file name and temp dir name, and will
* return output name
*/
bool getUncompressor(const std::string &mtype, ConfTree *mhandlers,
std::list<std::string>& cmd);

View File

@ -150,7 +150,8 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
lowercase_term(hequiv);
if (hequiv == "content-type") {
string value = i->second;
MimeHeaderValue p = parseMimeHeaderValue(value);
MimeHeaderValue p;
parseMimeHeaderValue(value, p);
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) !=
p.params.end()) {

View File

@ -8,14 +8,15 @@ all: $(LIBS)
OBJS = conftree.o csguess.o debuglog.o \
execmd.o wipedir.o \
fstreewalk.o html.o htmlparse.o indexer.o internfile.o \
fstreewalk.o html.o mail.o htmlparse.o indexer.o internfile.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
rclconfig.o rcldb.o readfile.o smallut.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
../utils/execmd.cpp ../utils/wipedir.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \
../common/htmlparse.cpp \
../index/indexer.cpp ../common/internfile.cpp \
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
../common/myhtmlparse.cpp ../utils/pathut.cpp \
@ -51,6 +52,8 @@ indexer.o : ../index/indexer.cpp
$(CXX) $(CXXFLAGS) -c $<
internfile.o : ../common/internfile.cpp
$(CXX) $(CXXFLAGS) -c $<
mail.o : ../common/mail.cpp
$(CXX) $(CXXFLAGS) -c $<
mimehandler.o : ../common/mimehandler.cpp
$(CXX) $(CXXFLAGS) -c $<
mimeparse.o : ../utils/mimeparse.cpp

View File

@ -1,5 +1,5 @@
CXXFLAGS = -pthread -Wall -g -I. -I../index -I../utils -I../common \
-I../unac -I/usr/local/include
-I../unac -I../bincimapmime -I/usr/local/include
LIBXAPIAN = -L/usr/local/lib -lxapian
LIBICONV = -L/usr/local/lib -liconv

View File

@ -23,7 +23,7 @@ unix {
UI_DIR = .ui
MOC_DIR = .moc
OBJECTS_DIR = .obj
LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
LIBS += ../lib/librcl.a ../bincimapmime/libmime.a -L/usr/local/lib -lxapian -liconv
INCLUDEPATH += ../common ../index ../query ../unac ../utils
}

View File

@ -197,7 +197,8 @@ void RecollMain::reslistTE_clicked(int par, int car)
// for preview:
string fn = urltolocalpath(doc.url);
Rcl::Doc fdoc;
if (!internfile(fn, rclconfig, fdoc, tmpdir)) {
FileInterner interner(fn, rclconfig, tmpdir);
if (interner.internfile(fdoc, doc.ipath) != FileInterner::FIDone) {
QMessageBox::warning(0, "Recoll",
QString("Can't turn doc into internal rep ") +
doc.mimetype.c_str());

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.12 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.13 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -31,8 +31,9 @@ namespace Rcl {
*/
class Doc {
public:
// This fields potentially go into the document data record
// These fields potentially go into the document data record
string url;
string ipath;
string mimetype;
string mtime; // Modification time as decimal ascii
string origcharset;
@ -41,8 +42,10 @@ class Doc {
string abstract;
string text;
void erase() {
url.erase();
ipath.erase();
mimetype.erase();
mtime.erase();
origcharset.erase();

View File

@ -1,4 +1,4 @@
# @(#$Id: mimeconf,v 1.4 2005-03-17 14:02:05 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: mimeconf,v 1.5 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll : associations of mime types to processing filters.
# There are different sections for decompression, 'interning' for indexing
@ -27,6 +27,9 @@ application/x-bzip2 = uncompress rcluncomp bunzip2 %f %t
[index]
text/plain = internal
text/html = internal
text/x-mail = internal
message/rfc822 = internal
application/pdf = exec rclpdf
application/postscript = exec rclps
application/msword = exec rcldoc
@ -46,7 +49,7 @@ application/vnd.sun.xml.writer.template = exec rclsoff
# External viewers, launched when you double-click a result entry
[view]
text/plain = xemacs %f
text/html = firefox -a firefox -remote "openFile(%u)"
text/html = firefox -remote "openFile(%u)"
application/pdf = xpdf %f
application/postscript = gv %f
application/msword = openoffice-1.1.3-swriter %f

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.2 2005-03-17 14:02:06 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_MIMEPARSE
@ -7,71 +7,345 @@ static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.2 2005-03-17 14:02:06 dockes Ex
#include <string>
#include <ctype.h>
#include <stdio.h>
#include <ctype.h>
#include "mimeparse.h"
using namespace std;
#define WHITE " \t\n"
static void stripw_lc(string &in)
// Parsing a header value. Only content-type has parameters, but
// others are compatible with content-type syntax, only, parameters
// are not used. So we can parse all like content-type:
// headertype: value [; paramname=paramvalue] ...
// Value and paramvalues can be quoted strings, and there can be
// comments in there
// The lexical token returned by find_next_token
// Description of one lexical element, as filled in by find_next_token()
class Lexical {
public:
    enum kind {none, token, separator};

    // Nature of the element, none if nothing was stored
    kind what;
    // Token text (without the quotes), or the separator character
    string value;
    // Accumulated error messages, empty if no problem was met
    string error;
    // Opening quote character for a quoted token, else 0
    char quote;

    Lexical()
        : what(none), value(), error(), quote(0)
    {
    }

    // Back to the freshly constructed state
    void reset()
    {
        what = none;
        value.clear();
        error.clear();
        quote = 0;
    }
};
// Skip mime comment. This must be called with in[start] == '('
int skip_comment(const string &in, unsigned int start, Lexical &lex)
{
// fprintf(stderr, "In: '%s'\n", in.c_str());
string::size_type pos, pos1;
pos = in.find_first_not_of(WHITE);
if (pos == string::npos) {
// All white
in = "";
return;
int commentlevel = 0;
for (; start < in.size(); start++) {
if (in[start] == '\\') {
// Skip escaped char.
if (start+1 < in.size()) {
start++;
continue;
} else {
lex.error.append("\\ at end of string ");
return string::npos;
}
}
if (in[start] == '(')
commentlevel++;
if (in[start] == ')') {
if (--commentlevel == 0)
break;
}
}
in.replace(0, pos, "");
pos1 = in.find_last_not_of(WHITE);
if (pos1 != in.length() -1)
in = in.replace(pos1+1, string::npos, "");
string::iterator i;
for (i = in.begin(); i != in.end(); i++)
*i = tolower(*i);
if (start == in.size()) {
lex.error.append("Unclosed comment ");
return string::npos;
}
return start;
}
MimeHeaderValue parseMimeHeaderValue(const string &ein)
// Skip initial whitespace and (possibly nested) comments.
int skip_whitespace_and_comment(const string &in, unsigned int start,
Lexical &lex)
{
string in = ein;
MimeHeaderValue out;
string::size_type pos;
while (1) {
if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos)
return in.size();
if (in[start] == '(') {
if ((start = skip_comment(in, start, lex)) == string::npos)
return string::npos;
} else {
break;
}
}
return start;
}
pos = in.find_first_not_of(WHITE);
if (pos == string::npos)
return out;
in = in.substr(pos, string::npos);
if ((pos = in.find_first_of(";")) == string::npos) {
out.value = in;
return out;
}
out.value = in.substr(0, pos);
stripw_lc(out.value);
in = in.substr(pos+1, string::npos);
for (;;) {
// Skip whitespace
if ((pos = in.find_first_not_of(WHITE)) == string::npos)
return out;
in = in.substr(pos, string::npos);
/// Find next token in mime header value string.
///
/// Leading whitespace and comments are skipped by
/// skip_whitespace_and_comment(). What follows is then classified as:
///  - a separator: a single character found in @p delims,
///  - a quoted token: text between '"' and '"' or between '<' and '>'
///    (the quotes are removed, backslash escapes are kept as they are),
///  - a bare token: text up to the next delimiter, space, tab or '('.
///
/// @return the next starting position in string, string::npos for error
///    (ie unbalanced quoting). The size of @p in is returned, and @p lex
///    is left untouched, when nothing but whitespace/comments remained.
/// @param in the input string
/// @param start the starting position
/// @param lex the returned token and its description
/// @param delims separators we should look for
int find_next_token(const string &in, unsigned int start,
                    Lexical &lex, string delims = ";=")
{
    // Positions are kept in string::size_type locals: an unsigned int
    // cannot hold string::npos when size_type is wider, which made the
    // "not found" tests below always fail on 64 bits systems.
    string::size_type pos = skip_whitespace_and_comment(in, start, lex);

    // A value beyond the end of the string can only be an error marker
    // (npos), whatever integer width it travelled through.
    if (pos > in.size())
        return static_cast<int>(string::npos);
    if (pos == in.size())
        return static_cast<int>(pos);

    // Begins with separator ? return it.
    const string::size_type delimi = delims.find_first_of(in[pos]);
    if (delimi != string::npos) {
        lex.what = Lexical::separator;
        lex.value = delims[delimi];
        return static_cast<int>(pos + 1);
    }

    // Check for start of quoted string
    const char oquot = in[pos];
    char cquot;
    switch (oquot) {
    case '<': cquot = '>'; break;
    case '"': cquot = '"'; break;
    default:  cquot = 0;   break;
    }

    if (cquot != 0) {
        // Quoted string parsing
        string::size_type end;
        pos++; // Skip quote character
        for (end = pos; end < in.size() && in[end] != cquot; end++) {
            if (in[end] == '\\') {
                // Skip escaped char.
                if (end + 1 < in.size()) {
                    end++;
                } else {
                    // backslash at end of string: error
                    lex.error.append("\\ at end of string ");
                    return static_cast<int>(string::npos);
                }
            }
        }
        if (end == in.size()) {
            // Found end of string before closing quote character: error
            lex.error.append("Unclosed quoted string ");
            return static_cast<int>(string::npos);
        }
        lex.what = Lexical::token;
        lex.value = in.substr(pos, end - pos);
        lex.quote = oquot;
        return static_cast<int>(end + 1);
    }

    // Bare token: runs up to the next delimiter, blank or comment start.
    const string::size_type end = in.find_first_of(delims + " \t(", pos);
    lex.what = Lexical::token;
    lex.quote = 0;
    if (end == string::npos) {
        lex.value = in.substr(pos);
        return static_cast<int>(in.size());
    }
    lex.value = in.substr(pos, end - pos);
    return static_cast<int>(end);
}
/// Append a lower-cased copy of @p in to @p out.
///
/// Existing content of @p out is kept: the converted characters are added
/// at its end.
/// @param out string receiving the converted characters
/// @param in  the text to convert
void stringtolower(string &out, const string& in)
{
    out.reserve(out.size() + in.size());
    for (string::size_type i = 0; i < in.size(); i++) {
        // tolower() is only defined for EOF and unsigned char values:
        // a plain (possibly negative) char must be converted first.
        const unsigned char uc = static_cast<unsigned char>(in[i]);
        out.append(1, char(tolower(uc)));
    }
}
/// Parse a MIME header value like: text/html; charset=utf-8; name="a b"
///
/// The first token becomes parsed.value. Each following name=value pair is
/// stored in parsed.params, the name being converted to lower case and the
/// value kept as returned by the lexer. Extra ';' separators are ignored.
///
/// @param value  the header value text
/// @param parsed output, cleared on entry. It may hold partial results
///     when false is returned.
/// @return true if the end of the input was reached without a syntax
///     error, false else.
bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
{
    parsed.value.erase();
    parsed.params.clear();

    Lexical lex;

    // find_next_token() returns an int, so that its string::npos error
    // marker does not survive a comparison with string::npos once stored
    // in a narrower unsigned variable. Any position beyond the end of the
    // input is an error, and this test works for all integer widths.
    string::size_type start = find_next_token(value, 0, lex);
    if (start > value.size() || lex.what != Lexical::token)
        return false;
    parsed.value = lex.value;

    for (;;) {
        string paramname;
        lex.reset();

        start = find_next_token(value, start, lex);
        if (start == value.size())
            return true;
        if (start > value.size())
            return false;
        if (lex.what == Lexical::separator && lex.value[0] == ';')
            continue;
        if (lex.what != Lexical::token)
            return false;
        stringtolower(paramname, lex.value);

        // Parameter name must be followed by '='
        start = find_next_token(value, start, lex);
        if (start > value.size() || lex.what != Lexical::separator ||
            lex.value[0] != '=')
            return false;

        // And then by the parameter value
        start = find_next_token(value, start, lex);
        if (start > value.size() || lex.what != Lexical::token)
            return false;
        parsed.params[paramname] = lex.value;
    }
}
// Numeric value of an hexadecimal digit, -1 if c is not one.
static int qp_hexval(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

// Decode quoted-printable text from 'in', appending the result to 'out'.
// "=XY" (two hex digits, any case) yields one byte. '=' followed by CRLF,
// or by a lone CR or LF, is a soft line break and yields nothing. Decoding
// stops quietly if an '=' has less than 2 characters after it.
// Returns false (keeping what was decoded so far in 'out') if an '=' is
// followed by something which is neither a line break nor 2 hex digits.
bool qp_decode(const string& in, string &out)
{
    const string::size_type len = in.length();
    out.reserve(len);

    unsigned int pos = 0;
    while (pos < len) {
        const char cur = in[pos++];
        if (cur != '=') {
            // Ordinary character: copy
            out += cur;
            continue;
        }

        // pos is now the index of the character following the '='.
        if (pos >= len - 1) // Need at least 2 more chars
            break;

        const char first = in[pos];
        if (first == '\r' || first == '\n') {
            // Soft nl, skip it: 2 characters for CRLF, else 1
            pos += (first == '\r' && in[pos + 1] == '\n') ? 2 : 1;
            continue;
        }

        const int high = qp_hexval(first);
        if (high < 0)
            return false;
        const int low = qp_hexval(in[pos + 1]);
        if (low < 0)
            return false;
        out += char(high * 16 + low);
        pos += 2;
    }
    return true;
}
// This is adapted from FreeBSD's code.
static const char Base64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char Pad64 = '=';

/// Decode base64 text.
///
/// Whitespace is ignored anywhere in the input. The data must be made of
/// complete 4 characters groups, the last one possibly ending with one or
/// two '=' pad characters, and nothing but whitespace may follow the
/// padding.
///
/// @param in  the base64 text
/// @param out receives the decoded bytes. Previous content is discarded.
///     When false is returned, it holds the bytes decoded before the error
///     was detected.
/// @return false for a character outside the base64 alphabet, for missing,
///     misplaced or excess padding, for data following the padding, or if
///     the unused bits of the last character are not zero. true else (an
///     empty input decodes to an empty output).
bool base64_decode(const string& in, string& out)
{
    out.erase();
    out.reserve(in.length());

    // Bits of the byte being assembled. Decoded bytes are appended to out
    // only when complete: reserve() does not allow indexing past size().
    unsigned int pending = 0;
    // Position inside the current 4 characters group
    int state = 0;
    bool padseen = false;
    string::size_type ii;

    for (ii = 0; ii < in.length(); ii++) {
        const unsigned char ch = static_cast<unsigned char>(in[ii]);
        if (isspace(ch)) /* Skip whitespace anywhere. */
            continue;
        if (ch == Pad64) {
            padseen = true;
            break;
        }
        // strchr() would find the alphabet terminator for a NUL byte
        const char *pos = (ch != 0) ? strchr(Base64, ch) : 0;
        if (pos == 0) /* A non-base64 character. */
            return false;
        const unsigned int value = static_cast<unsigned int>(pos - Base64);

        switch (state) {
        case 0:
            pending = value << 2;
            state = 1;
            break;
        case 1:
            out += char(pending | (value >> 4));
            pending = (value & 0x0f) << 4;
            state = 2;
            break;
        case 2:
            out += char(pending | (value >> 2));
            pending = (value & 0x03) << 6;
            state = 3;
            break;
        default:
            out += char(pending | value);
            pending = 0;
            state = 0;
            break;
        }
    }

    /*
     * We are done decoding Base-64 chars. Let's see if we ended
     * on a byte boundary, and/or with erroneous trailing characters.
     */
    if (!padseen) {
        /* We ended by seeing the end of the string: there must be no
         * partial group lying around. */
        return state == 0;
    }

    /* Invalid = in first or second position of a group */
    if (state < 2)
        return false;

    /* 2 data chars (one byte of info) need "==", 3 data chars (two bytes
     * of info) need "=". ii is still the index of the first pad. */
    const int padsneeded = (state == 2) ? 2 : 1;
    int pads = 0;
    for (; ii < in.length(); ii++) {
        const unsigned char ch = static_cast<unsigned char>(in[ii]);
        if (isspace(ch))
            continue;
        /* Anything but whitespace and the expected padding is an error */
        if (ch != Pad64 || pads == padsneeded)
            return false;
        pads++;
    }
    if (pads != padsneeded)
        return false;

    /*
     * Make sure that the "extra" bits that slopped past the last full
     * byte were zeros. If we don't check them, they become a subliminal
     * channel.
     */
    return pending == 0;
}
#else
@ -82,19 +356,47 @@ using namespace std;
// Test driver. The closing "#endif // TEST_MIMEPARSE" which follows suggests
// that it is only built for the TEST_MIMEPARSE test program (the matching
// #if is not visible from here, to be confirmed).
// Exactly one of the three sections below is compiled, chosen by editing
// the #if 0 / #elif 0 / #else conditions: as written, the last one (base64)
// is the active one. argc and argv are not used.
int
main(int argc, const char **argv)
{
#if 0
// Disabled section: parse a sample header value and print the main value
// followed by each parameter.
// NOTE(review): 'tr' is declared twice and 'parsed' is also assigned from a
// one-argument call in this section: it needs cleaning up before it can be
// enabled.
// const char *tr = "text/html; charset=utf-8; otherparam=garb";
const char *tr = "text/html;charset = UTF-8 ; otherparam=garb; \n"
"QUOTEDPARAM=\"quoted value\"";
MimeHeaderValue parsed;
// const char *tr = "text/html; charset=utf-8; otherparam=garb";
const char *tr = "text/html;charset = UTF-8 ; otherparam=garb;";
parsed = parseMimeHeaderValue(tr);
if (!parseMimeHeaderValue(tr, parsed)) {
fprintf(stderr, "PARSE ERROR\n");
}
printf("'%s' \n", parsed.value.c_str());
map<string, string>::iterator it;
for (it = parsed.params.begin();it != parsed.params.end();it++) {
printf(" '%s' = '%s'\n", it->first.c_str(), it->second.c_str());
}
#elif 0
// Disabled section: quoted-printable decoding. The sample has soft line
// breaks ("=\r\n" and "=\n") and ends with an invalid "=xx" sequence, so
// that qp_decode() is expected to report an error after the partial output.
const char *qp = "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
"agrave is: '=E0' probable skipped decode error: =\n"
"Actual decode error =xx this wont show";
string out;
if (!qp_decode(string(qp), out)) {
fprintf(stderr, "qp_decode returned error\n");
}
printf("Decoded: '%s'\n", out.c_str());
#else
// Active section: base64 decoding of a 2 lines sample, each line ending
// with CRLF, the last one with "==" padding.
// The next 4 comment lines are the text which the sample should decode to
// (8 bit French text, quoted verbatim: not to be translated).
//'C'est à boire qu'il nous faut éviter l'excès.'
//'Deuxième ligne'
//'Troisième ligne'
//'Et la fin (pas de nl). '
const char *b64 =
"Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
"IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";
string out;
// A decoding error is only reported on stderr: whatever is in 'out' gets
// printed anyway.
if (!base64_decode(string(b64), out)) {
fprintf(stderr, "base64_decode returned error\n");
}
printf("Decoded: '%s'\n", out.c_str());
#endif
// No explicit return: main() then returns 0.
}
#endif // TEST_MIMEPARSE

View File

@ -1,6 +1,6 @@
#ifndef _MIME_H_INCLUDED_
#define _MIME_H_INCLUDED_
/* @(#$Id: mimeparse.h,v 1.1 2005-01-26 11:45:55 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mimeparse.h,v 1.2 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <map>
@ -11,7 +11,9 @@ class MimeHeaderValue {
std::string value;
std::map<std::string, std::string> params;
};
// NOTE(review): form returning the parsed value. No complete definition of
// it is visible in this part of the sources; it is presumably superseded
// by the bool-returning form below - confirm before using it.
extern MimeHeaderValue parseMimeHeaderValue(const std::string &in);
// Parse the header value text 'in' (main value followed by name=value
// parameters) into 'psd'. Returns false if the text could not be parsed.
extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd);
// Decode quoted-printable text from 'in', appending the decoded data to
// 'out'. Returns false if an '=' is followed by a non-hexadecimal character.
bool qp_decode(const std::string& in, std::string &out);
// Decode base64 text from 'in' into 'out'. Returns false if the input is
// not valid base64.
bool base64_decode(const std::string& in, std::string &out);
#endif /* _MIME_H_INCLUDED_ */