limit path term length through hashing

2005-11-06 11:16:53 +00:00 · 2005-11-06 11:16:53 +00:00 · 48948bc92f
commit 48948bc92f
parent d39a9f12de
9 changed files with 751 additions and 164 deletions
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -6,22 +6,24 @@ UNACCFLAGS = -g -I. -I../unac -I/usr/local/include -DUNAC_VERSION=\"1.0.7\"
 LIBS = librcl.a
 all: $(LIBS)
-OBJS = conftree.o csguess.o debuglog.o \
+OBJS = base64.o conftree.o csguess.o debuglog.o \
     execmd.o wipedir.o \
-     fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o internfile.o \
+     fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o \
-     mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
+     internfile.o md5.o \
     mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
     rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
     textsplit.o transcode.o \
     unacpp.o unac.o
 SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
-     ../utils/execmd.cpp ../utils/idfile.cpp ../utils/wipedir.cpp \
+     ../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
     ../utils/wipedir.cpp \
     ../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \
     ../common/htmlparse.cpp \
     ../index/indexer.cpp ../common/internfile.cpp \
     ../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
-     ../common/myhtmlparse.cpp ../utils/pathut.cpp \
+     ../common/myhtmlparse.cpp ../common/pathhash.cpp ../utils/pathut.cpp \
     ../common/rclconfig.cpp ../common/rcldb.cpp ../common/rclinit.cpp \
-     ../utils/readfile.cpp ../utils/smallut.cpp \
+     ../utils/base64.cpp ../utils/readfile.cpp ../utils/smallut.cpp \
     ../common/textsplit.cpp ../utils/transcode.cpp \
     ../common/unacpp.cpp ../unac/unac.c
@ -67,6 +69,8 @@ myhtmlparse.o : ../common/myhtmlparse.cpp
 	$(CXX) $(CXXFLAGS) -c $<
 pathut.o : ../utils/pathut.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 pathhash.o : ../common/pathhash.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 rclconfig.o : ../common/rclconfig.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 rclinit.o : ../common/rclinit.cpp 
@ -75,12 +79,16 @@ rcldb.o : ../common/rcldb.cpp
 	$(CXX) $(CXXFLAGS) -c $<
 readfile.o : ../utils/readfile.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 base64.o : ../utils/base64.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 smallut.o : ../utils/smallut.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 textsplit.o : ../common/textsplit.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 transcode.o : ../utils/transcode.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 md5.o : ../utils/md5.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 unacpp.o : ../common/unacpp.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
--- a/src/rcldb/pathhash.cpp
+++ b/src/rcldb/pathhash.cpp
@ -0,0 +1,81 @@
 #ifndef lint
 static char rcsid[] = "@(#$Id: pathhash.cpp,v 1.1 2005-11-06 11:16:52 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include "pathhash.h"
 #include "md5.h"
 #include "base64.h"
 using std::string;
 #ifdef PATHHASH_HEX
 // Debug helper: render a 16-byte MD5 digest as 32 lowercase hexadecimal
 // characters. Any previous content of 'out' is discarded.
 static void md5hexprint(const unsigned char hash[16], string &out)
 {
     // Maps a 4-bit value to its hexadecimal digit
     static const char digits[] = "0123456789abcdef";
     out.erase();
     out.reserve(33);
     for (const unsigned char *bp = hash; bp != hash + 16; ++bp) {
         // High nibble first, then low nibble
         out += digits[*bp >> 4];
         out += digits[*bp & 0x0f];
     }
 }
 #endif
 // Size of the hashed result (base64 of 16 bytes of md5, minus 2 pad chars)
 #define HASHLEN 22
 // Convert longish paths by truncating and appending hash of path
 // The full length of the base64-encoded (minus pad) of the md5 is 22 chars
 // We append this to the truncated path
 void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
 {
    if (maxlen < HASHLEN) {
 	fprintf(stderr, "pathHash: internal error: requested len too small\n");
 	abort();
    }
    if (path.length() <= maxlen) {
 	phash = path;
 	return;
    }
    // Compute the md5
    unsigned char chash[16];
    MD5_CTX ctx;
    MD5Init(&ctx);
    MD5Update(&ctx, (const unsigned char *)(path.c_str()+maxlen-HASHLEN), 
 	      path.length() - (maxlen - HASHLEN));
    MD5Final(chash, &ctx);
 #if 0
    string hex;
    md5hexprint(chash, hex);
    printf("hex  [%s]\n", hex.c_str());
 #endif
    // Encode it to ascii. This shouldn't be strictly necessary as
    // xapian terms can be binary
    string hash;
    base64_encode(string((char *)chash, 16), hash);
    // We happen to know there will be 2 pad chars in there, that we
    // don't need as this won't ever be decoded. Resulting length is 22
    hash.resize(hash.length() - 2);
    // Truncate path and append hash
    phash = path.substr(0, maxlen - HASHLEN) + hash;
 }
 #ifdef TEST_PATHHASH
 #include <stdio.h>
 // Test driver: hash the path given as first argument (or a built-in
 // sample) and print the result. The sample path is longer than the length
 // limit used here, so that the truncate-and-hash branch is exercised.
 int main(int argc, char **argv)
 {
     // pathHash() has no default for its length parameter: pass one
     const unsigned int maxlen = 30;
     string path = (argc > 1) ? argv[1] :
         "/usr/lib/some/longish/directory/name/toto.cpp";
     string hash;
     pathHash(path, hash, maxlen);
     printf("hash [%s]\n", hash.c_str());
     return 0;
 }
 #endif // TEST_PATHHASH
--- a/src/rcldb/pathhash.h
+++ b/src/rcldb/pathhash.h
@ -0,0 +1,11 @@
 #ifndef _PATHHASH_H_INCLUDED_
 #define _PATHHASH_H_INCLUDED_
 /* @(#$Id: pathhash.h,v 1.1 2005-11-06 11:16:52 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 /**
  * Limit the length of a path by replacing its tail with a hash.
  *
  * @param path the input path.
  * @param hash output value. A copy of path if path is at most len
  *   characters long. Else the first len-22 characters of path, followed
  *   by 22 characters which are the unpadded base64 encoding of the md5 of
  *   the rest of the path.
  * @param len maximum length for the output. The implementation prints an
  *   error and aborts if this is less than 22.
  */
 extern void pathHash(const std::string &path, std::string &hash, 
 		     unsigned int len);
 #endif /* _PATHHASH_H_INCLUDED_ */
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.33 2005-11-05 15:29:12 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.34 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -19,6 +19,7 @@ using namespace std;
 #include "debuglog.h"
 #include "pathut.h"
 #include "smallut.h"
 #include "pathhash.h"
 #include "xapian.h"
 #include <xapian/stem.h>
@ -268,6 +269,16 @@ truncate_to_word(string & input, string::size_type maxlen)
    return output;
 }
 // Truncate longer path and uniquize with hash. The goal for this is
 // to avoid xapian max term length limitations, not to gain space (we
 // gain very little even with very short maxlens like 30)
 #define HASHPATH
 #define PATHHASHLEN 150
 // Add document in internal form to the database: index the terms in
 // the title abstract and body and add special terms for file name,
 // date, mime type ... , create the document data record (more
 // metadata), and update database
 bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 {
    LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
@ -275,6 +286,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 	return false;
    Native *ndb = (Native *)pdata;
    // Truncate abstract, title and keywords to reasonable lengths
    Rcl::Doc doc = idoc;
    if (doc.abstract.empty()) 
 	doc.abstract = truncate_to_word(doc.text, 100);
@ -289,6 +301,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    TextSplit splitter(&splitData);
    ///////// Split and index terms in document body and auxiliary fields
    // Split title and index terms
    string noacc;
    if (!dumb_string(doc.title, noacc)) {
 	LOGERR(("Rcl::Db::add: unac failed\n"));
@ -296,6 +311,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    }
    splitter.text_to_words(noacc);
    // Split body and index terms
    splitData.basepos += splitData.curpos + 100;
    if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
@ -303,6 +319,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    }
    splitter.text_to_words(noacc);
    // Split keywords and index terms
    splitData.basepos += splitData.curpos + 100;
    if (!dumb_string(doc.keywords, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
@ -310,6 +327,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    }
    splitter.text_to_words(noacc);
    // Split abstract and index terms
    splitData.basepos += splitData.curpos + 100;
    if (!dumb_string(doc.abstract, noacc)) {
 	LOGERR(("Rcl::Db::add: dumb_string failed\n"));
@ -317,19 +335,44 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    }
    splitter.text_to_words(noacc);
    ////// Special terms for metadata
    // Mime type
    newdocument.add_term("T" + doc.mimetype);
-    string pathterm  = "P" + fn;
+    // Path name
    string hash;
 #ifdef HASHPATH
    pathHash(fn, hash, PATHHASHLEN);
 #else
    hash = fn;
 #endif
    LOGDEB(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
    string pathterm  = "P" + hash;
    newdocument.add_term(pathterm);
    // File path + internal path: document unique identifier for
    // documents inside multidocument files.
    string uniterm;
    if (!doc.ipath.empty()) {
-	uniterm  = "Q" + fn + "|" + doc.ipath;
+	uniterm  = "Q" + hash + "|" + doc.ipath;
 	newdocument.add_term(uniterm);
    }
-
+    // Dates etc...
-    const char *fnc = fn.c_str();
+    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
 			doc.dmtime.c_str());
    struct tm *tm = localtime(&mtime);
    char buf[9];
    sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
    newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD)
    buf[7] = '\0';
    if (buf[6] == '3') buf[6] = '2';
    newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval
    buf[6] = '\0';
    newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
    buf[4] = '\0';
    newdocument.add_term("Y" + string(buf)); // Year (YYYY)
    // Document data record. omindex has the following nl separated fields:
    // - url
@ -348,27 +391,14 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    if (!doc.ipath.empty()) {
 	record += "\nipath=" + doc.ipath;
    }
    record += "\n";
    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
    newdocument.set_data(record);
-    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
+    const char *fnc = fn.c_str();
 			doc.dmtime.c_str());
    struct tm *tm = localtime(&mtime);
    char buf[9];
    sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
    newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD)
    buf[7] = '\0';
    if (buf[6] == '3') buf[6] = '2';
    newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval
    buf[6] = '\0';
    newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
    buf[4] = '\0';
    newdocument.add_term("Y" + string(buf)); // Year (YYYY)
-    // If this document has already been indexed, update the existing
+    // Add db entry or update existing entry:
    // entry.
    try {
 	Xapian::docid did = 
 	    ndb->wdb.replace_document(uniterm.empty() ? pathterm : uniterm, 
@ -397,14 +427,23 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
    Native *ndb = (Native *)pdata;
    // If no document exist with this path, we do need update
-    string pathterm  = "P" + filename;
+    string hash;
 #ifdef HASHPATH
    pathHash(filename, hash, PATHHASHLEN);
 #else
    hash = filename;
 #endif
    string pathterm  = "P" + hash;
    if (!ndb->wdb.term_exists(pathterm)) {
 	return true;
    }
    // Look for all documents with this path. We need to look at all
-    // to set their existence flag.
+    // to set their existence flag.  We check the update time on the
-    // We check the update time on the spe
+    // fmtime field which will be identical for all docs inside a
    // multi-document file (we currently always reindex all if the
    // file changed)
    Xapian::PostingIterator doc;
    try {
 	Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
--- a/src/utils/base64.cpp
+++ b/src/utils/base64.cpp
@ -0,0 +1,209 @@
 #ifndef lint
 static char rcsid[] = "@(#$Id: base64.cpp,v 1.1 2005-11-06 11:16:53 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 #include <ctype.h>
 #include <stdio.h>
 #include <string.h>
 #include <sys/types.h>
 #include <string>
 using std::string;
 //#define DEBUG_BASE64 
 #ifdef DEBUG_BASE64
 #define DPRINT(X) fprintf X
 #else
 #define DPRINT(X)
 #endif
 // This is adapted from FreeBSD's code.
 // The base64 alphabet: the character at index i encodes the 6-bit value i
 static const char Base64[] =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 // Character used to fill up the last 4-character group of an encoded text
 static const char Pad64 = '=';
 /**
  * Decode base64 text.
  *
  * White space is skipped anywhere in the input. Decoding stops at the
  * first pad character ('='). What follows the first pad character is not
  * validated (a missing second pad, trailing garbage or non-zero leftover
  * bits are tolerated, because there are bad encoders out there).
  *
  * @param in the base64 text.
  * @param out receives the decoded bytes. Previous content is discarded.
  * @return false if the input holds a character which is neither white
  *   space, nor padding, nor part of the base64 alphabet, if padding
  *   appears after 0 or 1 characters of a group, or if unpadded input ends
  *   in the middle of a group. The content of out is unspecified in this
  *   case. true otherwise.
  */
 bool base64_decode(const std::string& in, std::string& out)
 {
     int io = 0;     // Index in 'out' of the byte currently being assembled
     int state = 0;  // Position (0-3) inside the current 4-character group
     // Current input character. Initialized because it is examined after
     // the loop, which does not execute at all for an empty input.
     int ch = 0;
     std::string::size_type ii = 0;
     // The code below indexes 'out' from 0: it has to start out empty
     out.erase();
     out.reserve(in.length());
     for (ii = 0; ii < in.length(); ii++) {
         ch = in[ii];
         if (isspace((unsigned char)ch))        /* Skip whitespace anywhere. */
             continue;
         if (ch == Pad64)
             break;
         // strchr() would match the terminating NUL of the alphabet:
         // filter out NUL bytes before the lookup.
         const char *pos = (ch != 0) ? strchr(Base64, ch) : 0;
         if (pos == 0) {
             /* A non-base64 character. */
             DPRINT((stderr, "base64_dec: non-base64 char at pos %d\n",
                     (int)ii));
             return false;
         }
         // 6 bits of data. Each output byte is made of the low bits of one
         // character and the high bits of the next. The byte under
         // construction is always the last one of 'out', at index io.
         const int value = int(pos - Base64);
         switch (state) {
         case 0:
             out += char(value << 2);
             state = 1;
             break;
         case 1:
             out[io] |= char(value >> 4);
             out += char((value & 0x0f) << 4);
             io++;
             state = 2;
             break;
         case 2:
             out[io] |= char(value >> 2);
             out += char((value & 0x03) << 6);
             io++;
             state = 3;
             break;
         case 3:
             out[io] |= char(value);
             io++;
             state = 0;
             break;
         default:
             DPRINT((stderr, "base64_dec: internal!bad state!\n"));
             return false;
         }
     }
     /*
      * We are done decoding Base-64 chars.  Let's see if we ended
      * on a byte boundary, and/or with erroneous trailing characters.
      */
     if (ch != Pad64) {
         /*
          * We ended by seeing the end of the string.  Make sure we
          * have no partial bytes lying around.
          */
         if (state != 0) {
             DPRINT((stderr, "base64_dec: bad final state\n"));
             return false;
         }
         DPRINT((stderr, "base64_dec: ret ok, io %d len %d\n",
                 io, (int)out.length()));
         return true;
     }
     /* We got a pad char, at index ii. It is only valid as third or
      * fourth character of a group. */
     if (state < 2) {
         DPRINT((stderr, "base64_dec: pad char in state 0/1\n"));
         return false;
     }
 #ifdef DEBUG_BASE64
     {
         // Diagnostics only: none of the following conditions is an error
         std::string::size_type tail = ii + 1;
         if (state == 2) {
             /* One byte of info: there should be a second pad char,
              * possibly after some white space. */
             while (tail < in.length() && isspace((unsigned char)in[tail]))
                 tail++;
             if (tail < in.length() && in[tail] == Pad64) {
                 tail++;
             } else {
                 DPRINT((stderr, "base64_dec: missing pad char!\n"));
             }
         }
         /* Is there anything but whitespace after the padding ? */
         for (; tail < in.length(); tail++) {
             if (!isspace((unsigned char)in[tail])) {
                 DPRINT((stderr, "base64_dec: non-white at eod: 0x%x\n",
                         (unsigned int)(unsigned char)in[tail]));
             }
         }
         /* The bits that slopped past the last full byte should be 0 */
         if (out[io] != 0) {
             DPRINT((stderr, "base64_dec: bad extra bits!\n"));
         }
     }
 #endif
     // In states 2 and 3, the last byte of 'out' only holds leftover bits
     // from an incomplete group: it is not part of the data. Drop it.
     out.resize(io);
     DPRINT((stderr, "base64_dec: ret ok, io %d len %d\n",
             io, (int)out.length()));
     return true;
 }
 #undef Assert
 #define Assert(X)
 /**
  * Encode binary data to base64.
  *
  * The output is a single run of characters (no line breaks), padded with
  * '=' to a multiple of 4 characters.
  *
  * @param in the data to encode. May contain any byte value.
  * @param out the encoded text is appended to this string: existing
  *   content is preserved.
  */
 void base64_encode(const string &in, string &out)
 {
     const string::size_type srclength = in.length();
     string::size_type sidx = 0;
     unsigned char input[3];
     // 4 output characters for each started group of 3 input bytes
     out.reserve(out.length() + 4 * ((srclength + 2) / 3));
     // Complete 3-byte groups: 24 bits become four 6-bit values
     while (srclength - sidx > 2) {
         input[0] = in[sidx++];
         input[1] = in[sidx++];
         input[2] = in[sidx++];
         out += Base64[input[0] >> 2];
         out += Base64[((input[0] & 0x03) << 4) + (input[1] >> 4)];
         out += Base64[((input[1] & 0x0f) << 2) + (input[2] >> 6)];
         out += Base64[input[2] & 0x3f];
     }
     /* Now we worry about padding. */
     const string::size_type remaining = srclength - sidx;
     if (remaining != 0) {
         /* Get what's left (1 or 2 bytes), zero-filled to a full group */
         input[0] = input[1] = input[2] = '\0';
         for (string::size_type i = 0; i < remaining; i++)
             input[i] = in[sidx++];
         out += Base64[input[0] >> 2];
         out += Base64[((input[0] & 0x03) << 4) + (input[1] >> 4)];
         // One remaining byte: 2 data chars + 2 pads. Two remaining
         // bytes: 3 data chars + 1 pad.
         if (remaining == 1)
             out += Pad64;
         else
             out += Base64[((input[1] & 0x0f) << 2) + (input[2] >> 6)];
         out += Pad64;
     }
 }
 #ifdef TEST_BASE64
 #include <stdio.h>
 // Test driver: encode a fixed sample string and print input and output
 int main(int argc, char **argv)
 {
     // The command line is not used
     (void)argc;
     (void)argv;
     string in = "12345";
     string out;
     base64_encode(in, out);
     printf("in %s out %s\n", in.c_str(), out.c_str());
     return 0;
 }
 #endif
--- a/src/utils/base64.h
+++ b/src/utils/base64.h
@ -0,0 +1,9 @@
 #ifndef _BASE64_H_INCLUDED_
 #define _BASE64_H_INCLUDED_
 /* @(#$Id: base64.h,v 1.1 2005-11-06 11:16:53 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 /**
  * Encode the bytes of 'in' as base64 text. The result, padded with '='
  * to a multiple of 4 characters, is appended to 'out'.
  */
 void base64_encode(const std::string &in, std::string &out);
 /**
  * Decode the base64 text in 'in' into 'out' ('out' should be empty on
  * entry). White space in the input is skipped.
  * Returns false if the input contains a character which is not part of
  * the base64 alphabet, or if it ends in the middle of a group without
  * padding.
  */
 bool base64_decode(const std::string& in, std::string& out);
 #endif /* _BASE64_H_INCLUDED_ */
--- a/src/utils/md5.cpp
+++ b/src/utils/md5.cpp
@ -0,0 +1,318 @@
 /*
 * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
 *
 * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
 * rights reserved.
 *
 * License to copy and use this software is granted provided that it
 * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
 * Algorithm" in all material mentioning or referencing this software
 * or this function.
 *
 * License is also granted to make and use derivative works provided
 * that such works are identified as "derived from the RSA Data
 * Security, Inc. MD5 Message-Digest Algorithm" in all material
 * mentioning or referencing the derived work.
 *
 * RSA Data Security, Inc. makes no representations concerning either
 * the merchantability of this software or the suitability of this
 * software for any particular purpose. It is provided "as is"
 * without express or implied warranty of any kind.
 *
 * These notices must be retained in any copies of any part of this
 * documentation and/or software.
 *
 * $FreeBSD: src/lib/libmd/md5c.c,v 1.11 1999/12/29 05:04:20 peter Exp $
 *
 * This code is the same as the code published by RSA Inc.  It has been
 * edited for clarity and style only.
 */
 #include <string.h>
 #include "md5.h"
 typedef unsigned int md5uint32;
 static void MD5Transform(md5uint32 [4], const unsigned char [64]);
 #ifdef i386
 #define Encode memcpy
 #define Decode memcpy
 #else /* i386 */
 /*
 * Encodes input (md5uint32) into output (unsigned char). Assumes len is
 * a multiple of 4.
 */
 static void
 Encode (unsigned char *output, md5uint32 *input, unsigned int len)
 {
 	unsigned int i, j;
 	for (i = 0, j = 0; j < len; i++, j += 4) {
 		output[j] = (unsigned char)(input[i] & 0xff);
 		output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
 		output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
 		output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
 	}
 }
 /*
 * Decodes input (unsigned char) into output (md5uint32). Assumes len is
 * a multiple of 4.
 */
 static void
 Decode (md5uint32 *output, const unsigned char *input, unsigned int len)
 {
 	/* Each group of 4 bytes, least significant first, makes one word */
 	md5uint32 *word = output;
 	for (unsigned int off = 0; off < len; off += 4, word++) {
 		const unsigned char *b = input + off;
 		*word = (md5uint32)b[0]
 		    | ((md5uint32)b[1] << 8)
 		    | ((md5uint32)b[2] << 16)
 		    | ((md5uint32)b[3] << 24);
 	}
 }
 #endif /* i386 */
 static unsigned char PADDING[64] = {
  0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 /* F, G, H and I are basic MD5 functions. */
 #define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
 #define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
 #define H(x, y, z) ((x) ^ (y) ^ (z))
 #define I(x, y, z) ((y) ^ ((x) | (~z)))
 /* ROTATE_LEFT rotates x left n bits. */
 #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
 /*
 * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
 * Rotation is separate from addition to prevent recomputation.
 */
 #define FF(a, b, c, d, x, s, ac) { \
 	(a) += F ((b), (c), (d)) + (x) + (md5uint32)(ac); \
 	(a) = ROTATE_LEFT ((a), (s)); \
 	(a) += (b); \
 	}
 #define GG(a, b, c, d, x, s, ac) { \
 	(a) += G ((b), (c), (d)) + (x) + (md5uint32)(ac); \
 	(a) = ROTATE_LEFT ((a), (s)); \
 	(a) += (b); \
 	}
 #define HH(a, b, c, d, x, s, ac) { \
 	(a) += H ((b), (c), (d)) + (x) + (md5uint32)(ac); \
 	(a) = ROTATE_LEFT ((a), (s)); \
 	(a) += (b); \
 	}
 #define II(a, b, c, d, x, s, ac) { \
 	(a) += I ((b), (c), (d)) + (x) + (md5uint32)(ac); \
 	(a) = ROTATE_LEFT ((a), (s)); \
 	(a) += (b); \
 	}
 /* MD5 initialization. Begins an MD5 operation, writing a new context. */
 void
 MD5Init (MD5_CTX *context)
 {
 	/* Magic initialization constants for the state words A, B, C, D */
 	static const unsigned int initstate[4] = {
 		0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
 	};
 	for (int k = 0; k < 4; k++)
 		context->state[k] = initstate[k];
 	/* No input seen yet: the 64-bit bit count starts at 0 */
 	context->count[0] = 0;
 	context->count[1] = 0;
 }
 /* 
 * MD5 block update operation. Continues an MD5 message-digest
 * operation, processing another message block, and updating the
 * context.
 */
 /*
  * Parameters:
  *   context:  the digest state, as set up by MD5Init() and updated by
  *             previous calls to MD5Update()
  *   input:    the data to add to the message
  *   inputLen: number of bytes in input
  * Complete 64-byte blocks are run through MD5Transform(), the rest is
  * kept in context->buffer for the next call.
  */
 void
 MD5Update (MD5_CTX *context, const unsigned char *input, unsigned int inputLen)
 {
 	unsigned int i, index, partLen;
 	/* Compute number of bytes mod 64 */
 	/* (count[] is in bits: >> 3 converts to bytes. This is the number
 	 * of bytes already waiting in context->buffer) */
 	index = (unsigned int)((context->count[0] >> 3) & 0x3F);
 	/* Update number of bits */
 	/* If the low word is smaller than what was just added to it, it
 	 * wrapped around: carry into the high word */
 	if ((context->count[0] += ((md5uint32)inputLen << 3))
 	    < ((md5uint32)inputLen << 3))
 		context->count[1]++;
 	/* The top 3 bits of inputLen are lost by the << 3 above: they
 	 * belong to the high word */
 	context->count[1] += ((md5uint32)inputLen >> 29);
 	/* Number of bytes needed to fill up the buffer */
 	partLen = 64 - index;
 	/* Transform as many times as possible. */
 	if (inputLen >= partLen) {
 		/* Complete and process the buffered block first... */
 		memcpy((void *)&context->buffer[index], (const void *)input,
 		    partLen);
 		MD5Transform (context->state, context->buffer);
 		/* ...then process full blocks directly from the input */
 		for (i = partLen; i + 63 < inputLen; i += 64)
 			MD5Transform (context->state, &input[i]);
 		index = 0;
 	}
 	else
 		i = 0;
 	/* Buffer remaining input */
 	memcpy ((void *)&context->buffer[index], (const void *)&input[i],
 	    inputLen-i);
 }
 /*
 * MD5 padding. Adds padding followed by original length.
 */
 void
 MD5Pad (MD5_CTX *context)
 {
 	/* The message length in bits has to be saved before padding, as
 	 * adding the padding updates the count */
 	unsigned char lenbytes[8];
 	Encode (lenbytes, context->count, 8);
 	/* Number of bytes currently in the last, incomplete, block */
 	const unsigned int used =
 	    (unsigned int)((context->count[0] >> 3) & 0x3f);
 	/* Pad up to 56 bytes mod 64, which leaves room for the 8 length
 	 * bytes. At least one byte of padding is always added, so that a
 	 * block which already holds 56 bytes or more spills into the next. */
 	unsigned int fill = 56 - used;
 	if (used >= 56)
 		fill = 120 - used;
 	MD5Update (context, PADDING, fill);
 	/* The length completes the final block */
 	MD5Update (context, lenbytes, 8);
 }
 /*
 * MD5 finalization. Ends an MD5 message-digest operation, writing the
 * the message digest and zeroizing the context.
 */
 void
 MD5Final (unsigned char digest[16],MD5_CTX *context)
 {
 	/* Do padding. */
 	MD5Pad (context);
 	/* Store state in digest */
 	Encode (digest, context->state, 16);
 	/* Zeroize sensitive information. */
 	memset ((void *)context, 0, sizeof (*context));
 }
 /* MD5 basic transformation. Transforms state based on block. */
 /*
  * Parameters:
  *   state: the 4 state words, updated in place
  *   block: the 64 bytes of message to process
  * The block is decoded into 16 words, which are mixed into a copy of the
  * state by 4 rounds of 16 steps each. The result is then added to the
  * original state.
  */
 static void
 MD5Transform (md5uint32 state[4], const unsigned char block[64])
 {
 	md5uint32 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
 	Decode (x, block, 64);
 	/* In each step below, the S.. constant is the rotation amount and
 	 * the hexadecimal constant is added in along with one of the
 	 * message words x[]. */
 	/* Round 1 */
 #define S11 7
 #define S12 12
 #define S13 17
 #define S14 22
 	FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
 	FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
 	FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
 	FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
 	FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
 	FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
 	FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
 	FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
 	FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
 	FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
 	FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
 	FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
 	FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
 	FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
 	FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
 	FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
 	/* Round 2 */
 #define S21 5
 #define S22 9
 #define S23 14
 #define S24 20
 	GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
 	GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
 	GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
 	GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
 	GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
 	GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
 	GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
 	GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
 	GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
 	GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
 	GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
 	GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
 	GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
 	GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
 	GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
 	GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
 	/* Round 3 */
 #define S31 4
 #define S32 11
 #define S33 16
 #define S34 23
 	HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
 	HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
 	HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
 	HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
 	HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
 	HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
 	HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
 	HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
 	HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
 	HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
 	HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
 	HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
 	HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
 	HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
 	HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
 	HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
 	/* Round 4 */
 #define S41 6
 #define S42 10
 #define S43 15
 #define S44 21
 	II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
 	II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
 	II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
 	II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
 	II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
 	II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
 	II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
 	II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
 	II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
 	II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
 	II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
 	II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
 	II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
 	II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
 	II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
 	II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
 	/* Add the result of the 64 steps to the state we started from */
 	state[0] += a;
 	state[1] += b;
 	state[2] += c;
 	state[3] += d;
 	/* Zeroize sensitive information. */
 	memset ((void *)x, 0, sizeof (x));
 }
--- a/src/utils/md5.h
+++ b/src/utils/md5.h
@ -0,0 +1,43 @@
 #ifndef _MD5_H_
 #define _MD5_H_
 /* MD5.H - header file for MD5C.C
 * Id: md5.h,v 1.6.2.1 1998/02/18 02:28:14 jkh Exp $
 */
 /* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
 rights reserved.
 License to copy and use this software is granted provided that it
 is identified as the "RSA Data Security, Inc. MD5 Message-Digest
 Algorithm" in all material mentioning or referencing this software
 or this function.
 License is also granted to make and use derivative works provided
 that such works are identified as "derived from the RSA Data
 Security, Inc. MD5 Message-Digest Algorithm" in all material
 mentioning or referencing the derived work.
 RSA Data Security, Inc. makes no representations concerning either
 the merchantability of this software or the suitability of this
 software for any particular purpose. It is provided "as is"
 without express or implied warranty of any kind.
 These notices must be retained in any copies of any part of this
 documentation and/or software.
 */
 /* The linkage specification is only valid C++: hide it from C compilers
  * so that this header can be used from both languages. */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* MD5 context. */
 typedef struct MD5Context {
  unsigned int state[4];	/* state (ABCD) */
  unsigned int count[2];	/* number of bits, modulo 2^64 (lsb first) */
  unsigned char buffer[64];	/* input buffer */
 } MD5_CTX;
 /* Usage: MD5Init() once, MD5Update() for each piece of the message, then
  * MD5Final() to get the 16 bytes digest (this also clears the context). */
 void   MD5Init (MD5_CTX *);
 void   MD5Update (MD5_CTX *, const unsigned char *, unsigned int);
 void   MD5Final (unsigned char [16], MD5_CTX *);
 #ifdef __cplusplus
 }
 #endif
 #endif /* _MD5_H_ */
--- a/src/utils/mimeparse.cpp
+++ b/src/utils/mimeparse.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.5 2005-10-31 08:59:05 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.6 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_MIMEPARSE
@ -10,13 +10,7 @@ static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.5 2005-10-31 08:59:05 dockes Ex
 #include <ctype.h>
 #include "mimeparse.h"
-
+#include "base64.h"
 //#define DEBUG_MIMEPARSE 
 #ifdef DEBUG_MIMEPARSE
 #define DPRINT(X) fprintf X
 #else
 #define DPRINT(X)
 #endif
 using namespace std;
@ -251,131 +245,6 @@ bool qp_decode(const string& in, string &out)
 }
 // This is adapted from FreeBSD's code.
 static const char Base64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 static const char Pad64 = '=';
 bool base64_decode(const string& in, string& out)
 {
    int io = 0, state = 0, ch;
    char *pos;
    unsigned int ii = 0;
    out.reserve(in.length());
    for (ii = 0; ii < in.length(); ii++) {
 	ch = in[ii];
 	if (isspace((unsigned char)ch))        /* Skip whitespace anywhere. */
 	    continue;
 	if (ch == Pad64)
 	    break;
 	pos = strchr(Base64, ch);
 	if (pos == 0) {
 	    /* A non-base64 character. */
 	    DPRINT((stderr, "base64_dec: non-base64 char at pos %d\n", ii));
 	    return false;
 	}
 	switch (state) {
 	case 0:
 	    out += (pos - Base64) << 2;
 	    state = 1;
 	    break;
 	case 1:
 	    out[io]   |=  (pos - Base64) >> 4;
 	    out += ((pos - Base64) & 0x0f) << 4 ;
 	    io++;
 	    state = 2;
 	    break;
 	case 2:
 	    out[io]   |=  (pos - Base64) >> 2;
 	    out += ((pos - Base64) & 0x03) << 6;
 	    io++;
 	    state = 3;
 	    break;
 	case 3:
 	    out[io] |= (pos - Base64);
 	    io++;
 	    state = 0;
 	    break;
 	default:
 	    DPRINT((stderr, "base64_dec: internal!bad state!\n"));
 	    return false;
 	}
    }
    /*
     * We are done decoding Base-64 chars.  Let's see if we ended
     * on a byte boundary, and/or with erroneous trailing characters.
     */
    if (ch == Pad64) {		/* We got a pad char. */
 	ch = in[ii++];		/* Skip it, get next. */
 	switch (state) {
 	case 0:		/* Invalid = in first position */
 	case 1:		/* Invalid = in second position */
 	    DPRINT((stderr, "base64_dec: pad char in state 0/1\n"));
 	    return false;
 	case 2:		/* Valid, means one byte of info */
 			/* Skip any number of spaces. */
 	    for (; ii < in.length(); ch = in[ii++])
 		if (!isspace((unsigned char)ch))
 		    break;
 	    /* Make sure there is another trailing = sign. */
 	    if (ch != Pad64) {
 		DPRINT((stderr, "base64_dec: missing pad char!\n"));
 		// Well, there are bad encoders out there. Let it pass
 		// return false;
 	    }
 	    ch = in[ii++];		/* Skip the = */
 	    /* Fall through to "single trailing =" case. */
 	    /* FALLTHROUGH */
 	case 3:	    /* Valid, means two bytes of info */
 	    /*
 	     * We know this char is an =.  Is there anything but
 	     * whitespace after it?
 	     */
 	    for ((void)NULL; ii < in.length(); ch = in[ii++])
 		if (!isspace((unsigned char)ch)) {
 		    DPRINT((stderr, "base64_dec: non-white at eod: 0x%x\n", 
 			    (unsigned int)ch));
 		    // Well, there are bad encoders out there. Let it pass
 		    //return false;
 		}
 	    /*
 	     * Now make sure for cases 2 and 3 that the "extra"
 	     * bits that slopped past the last full byte were
 	     * zeros.  If we don't check them, they become a
 	     * subliminal channel.
 	     */
 	    if (out[io] != 0) {
 		DPRINT((stderr, "base64_dec: bad extra bits!\n"));
 		// Well, there are bad encoders out there. Let it pass
 		out[io] = 0;
 		// return false;
 	    }
 	}
    } else {
 	/*
 	 * We ended by seeing the end of the string.  Make sure we
 	 * have no partial bytes lying around.
 	 */
 	if (state != 0) {
 	    DPRINT((stderr, "base64_dec: bad final state\n"));
 	    return false;
 	}
    }
    DPRINT((stderr, "base64_dec: ret ok, io %d sz %d len %d value [%s]\n", 
 	    io, out.size(), out.length(), out.c_str()));
    return true;
 }
 #include "transcode.h"
 #include "smallut.h"