From 48948bc92fd0103debb862b93a388f1d8a067cd9 Mon Sep 17 00:00:00 2001 From: dockes Date: Sun, 6 Nov 2005 11:16:53 +0000 Subject: [PATCH] limit path therm length through hashing --- src/lib/Makefile | 20 ++- src/rcldb/pathhash.cpp | 81 ++++++++++ src/rcldb/pathhash.h | 11 ++ src/rcldb/rcldb.cpp | 89 +++++++---- src/utils/base64.cpp | 209 ++++++++++++++++++++++++++ src/utils/base64.h | 9 ++ src/utils/md5.cpp | 318 ++++++++++++++++++++++++++++++++++++++++ src/utils/md5.h | 43 ++++++ src/utils/mimeparse.cpp | 135 +---------------- 9 files changed, 751 insertions(+), 164 deletions(-) create mode 100644 src/rcldb/pathhash.cpp create mode 100644 src/rcldb/pathhash.h create mode 100644 src/utils/base64.cpp create mode 100644 src/utils/base64.h create mode 100644 src/utils/md5.cpp create mode 100644 src/utils/md5.h diff --git a/src/lib/Makefile b/src/lib/Makefile index 7d24a6cc..f4b9b5a9 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -6,22 +6,24 @@ UNACCFLAGS = -g -I. -I../unac -I/usr/local/include -DUNAC_VERSION=\"1.0.7\" LIBS = librcl.a all: $(LIBS) -OBJS = conftree.o csguess.o debuglog.o \ +OBJS = base64.o conftree.o csguess.o debuglog.o \ execmd.o wipedir.o \ - fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o internfile.o \ - mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \ + fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o \ + internfile.o md5.o \ + mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \ rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \ textsplit.o transcode.o \ unacpp.o unac.o SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ - ../utils/execmd.cpp ../utils/idfile.cpp ../utils/wipedir.cpp \ + ../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \ + ../utils/wipedir.cpp \ ../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \ ../common/htmlparse.cpp \ ../index/indexer.cpp ../common/internfile.cpp \ ../common/mimehandler.cpp ../utils/mimeparse.cpp 
../index/mimetype.cpp \ - ../common/myhtmlparse.cpp ../utils/pathut.cpp \ + ../common/myhtmlparse.cpp ../common/pathhash.cpp ../utils/pathut.cpp \ ../common/rclconfig.cpp ../common/rcldb.cpp ../common/rclinit.cpp \ - ../utils/readfile.cpp ../utils/smallut.cpp \ + ../utils/base64.cpp ../utils/readfile.cpp ../utils/smallut.cpp \ ../common/textsplit.cpp ../utils/transcode.cpp \ ../common/unacpp.cpp ../unac/unac.c @@ -67,6 +69,8 @@ myhtmlparse.o : ../common/myhtmlparse.cpp $(CXX) $(CXXFLAGS) -c $< pathut.o : ../utils/pathut.cpp $(CXX) $(CXXFLAGS) -c $< +pathhash.o : ../common/pathhash.cpp + $(CXX) $(CXXFLAGS) -c $< rclconfig.o : ../common/rclconfig.cpp $(CXX) $(CXXFLAGS) -c $< rclinit.o : ../common/rclinit.cpp @@ -75,12 +79,16 @@ rcldb.o : ../common/rcldb.cpp $(CXX) $(CXXFLAGS) -c $< readfile.o : ../utils/readfile.cpp $(CXX) $(CXXFLAGS) -c $< +base64.o : ../utils/base64.cpp + $(CXX) $(CXXFLAGS) -c $< smallut.o : ../utils/smallut.cpp $(CXX) $(CXXFLAGS) -c $< textsplit.o : ../common/textsplit.cpp $(CXX) $(CXXFLAGS) -c $< transcode.o : ../utils/transcode.cpp $(CXX) $(CXXFLAGS) -c $< +md5.o : ../utils/md5.cpp + $(CXX) $(CXXFLAGS) -c $< unacpp.o : ../common/unacpp.cpp $(CXX) $(CXXFLAGS) -c $< diff --git a/src/rcldb/pathhash.cpp b/src/rcldb/pathhash.cpp new file mode 100644 index 00000000..851c589e --- /dev/null +++ b/src/rcldb/pathhash.cpp @@ -0,0 +1,81 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: pathhash.cpp,v 1.1 2005-11-06 11:16:52 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif + +#include + +#include "pathhash.h" +#include "md5.h" +#include "base64.h" + +using std::string; + +#ifdef PATHHASH_HEX +static void md5hexprint(const unsigned char hash[16], string &out) +{ + out.erase(); + out.reserve(33); + static const char hex[]="0123456789abcdef"; + for (int i = 0; i < 16; i++) { + out.append(1, hex[hash[i] >> 4]); + out.append(1, hex[hash[i] & 0x0f]); + } +} +#endif + +// Size of the hashed result (base64 of 16 bytes of md5, minus 2 pad chars) +#define HASHLEN 22 + +// 
Convert longish paths by truncating and appending hash of path +// The full length of the base64-encoded (minus pad) of the md5 is 22 chars +// We append this to the truncated path +void pathHash(const std::string &path, std::string &phash, unsigned int maxlen) +{ + if (maxlen < HASHLEN) { + fprintf(stderr, "pathHash: internal error: requested len too small\n"); + abort(); + } + + if (path.length() <= maxlen) { + phash = path; + return; + } + + // Compute the md5 + unsigned char chash[16]; + MD5_CTX ctx; + MD5Init(&ctx); + MD5Update(&ctx, (const unsigned char *)(path.c_str()+maxlen-HASHLEN), + path.length() - (maxlen - HASHLEN)); + MD5Final(chash, &ctx); + +#if 0 + string hex; + md5hexprint(chash, hex); + printf("hex [%s]\n", hex.c_str()); +#endif + + // Encode it to ascii. This shouldn't be strictly necessary as + // xapian terms can be binary + string hash; + base64_encode(string((char *)chash, 16), hash); + // We happen to know there will be 2 pad chars in there, that we + // don't need as this won't ever be decoded. 
Resulting length is 22 + hash.resize(hash.length() - 2); + + + // Truncate path and append hash + phash = path.substr(0, maxlen - HASHLEN) + hash; +} + +#ifdef TEST_PATHHASH +#include +int main(int argc, char **argv) +{ + string path="/usr/lib/toto.cpp"; + string hash; + pathHash(path, hash); + printf("hash [%s]\n", hash.c_str()); + +} +#endif // TEST_PATHHASH diff --git a/src/rcldb/pathhash.h b/src/rcldb/pathhash.h new file mode 100644 index 00000000..3578bd3d --- /dev/null +++ b/src/rcldb/pathhash.h @@ -0,0 +1,11 @@ +#ifndef _PATHHASH_H_INCLUDED_ +#define _PATHHASH_H_INCLUDED_ +/* @(#$Id: pathhash.h,v 1.1 2005-11-06 11:16:52 dockes Exp $ (C) 2004 J.F.Dockes */ + +#include + +extern void pathHash(const std::string &path, std::string &hash, + unsigned int len); + + +#endif /* _PATHHASH_H_INCLUDED_ */ diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 533bfc9a..4cd6caf4 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.33 2005-11-05 15:29:12 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.34 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -19,6 +19,7 @@ using namespace std; #include "debuglog.h" #include "pathut.h" #include "smallut.h" +#include "pathhash.h" #include "xapian.h" #include @@ -268,6 +269,16 @@ truncate_to_word(string & input, string::size_type maxlen) return output; } +// Truncate longer path and uniquize with hash. The goal for this is +// to avoid xapian max term length limitations, not to gain space (we +// gain very little even with very short maxlens like 30) +#define HASHPATH +#define PATHHASHLEN 150 + +// Add document in internal form to the database: index the terms in +// the title abstract and body and add special terms for file name, +// date, mime type ... 
, create the document data record (more +// metadata), and update database bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) { LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str())); @@ -275,6 +286,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) return false; Native *ndb = (Native *)pdata; + // Truncate abstract, title and keywords to reasonable lengths Rcl::Doc doc = idoc; if (doc.abstract.empty()) doc.abstract = truncate_to_word(doc.text, 100); @@ -289,6 +301,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) TextSplit splitter(&splitData); + ///////// Split and index terms in document body and auxiliary fields + + // Split title and index terms string noacc; if (!dumb_string(doc.title, noacc)) { LOGERR(("Rcl::Db::add: unac failed\n")); @@ -296,6 +311,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) } splitter.text_to_words(noacc); + // Split body and index terms splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.text, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); @@ -303,6 +319,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) } splitter.text_to_words(noacc); + // Split keywords and index terms splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.keywords, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); @@ -310,6 +327,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) } splitter.text_to_words(noacc); + // Split abstract and index terms splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.abstract, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); @@ -317,20 +335,45 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) } splitter.text_to_words(noacc); + ////// Special terms for metadata + // Mime type newdocument.add_term("T" + doc.mimetype); - string pathterm = "P" + fn; + // Path name + string hash; +#ifdef HASHPATH + pathHash(fn, hash, PATHHASHLEN); +#else + hash = fn; +#endif + LOGDEB(("Rcl::Db::add: pathhash [%s]\n", 
hash.c_str())); + + string pathterm = "P" + hash; newdocument.add_term(pathterm); + // File path + internal path: document unique identifier for + // documents inside multidocument files. string uniterm; if (!doc.ipath.empty()) { - uniterm = "Q" + fn + "|" + doc.ipath; + uniterm = "Q" + hash + "|" + doc.ipath; newdocument.add_term(uniterm); } + // Dates etc... + time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : + doc.dmtime.c_str()); + struct tm *tm = localtime(&mtime); + char buf[9]; + sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); + newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD) + buf[7] = '\0'; + if (buf[6] == '3') buf[6] = '2'; + newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval + buf[6] = '\0'; + newdocument.add_term("M" + string(buf)); // Month (YYYYMM) + buf[4] = '\0'; + newdocument.add_term("Y" + string(buf)); // Year (YYYY) - const char *fnc = fn.c_str(); - // Document data record. omindex has the following nl separated fields: // - url // - sample @@ -348,27 +391,14 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) if (!doc.ipath.empty()) { record += "\nipath=" + doc.ipath; } - record += "\n"; + LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); - time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : - doc.dmtime.c_str()); - struct tm *tm = localtime(&mtime); - char buf[9]; - sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); - newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD) - buf[7] = '\0'; - if (buf[6] == '3') buf[6] = '2'; - newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval - buf[6] = '\0'; - newdocument.add_term("M" + string(buf)); // Month (YYYYMM) - buf[4] = '\0'; - newdocument.add_term("Y" + string(buf)); // Year (YYYY) - - // If this document has already been indexed, update the existing - // entry. 
+ const char *fnc = fn.c_str(); + + // Add db entry or update existing entry: try { Xapian::docid did = ndb->wdb.replace_document(uniterm.empty() ? pathterm : uniterm, @@ -397,14 +427,23 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) Native *ndb = (Native *)pdata; // If no document exist with this path, we do need update - string pathterm = "P" + filename; + string hash; +#ifdef HASHPATH + pathHash(filename, hash, PATHHASHLEN); +#else + hash = filename; +#endif + + string pathterm = "P" + hash; if (!ndb->wdb.term_exists(pathterm)) { return true; } // Look for all documents with this path. We need to look at all - // to set their existence flag. - // We check the update time on the spe + // to set their existence flag. We check the update time on the + // fmtime field which will be identical for all docs inside a + // multi-document file (we currently always reindex all if the + // file changed) Xapian::PostingIterator doc; try { Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm); diff --git a/src/utils/base64.cpp b/src/utils/base64.cpp new file mode 100644 index 00000000..5a168cd9 --- /dev/null +++ b/src/utils/base64.cpp @@ -0,0 +1,209 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: base64.cpp,v 1.1 2005-11-06 11:16:53 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif + +#include + +#include +using std::string; + +//#define DEBUG_BASE64 +#ifdef DEBUG_BASE64 +#define DPRINT(X) fprintf X +#else +#define DPRINT(X) +#endif + +// This is adapted from FreeBSD's code. +static const char Base64[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char Pad64 = '='; +bool base64_decode(const string& in, string& out) +{ + int io = 0, state = 0, ch; + char *pos; + unsigned int ii = 0; + out.reserve(in.length()); + + for (ii = 0; ii < in.length(); ii++) { + ch = in[ii]; + if (isspace((unsigned char)ch)) /* Skip whitespace anywhere. 
*/ + continue; + + if (ch == Pad64) + break; + + pos = strchr(Base64, ch); + if (pos == 0) { + /* A non-base64 character. */ + DPRINT((stderr, "base64_dec: non-base64 char at pos %d\n", ii)); + return false; + } + + + switch (state) { + case 0: + out += (pos - Base64) << 2; + state = 1; + break; + case 1: + out[io] |= (pos - Base64) >> 4; + out += ((pos - Base64) & 0x0f) << 4 ; + io++; + state = 2; + break; + case 2: + out[io] |= (pos - Base64) >> 2; + out += ((pos - Base64) & 0x03) << 6; + io++; + state = 3; + break; + case 3: + out[io] |= (pos - Base64); + io++; + state = 0; + break; + default: + DPRINT((stderr, "base64_dec: internal!bad state!\n")); + return false; + } + } + + /* + * We are done decoding Base-64 chars. Let's see if we ended + * on a byte boundary, and/or with erroneous trailing characters. + */ + + if (ch == Pad64) { /* We got a pad char. */ + ch = in[ii++]; /* Skip it, get next. */ + switch (state) { + case 0: /* Invalid = in first position */ + case 1: /* Invalid = in second position */ + DPRINT((stderr, "base64_dec: pad char in state 0/1\n")); + return false; + + case 2: /* Valid, means one byte of info */ + /* Skip any number of spaces. */ + for (; ii < in.length(); ch = in[ii++]) + if (!isspace((unsigned char)ch)) + break; + /* Make sure there is another trailing = sign. */ + if (ch != Pad64) { + DPRINT((stderr, "base64_dec: missing pad char!\n")); + // Well, there are bad encoders out there. Let it pass + // return false; + } + ch = in[ii++]; /* Skip the = */ + /* Fall through to "single trailing =" case. */ + /* FALLTHROUGH */ + + case 3: /* Valid, means two bytes of info */ + /* + * We know this char is an =. Is there anything but + * whitespace after it? + */ + for (; ii < in.length(); ch = in[ii++]) + if (!isspace((unsigned char)ch)) { + DPRINT((stderr, "base64_dec: non-white at eod: 0x%x\n", + (unsigned int)ch)); + // Well, there are bad encoders out there. 
Let it pass + //return false; + } + + /* + * Now make sure for cases 2 and 3 that the "extra" + * bits that slopped past the last full byte were + * zeros. If we don't check them, they become a + * subliminal channel. + */ + if (out[io] != 0) { + DPRINT((stderr, "base64_dec: bad extra bits!\n")); + // Well, there are bad encoders out there. Let it pass + out[io] = 0; + // return false; + } + } + } else { + /* + * We ended by seeing the end of the string. Make sure we + * have no partial bytes lying around. + */ + if (state != 0) { + DPRINT((stderr, "base64_dec: bad final state\n")); + return false; + } + } + + DPRINT((stderr, "base64_dec: ret ok, io %d sz %d len %d value [%s]\n", + io, out.size(), out.length(), out.c_str())); + return true; +} + +#undef Assert +#define Assert(X) + +void base64_encode(const string &in, string &out) +{ + size_t datalength = 0; + unsigned char input[3]; + unsigned char output[4]; + size_t i; + + int srclength = in.length(); + int sidx = 0; + while (2 < srclength) { + input[0] = in[sidx++]; + input[1] = in[sidx++]; + input[2] = in[sidx++]; + srclength -= 3; + + output[0] = input[0] >> 2; + output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4); + output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6); + output[3] = input[2] & 0x3f; + Assert(output[0] < 64); + Assert(output[1] < 64); + Assert(output[2] < 64); + Assert(output[3] < 64); + + out += Base64[output[0]]; + out += Base64[output[1]]; + out += Base64[output[2]]; + out += Base64[output[3]]; + } + + /* Now we worry about padding. */ + if (0 != srclength) { + /* Get what's left. 
*/ + input[0] = input[1] = input[2] = '\0'; + for (unsigned int i = 0; i < srclength; i++) + input[i] = in[sidx++]; + + output[0] = input[0] >> 2; + output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4); + output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6); + Assert(output[0] < 64); + Assert(output[1] < 64); + Assert(output[2] < 64); + + out += Base64[output[0]]; + out += Base64[output[1]]; + if (srclength == 1) + out += Pad64; + else + out += Base64[output[2]]; + out += Pad64; + } + return; +} + +#ifdef TEST_BASE64 +#include +int main(int agrc, char **argv) +{ + string in = "12345"; + string out; + base64_encode(in, out); + printf("in %s out %s\n", in.c_str(), out.c_str()); +} +#endif diff --git a/src/utils/base64.h b/src/utils/base64.h new file mode 100644 index 00000000..417448a2 --- /dev/null +++ b/src/utils/base64.h @@ -0,0 +1,9 @@ +#ifndef _BASE64_H_INCLUDED_ +#define _BASE64_H_INCLUDED_ +/* @(#$Id: base64.h,v 1.1 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes */ +#include + +void base64_encode(const std::string &in, std::string &out); +bool base64_decode(const std::string& in, std::string& out); + +#endif /* _BASE64_H_INCLUDED_ */ diff --git a/src/utils/md5.cpp b/src/utils/md5.cpp new file mode 100644 index 00000000..d10bbcf2 --- /dev/null +++ b/src/utils/md5.cpp @@ -0,0 +1,318 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. 
+ * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * $FreeBSD: src/lib/libmd/md5c.c,v 1.11 1999/12/29 05:04:20 peter Exp $ + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +#include + +#include "md5.h" + +typedef unsigned int md5uint32; + +static void MD5Transform(md5uint32 [4], const unsigned char [64]); + +#ifdef i386 +#define Encode memcpy +#define Decode memcpy +#else /* i386 */ + +/* + * Encodes input (md5uint32) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +static void +Encode (unsigned char *output, md5uint32 *input, unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* + * Decodes input (unsigned char) into output (md5uint32). Assumes len is + * a multiple of 4. + */ + +static void +Decode (md5uint32 *output, const unsigned char *input, unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((md5uint32)input[j]) | (((md5uint32)input[j+1]) << 8) | + (((md5uint32)input[j+2]) << 16) | (((md5uint32)input[j+3]) << 24); +} +#endif /* i386 */ + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. 
*/ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. + */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (md5uint32)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (md5uint32)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (md5uint32)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (md5uint32)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. */ + +void +MD5Init (MD5_CTX *context) +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. 
+ */ + +void +MD5Update (MD5_CTX *context, const unsigned char *input, unsigned int inputLen) +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((md5uint32)inputLen << 3)) + < ((md5uint32)inputLen << 3)) + context->count[1]++; + context->count[1] += ((md5uint32)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (const void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (const void *)&input[i], + inputLen-i); +} + +/* + * MD5 padding. Adds padding followed by original length. + */ + +void +MD5Pad (MD5_CTX *context) +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * message digest and zeroizing the context. + */ + +void +MD5Final (unsigned char digest[16],MD5_CTX *context) +{ + /* Do padding. */ + MD5Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
*/ + +static void +MD5Transform (md5uint32 state[4], const unsigned char block[64]) +{ + md5uint32 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 
32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. 
*/ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/src/utils/md5.h b/src/utils/md5.h new file mode 100644 index 00000000..7146da80 --- /dev/null +++ b/src/utils/md5.h @@ -0,0 +1,43 @@ +#ifndef _MD5_H_ +#define _MD5_H_ +/* MD5.H - header file for MD5C.C + * Id: md5.h,v 1.6.2.1 1998/02/18 02:28:14 jkh Exp $ + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +extern "C" { + +/* MD5 context. 
*/ +typedef struct MD5Context { + unsigned int state[4]; /* state (ABCD) */ + unsigned int count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ +} MD5_CTX; + +void MD5Init (MD5_CTX *); +void MD5Update (MD5_CTX *, const unsigned char *, unsigned int); +void MD5Final (unsigned char [16], MD5_CTX *); +} + +#endif /* _MD5_H_ */ diff --git a/src/utils/mimeparse.cpp b/src/utils/mimeparse.cpp index af08448e..9c8d9347 100644 --- a/src/utils/mimeparse.cpp +++ b/src/utils/mimeparse.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.5 2005-10-31 08:59:05 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.6 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_MIMEPARSE @@ -10,13 +10,7 @@ static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.5 2005-10-31 08:59:05 dockes Ex #include #include "mimeparse.h" - -//#define DEBUG_MIMEPARSE -#ifdef DEBUG_MIMEPARSE -#define DPRINT(X) fprintf X -#else -#define DPRINT(X) -#endif +#include "base64.h" using namespace std; @@ -251,131 +245,6 @@ bool qp_decode(const string& in, string &out) } -// This is adapted from FreeBSD's code. -static const char Base64[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -static const char Pad64 = '='; -bool base64_decode(const string& in, string& out) -{ - int io = 0, state = 0, ch; - char *pos; - unsigned int ii = 0; - out.reserve(in.length()); - - for (ii = 0; ii < in.length(); ii++) { - ch = in[ii]; - if (isspace((unsigned char)ch)) /* Skip whitespace anywhere. */ - continue; - - if (ch == Pad64) - break; - - pos = strchr(Base64, ch); - if (pos == 0) { - /* A non-base64 character. 
*/ - DPRINT((stderr, "base64_dec: non-base64 char at pos %d\n", ii)); - return false; - } - - - switch (state) { - case 0: - out += (pos - Base64) << 2; - state = 1; - break; - case 1: - out[io] |= (pos - Base64) >> 4; - out += ((pos - Base64) & 0x0f) << 4 ; - io++; - state = 2; - break; - case 2: - out[io] |= (pos - Base64) >> 2; - out += ((pos - Base64) & 0x03) << 6; - io++; - state = 3; - break; - case 3: - out[io] |= (pos - Base64); - io++; - state = 0; - break; - default: - DPRINT((stderr, "base64_dec: internal!bad state!\n")); - return false; - } - } - - /* - * We are done decoding Base-64 chars. Let's see if we ended - * on a byte boundary, and/or with erroneous trailing characters. - */ - - if (ch == Pad64) { /* We got a pad char. */ - ch = in[ii++]; /* Skip it, get next. */ - switch (state) { - case 0: /* Invalid = in first position */ - case 1: /* Invalid = in second position */ - DPRINT((stderr, "base64_dec: pad char in state 0/1\n")); - return false; - - case 2: /* Valid, means one byte of info */ - /* Skip any number of spaces. */ - for (; ii < in.length(); ch = in[ii++]) - if (!isspace((unsigned char)ch)) - break; - /* Make sure there is another trailing = sign. */ - if (ch != Pad64) { - DPRINT((stderr, "base64_dec: missing pad char!\n")); - // Well, there are bad encoders out there. Let it pass - // return false; - } - ch = in[ii++]; /* Skip the = */ - /* Fall through to "single trailing =" case. */ - /* FALLTHROUGH */ - - case 3: /* Valid, means two bytes of info */ - /* - * We know this char is an =. Is there anything but - * whitespace after it? - */ - for ((void)NULL; ii < in.length(); ch = in[ii++]) - if (!isspace((unsigned char)ch)) { - DPRINT((stderr, "base64_dec: non-white at eod: 0x%x\n", - (unsigned int)ch)); - // Well, there are bad encoders out there. Let it pass - //return false; - } - - /* - * Now make sure for cases 2 and 3 that the "extra" - * bits that slopped past the last full byte were - * zeros. 
If we don't check them, they become a - * subliminal channel. - */ - if (out[io] != 0) { - DPRINT((stderr, "base64_dec: bad extra bits!\n")); - // Well, there are bad encoders out there. Let it pass - out[io] = 0; - // return false; - } - } - } else { - /* - * We ended by seeing the end of the string. Make sure we - * have no partial bytes lying around. - */ - if (state != 0) { - DPRINT((stderr, "base64_dec: bad final state\n")); - return false; - } - } - - DPRINT((stderr, "base64_dec: ret ok, io %d sz %d len %d value [%s]\n", - io, out.size(), out.length(), out.c_str())); - return true; -} - #include "transcode.h" #include "smallut.h"