limit path term length through hashing

This commit is contained in:
dockes 2005-11-06 11:16:53 +00:00
parent d39a9f12de
commit 48948bc92f
9 changed files with 751 additions and 164 deletions

View File

@ -6,22 +6,24 @@ UNACCFLAGS = -g -I. -I../unac -I/usr/local/include -DUNAC_VERSION=\"1.0.7\"
LIBS = librcl.a
all: $(LIBS)
OBJS = conftree.o csguess.o debuglog.o \
OBJS = base64.o conftree.o csguess.o debuglog.o \
execmd.o wipedir.o \
fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o internfile.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o \
internfile.o md5.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
../utils/execmd.cpp ../utils/idfile.cpp ../utils/wipedir.cpp \
../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
../utils/wipedir.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \
../common/htmlparse.cpp \
../index/indexer.cpp ../common/internfile.cpp \
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
../common/myhtmlparse.cpp ../utils/pathut.cpp \
../common/myhtmlparse.cpp ../common/pathhash.cpp ../utils/pathut.cpp \
../common/rclconfig.cpp ../common/rcldb.cpp ../common/rclinit.cpp \
../utils/readfile.cpp ../utils/smallut.cpp \
../utils/base64.cpp ../utils/readfile.cpp ../utils/smallut.cpp \
../common/textsplit.cpp ../utils/transcode.cpp \
../common/unacpp.cpp ../unac/unac.c
@ -67,6 +69,8 @@ myhtmlparse.o : ../common/myhtmlparse.cpp
$(CXX) $(CXXFLAGS) -c $<
pathut.o : ../utils/pathut.cpp
$(CXX) $(CXXFLAGS) -c $<
pathhash.o : ../common/pathhash.cpp
$(CXX) $(CXXFLAGS) -c $<
rclconfig.o : ../common/rclconfig.cpp
$(CXX) $(CXXFLAGS) -c $<
rclinit.o : ../common/rclinit.cpp
@ -75,12 +79,16 @@ rcldb.o : ../common/rcldb.cpp
$(CXX) $(CXXFLAGS) -c $<
readfile.o : ../utils/readfile.cpp
$(CXX) $(CXXFLAGS) -c $<
base64.o : ../utils/base64.cpp
$(CXX) $(CXXFLAGS) -c $<
smallut.o : ../utils/smallut.cpp
$(CXX) $(CXXFLAGS) -c $<
textsplit.o : ../common/textsplit.cpp
$(CXX) $(CXXFLAGS) -c $<
transcode.o : ../utils/transcode.cpp
$(CXX) $(CXXFLAGS) -c $<
md5.o : ../utils/md5.cpp
$(CXX) $(CXXFLAGS) -c $<
unacpp.o : ../common/unacpp.cpp
$(CXX) $(CXXFLAGS) -c $<

81
src/rcldb/pathhash.cpp Normal file
View File

@ -0,0 +1,81 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pathhash.cpp,v 1.1 2005-11-06 11:16:52 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
#include "pathhash.h"
#include "md5.h"
#include "base64.h"
using std::string;
#ifdef PATHHASH_HEX
// Debug helper: render a 16-byte MD5 digest as 32 lowercase hexadecimal
// characters. Any previous content of out is replaced.
static void md5hexprint(const unsigned char hash[16], string &out)
{
    static const char digits[] = "0123456789abcdef";
    char text[32];
    char *wp = text;
    // Two output characters per digest byte: high nibble first, then low
    for (const unsigned char *rp = hash; rp != hash + 16; ++rp) {
        *wp++ = digits[*rp >> 4];
        *wp++ = digits[*rp & 0x0f];
    }
    out.assign(text, sizeof(text));
}
#endif
// Size of the hashed result (base64 of 16 bytes of md5, minus 2 pad chars)
#define HASHLEN 22
// Convert longish paths by truncating and appending hash of path
// The full length of the base64-encoded (minus pad) of the md5 is 22 chars
// We append this to the truncated path
void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
{
if (maxlen < HASHLEN) {
fprintf(stderr, "pathHash: internal error: requested len too small\n");
abort();
}
if (path.length() <= maxlen) {
phash = path;
return;
}
// Compute the md5
unsigned char chash[16];
MD5_CTX ctx;
MD5Init(&ctx);
MD5Update(&ctx, (const unsigned char *)(path.c_str()+maxlen-HASHLEN),
path.length() - (maxlen - HASHLEN));
MD5Final(chash, &ctx);
#if 0
string hex;
md5hexprint(chash, hex);
printf("hex [%s]\n", hex.c_str());
#endif
// Encode it to ascii. This shouldn't be strictly necessary as
// xapian terms can be binary
string hash;
base64_encode(string((char *)chash, 16), hash);
// We happen to know there will be 2 pad chars in there, that we
// don't need as this won't ever be decoded. Resulting length is 22
hash.resize(hash.length() - 2);
// Truncate path and append hash
phash = path.substr(0, maxlen - HASHLEN) + hash;
}
#ifdef TEST_PATHHASH
#include <stdio.h>
// Manual test driver: hash the path given as first argument (or a built-in
// sample) and print the result.
int main(int argc, char **argv)
{
    // Small limit, so that the sample path is long enough to actually get
    // truncated and hashed instead of being returned unchanged.
    const unsigned int maxlen = 30;
    string path = argc > 1 ? argv[1] :
        "/usr/lib/some/rather/long/directory/name/toto.cpp";
    string hash;
    // pathHash() takes the maximum length as a mandatory third argument
    pathHash(path, hash, maxlen);
    printf("path [%s]\n", path.c_str());
    printf("hash [%s]\n", hash.c_str());
    return 0;
}
#endif // TEST_PATHHASH

11
src/rcldb/pathhash.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef _PATHHASH_H_INCLUDED_
#define _PATHHASH_H_INCLUDED_
/* @(#$Id: pathhash.h,v 1.1 2005-11-06 11:16:52 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
/**
 * Limit the length of a path so that it can be used as an index term.
 *
 * If path is at most len characters long it is copied to hash unchanged.
 * Otherwise hash receives the first (len - 22) characters of path followed
 * by 22 characters of base64-encoded md5 computed over the rest of the path,
 * for a total of exactly len characters.
 *
 * @param path the input path
 * @param hash output: the possibly shortened path
 * @param len  maximum length of the result. Must be at least 22, else the
 *             function prints an error message and calls abort().
 */
extern void pathHash(const std::string &path, std::string &hash,
unsigned int len);
#endif /* _PATHHASH_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.33 2005-11-05 15:29:12 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.34 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -19,6 +19,7 @@ using namespace std;
#include "debuglog.h"
#include "pathut.h"
#include "smallut.h"
#include "pathhash.h"
#include "xapian.h"
#include <xapian/stem.h>
@ -268,6 +269,16 @@ truncate_to_word(string & input, string::size_type maxlen)
return output;
}
// Truncate longer paths and uniquize them with a hash. The goal of this is
// to avoid xapian max term length limitations, not to gain space (we
// gain very little even with very short maxlens like 30)
#define HASHPATH
#define PATHHASHLEN 150
// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
// metadata), and update database
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
{
LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
@ -275,6 +286,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
return false;
Native *ndb = (Native *)pdata;
// Truncate abstract, title and keywords to reasonable lengths
Rcl::Doc doc = idoc;
if (doc.abstract.empty())
doc.abstract = truncate_to_word(doc.text, 100);
@ -289,6 +301,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
TextSplit splitter(&splitData);
///////// Split and index terms in document body and auxiliary fields
// Split title and index terms
string noacc;
if (!dumb_string(doc.title, noacc)) {
LOGERR(("Rcl::Db::add: unac failed\n"));
@ -296,6 +311,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
}
splitter.text_to_words(noacc);
// Split body and index terms
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
@ -303,6 +319,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
}
splitter.text_to_words(noacc);
// Split keywords and index terms
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
@ -310,6 +327,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
}
splitter.text_to_words(noacc);
// Split abstract and index terms
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.abstract, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
@ -317,20 +335,45 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
}
splitter.text_to_words(noacc);
////// Special terms for metadata
// Mime type
newdocument.add_term("T" + doc.mimetype);
string pathterm = "P" + fn;
// Path name
string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
LOGDEB(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
string pathterm = "P" + hash;
newdocument.add_term(pathterm);
// File path + internal path: document unique identifier for
// documents inside multidocument files.
string uniterm;
if (!doc.ipath.empty()) {
uniterm = "Q" + fn + "|" + doc.ipath;
uniterm = "Q" + hash + "|" + doc.ipath;
newdocument.add_term(uniterm);
}
// Dates etc...
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
doc.dmtime.c_str());
struct tm *tm = localtime(&mtime);
char buf[9];
sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD)
buf[7] = '\0';
if (buf[6] == '3') buf[6] = '2';
newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval
buf[6] = '\0';
newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
buf[4] = '\0';
newdocument.add_term("Y" + string(buf)); // Year (YYYY)
const char *fnc = fn.c_str();
// Document data record. omindex has the following nl separated fields:
// - url
// - sample
@ -348,27 +391,14 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
if (!doc.ipath.empty()) {
record += "\nipath=" + doc.ipath;
}
record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
doc.dmtime.c_str());
struct tm *tm = localtime(&mtime);
char buf[9];
sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD)
buf[7] = '\0';
if (buf[6] == '3') buf[6] = '2';
newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval
buf[6] = '\0';
newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
buf[4] = '\0';
newdocument.add_term("Y" + string(buf)); // Year (YYYY)
// If this document has already been indexed, update the existing
// entry.
const char *fnc = fn.c_str();
// Add db entry or update existing entry:
try {
Xapian::docid did =
ndb->wdb.replace_document(uniterm.empty() ? pathterm : uniterm,
@ -397,14 +427,23 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
Native *ndb = (Native *)pdata;
// If no document exist with this path, we do need update
string pathterm = "P" + filename;
string hash;
#ifdef HASHPATH
pathHash(filename, hash, PATHHASHLEN);
#else
hash = filename;
#endif
string pathterm = "P" + hash;
if (!ndb->wdb.term_exists(pathterm)) {
return true;
}
// Look for all documents with this path. We need to look at all
// to set their existence flag.
// We check the update time on the spe
// to set their existence flag. We check the update time on the
// fmtime field which will be identical for all docs inside a
// multi-document file (we currently always reindex all if the
// file changed)
Xapian::PostingIterator doc;
try {
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);

209
src/utils/base64.cpp Normal file
View File

@ -0,0 +1,209 @@
#ifndef lint
static char rcsid[] = "@(#$Id: base64.cpp,v 1.1 2005-11-06 11:16:53 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <string>
using std::string;

//#define DEBUG_BASE64
#ifdef DEBUG_BASE64
#define DPRINT(X) fprintf X
#else
#define DPRINT(X)
#endif

// This is adapted from FreeBSD's code.
static const char Base64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char Pad64 = '=';

// Decode base64 text.
//
// @param in  the encoded text. White space is skipped wherever it appears.
// @param out receives the decoded bytes. Any previous content is discarded.
// @return false if the input contains a character which is neither white
//   space, nor a pad, nor part of the base64 alphabet, if a pad character
//   shows up in the first or second position of a group, or if the input
//   ends in the middle of a group without padding. In these cases out may
//   hold partially decoded data.
//
// The decoder is deliberately lenient with the output of sloppy encoders: a
// missing second pad character, garbage after the padding, and non-zero
// unused bits in the last group are reported in debug mode only.
bool base64_decode(const string& in, string& out)
{
    int io = 0;     // index in out of the byte currently being assembled
    int state = 0;  // position (0..3) inside the current 4 characters group
    int ch = 0;     // tested after the loop: must have a value for empty input
    const char *pos;
    unsigned int ii = 0;

    // io counts from 0, so out must start empty
    out.erase();
    out.reserve(in.length());

    for (ii = 0; ii < in.length(); ii++) {
        ch = in[ii];
        if (isspace((unsigned char)ch)) /* Skip whitespace anywhere. */
            continue;
        if (ch == Pad64)
            break;

        // strchr() would match the terminating NUL of the table: a NUL
        // byte in the input must be rejected explicitly.
        pos = (ch == 0) ? 0 : strchr(Base64, ch);
        if (pos == 0) {
            /* A non-base64 character. */
            DPRINT((stderr, "base64_dec: non-base64 char at pos %d\n",
                    (int)ii));
            return false;
        }
        const int val = (int)(pos - Base64); // 6 bits value of the character

        switch (state) {
        case 0:
            out += (char)(val << 2);
            state = 1;
            break;
        case 1:
            out[io] |= val >> 4;
            out += (char)((val & 0x0f) << 4);
            io++;
            state = 2;
            break;
        case 2:
            out[io] |= val >> 2;
            out += (char)((val & 0x03) << 6);
            io++;
            state = 3;
            break;
        case 3:
            out[io] |= val;
            io++;
            state = 0;
            break;
        default:
            DPRINT((stderr, "base64_dec: internal!bad state!\n"));
            return false;
        }
    }

    /*
     * We are done decoding Base-64 chars. Let's see if we ended
     * on a byte boundary, and/or with erroneous trailing characters.
     */
    if (ch == Pad64) {          /* We got a pad char. */
        ii++;                   /* The loop stopped on it: step past it. */
        switch (state) {
        case 0:         /* Invalid = in first position */
        case 1:         /* Invalid = in second position */
            DPRINT((stderr, "base64_dec: pad char in state 0/1\n"));
            return false;

        case 2:         /* Valid, means one byte of info */
            /* Skip any number of spaces. */
            while (ii < in.length() && isspace((unsigned char)in[ii]))
                ii++;
            /* There should be another trailing = sign. */
            if (ii >= in.length() || in[ii] != Pad64) {
                DPRINT((stderr, "base64_dec: missing pad char!\n"));
                // Well, there are bad encoders out there. Let it pass
            } else {
                ii++;           /* Skip the = */
            }
            /* Fall through to "single trailing =" case. */
            /* FALLTHROUGH */

        case 3:         /* Valid, means two bytes of info */
            /* Anything but whitespace after the padding is suspicious. */
            for (; ii < in.length(); ii++) {
                if (!isspace((unsigned char)in[ii])) {
                    DPRINT((stderr, "base64_dec: non-white at eod: 0x%x\n",
                            (unsigned int)(unsigned char)in[ii]));
                    // Well, there are bad encoders out there. Let it pass
                }
            }

            /*
             * For cases 2 and 3 the "extra" bits that slopped past the
             * last full byte should be zeros.
             */
            if (out[io] != 0) {
                DPRINT((stderr, "base64_dec: bad extra bits!\n"));
                // Well, there are bad encoders out there. Let it pass
            }
            // The byte holding the extra bits was appended to out but is
            // not part of the data: drop it.
            out.resize(io);
        }
    } else {
        /*
         * We ended by seeing the end of the string.  Make sure we
         * have no partial bytes lying around.
         */
        if (state != 0) {
            DPRINT((stderr, "base64_dec: bad final state\n"));
            return false;
        }
    }

    DPRINT((stderr, "base64_dec: ret ok, io %d sz %d len %d value [%s]\n",
            io, (int)out.size(), (int)out.length(), out.c_str()));
    return true;
}
// Encode arbitrary binary data as base64.
//
// @param in  the bytes to encode (may contain any value, including NUL)
// @param out receives the encoded text, padded with '=' to a multiple of 4
//   characters. Any previous content is discarded. No line breaks are
//   inserted.
void base64_encode(const string &in, string &out)
{
    const string::size_type srclength = in.length();
    string::size_type sidx = 0;
    unsigned char input[3];

    out.erase();
    // 4 output characters for every (possibly partial) group of 3 bytes
    out.reserve(((srclength + 2) / 3) * 4);

    // Full groups: 3 input bytes -> 4 characters of 6 bits each
    while (srclength - sidx >= 3) {
        input[0] = in[sidx++];
        input[1] = in[sidx++];
        input[2] = in[sidx++];

        out += Base64[input[0] >> 2];
        out += Base64[((input[0] & 0x03) << 4) + (input[1] >> 4)];
        out += Base64[((input[1] & 0x0f) << 2) + (input[2] >> 6)];
        out += Base64[input[2] & 0x3f];
    }

    /* Now we worry about padding. */
    const string::size_type left = srclength - sidx;
    if (left != 0) {
        /* Get what's left (1 or 2 bytes), the rest counts as zeros. */
        input[0] = input[1] = input[2] = '\0';
        for (string::size_type i = 0; i < left; i++)
            input[i] = in[sidx++];

        out += Base64[input[0] >> 2];
        out += Base64[((input[0] & 0x03) << 4) + (input[1] >> 4)];
        // One byte left: 2 data chars + 2 pads. Two left: 3 + 1.
        if (left == 1)
            out += Pad64;
        else
            out += Base64[((input[1] & 0x0f) << 2) + (input[2] >> 6)];
        out += Pad64;
    }
}
#ifdef TEST_BASE64
#include <stdio.h>
// Manual check: encode a fixed sample and print the input and the result
int main(int agrc, char **argv)
{
    const string sample("12345");
    string encoded;
    base64_encode(sample, encoded);
    printf("in %s out %s\n", sample.c_str(), encoded.c_str());
}
#endif

9
src/utils/base64.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef _BASE64_H_INCLUDED_
#define _BASE64_H_INCLUDED_
/* @(#$Id: base64.h,v 1.1 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
/** Encode the bytes of in as base64 text ('=' padded) into out. */
void base64_encode(const std::string &in, std::string &out);
/**
 * Decode the base64 text in in into out. White space in the input is
 * skipped.
 * @return false if the input is not acceptable (character outside of the
 *   base64 alphabet, misplaced pad character, or input ending inside a
 *   group of 4 characters without padding), true otherwise.
 */
bool base64_decode(const std::string& in, std::string& out);
#endif /* _BASE64_H_INCLUDED_ */

318
src/utils/md5.cpp Normal file
View File

@ -0,0 +1,318 @@
/*
* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
*
* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
* rights reserved.
*
* License to copy and use this software is granted provided that it
* is identified as the "RSA Data Security, Inc. MD5 Message-Digest
* Algorithm" in all material mentioning or referencing this software
* or this function.
*
* License is also granted to make and use derivative works provided
* that such works are identified as "derived from the RSA Data
* Security, Inc. MD5 Message-Digest Algorithm" in all material
* mentioning or referencing the derived work.
*
* RSA Data Security, Inc. makes no representations concerning either
* the merchantability of this software or the suitability of this
* software for any particular purpose. It is provided "as is"
* without express or implied warranty of any kind.
*
* These notices must be retained in any copies of any part of this
* documentation and/or software.
*
* $FreeBSD: src/lib/libmd/md5c.c,v 1.11 1999/12/29 05:04:20 peter Exp $
*
* This code is the same as the code published by RSA Inc. It has been
* edited for clarity and style only.
*/
#include <string.h>
#include "md5.h"
typedef unsigned int md5uint32;
static void MD5Transform(md5uint32 [4], const unsigned char [64]);
#ifdef i386
#define Encode memcpy
#define Decode memcpy
#else /* i386 */
/*
* Encodes input (md5uint32) into output (unsigned char). Assumes len is
* a multiple of 4.
*/
static void
Encode (unsigned char *output, md5uint32 *input, unsigned int len)
{
unsigned int i, j;
for (i = 0, j = 0; j < len; i++, j += 4) {
output[j] = (unsigned char)(input[i] & 0xff);
output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
}
}
/*
 * Assemble len bytes of input into 32 bits words in output, the first byte
 * of each group of 4 being the least significant one. len is expected to be
 * a multiple of 4.
 */
static void
Decode (md5uint32 *output, const unsigned char *input, unsigned int len)
{
    const unsigned char *src = input;

    for (unsigned int done = 0; done < len; done += 4, src += 4) {
        md5uint32 word = 0;
        /* Start from the high order byte and shift the others in */
        for (int b = 3; b >= 0; --b)
            word = (word << 8) | src[b];
        *output++ = word;
    }
}
#endif /* i386 */
/*
 * Message padding: a single 0x80 byte followed by zeros. The elements
 * without an explicit initializer are zero-initialized.
 */
static unsigned char PADDING[64] = { 0x80 };
/*
 * F, G, H and I are basic MD5 functions.
 * Every use of an argument is parenthesized, including under the ~
 * operator, so that the macros stay correct when called with arbitrary
 * expressions.
 */
#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~(z))))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~(z))))

/* ROTATE_LEFT rotates x left n bits. x must be a 32 bits unsigned value. */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

/*
 * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
 * Rotation is separate from addition to prevent recomputation.
 * Each one updates its first argument (a) in place, from the three other
 * state words (b, c, d), a message word (x), a rotation count (s) and an
 * additive constant (ac).
 */
#define FF(a, b, c, d, x, s, ac) { \
    (a) += F ((b), (c), (d)) + (x) + (md5uint32)(ac); \
    (a) = ROTATE_LEFT ((a), (s)); \
    (a) += (b); \
    }
#define GG(a, b, c, d, x, s, ac) { \
    (a) += G ((b), (c), (d)) + (x) + (md5uint32)(ac); \
    (a) = ROTATE_LEFT ((a), (s)); \
    (a) += (b); \
    }
#define HH(a, b, c, d, x, s, ac) { \
    (a) += H ((b), (c), (d)) + (x) + (md5uint32)(ac); \
    (a) = ROTATE_LEFT ((a), (s)); \
    (a) += (b); \
    }
#define II(a, b, c, d, x, s, ac) { \
    (a) += I ((b), (c), (d)) + (x) + (md5uint32)(ac); \
    (a) = ROTATE_LEFT ((a), (s)); \
    (a) += (b); \
    }
/*
 * MD5 initialization. Starts a new MD5 computation: the bit count is reset
 * and the state is loaded with the algorithm's initial values.
 */
void
MD5Init (MD5_CTX *context)
{
    /* Magic initialization constants (A, B, C, D) */
    static const unsigned int initial_state[4] = {
        0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
    };

    for (int k = 0; k < 4; k++)
        context->state[k] = initial_state[k];

    context->count[0] = 0;
    context->count[1] = 0;
}
/*
 * MD5 block update operation. Feeds inputLen more bytes of message into the
 * computation: complete 64 bytes blocks are transformed right away, what
 * remains is kept in the context buffer for the next call.
 */
void
MD5Update (MD5_CTX *context, const unsigned char *input, unsigned int inputLen)
{
    /* Number of bytes already waiting in the buffer (byte count mod 64) */
    unsigned int index = (unsigned int)((context->count[0] >> 3) & 0x3F);

    /* Add the new bits to the 64 bits counter, kept as two 32 bits words */
    const md5uint32 addedBits = (md5uint32)inputLen << 3;
    context->count[0] += addedBits;
    if (context->count[0] < addedBits)
        context->count[1]++;            /* the low word wrapped: carry */
    context->count[1] += ((md5uint32)inputLen >> 29);

    /* Room left in the buffer */
    const unsigned int partLen = 64 - index;

    unsigned int i = 0;
    if (inputLen >= partLen) {
        /* Complete the buffered block and process it... */
        memcpy((void *)&context->buffer[index], (const void *)input,
               partLen);
        MD5Transform (context->state, context->buffer);

        /* ...then process as many full blocks as possible from the input */
        i = partLen;
        while (i + 63 < inputLen) {
            MD5Transform (context->state, &input[i]);
            i += 64;
        }
        index = 0;
    }

    /* Buffer remaining input */
    memcpy ((void *)&context->buffer[index], (const void *)&input[i],
            inputLen - i);
}
/*
 * MD5 padding. Extends the message with the padding bytes, then with its
 * original length in bits.
 */
void
MD5Pad (MD5_CTX *context)
{
    /* The length must be saved first: feeding the padding changes it */
    unsigned char bits[8];
    Encode (bits, context->count, 8);

    /* Pad out to 56 mod 64, so that the 8 length bytes complete a block */
    const unsigned int used = (unsigned int)((context->count[0] >> 3) & 0x3f);
    unsigned int padLen;
    if (used < 56)
        padLen = 56 - used;
    else
        padLen = 120 - used;
    MD5Update (context, PADDING, padLen);

    /* Append length (before padding) */
    MD5Update (context, bits, 8);
}
/*
* MD5 finalization. Ends an MD5 message-digest operation, writing the
* the message digest and zeroizing the context.
*/
void
MD5Final (unsigned char digest[16],MD5_CTX *context)
{
/* Do padding. */
MD5Pad (context);
/* Store state in digest */
Encode (digest, context->state, 16);
/* Zeroize sensitive information. */
memset ((void *)context, 0, sizeof (*context));
}
/*
 * MD5 basic transformation. Transforms state based on block.
 *
 * state is the running 4 words state (updated in place), block is one 64
 * bytes block of message. The block is first converted to 16 words (x),
 * then 4 rounds of 16 operations each are applied to working copies of the
 * state words (a, b, c, d), and the results are finally added back into
 * state. The numbered comments at the end of the lines count the 64
 * operations.
 */
static void
MD5Transform (md5uint32 state[4], const unsigned char block[64])
{
md5uint32 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
/* Convert the 64 input bytes to 16 words */
Decode (x, block, 64);
/* Round 1 */
/* S11..S14: the rotation counts used in round 1 */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
/* Round 2 */
/* S21..S24: the rotation counts used in round 2 */
#define S21 5
#define S22 9
#define S23 14
#define S24 20
GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
/* Round 3 */
/* S31..S34: the rotation counts used in round 3 */
#define S31 4
#define S32 11
#define S33 16
#define S34 23
HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
/* Round 4 */
/* S41..S44: the rotation counts used in round 4 */
#define S41 6
#define S42 10
#define S43 15
#define S44 21
II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
/* Add the result of the 4 rounds to the state we started from */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
/* Zeroize sensitive information. */
memset ((void *)x, 0, sizeof (x));
}

43
src/utils/md5.h Normal file
View File

@ -0,0 +1,43 @@
#ifndef _MD5_H_
#define _MD5_H_
/* MD5.H - header file for MD5C.C
* Id: md5.h,v 1.6.2.1 1998/02/18 02:28:14 jkh Exp $
*/
/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
rights reserved.
License to copy and use this software is granted provided that it
is identified as the "RSA Data Security, Inc. MD5 Message-Digest
Algorithm" in all material mentioning or referencing this software
or this function.
License is also granted to make and use derivative works provided
that such works are identified as "derived from the RSA Data
Security, Inc. MD5 Message-Digest Algorithm" in all material
mentioning or referencing the derived work.
RSA Data Security, Inc. makes no representations concerning either
the merchantability of this software or the suitability of this
software for any particular purpose. It is provided "as is"
without express or implied warranty of any kind.
These notices must be retained in any copies of any part of this
documentation and/or software.
*/
/* The linkage specification only exists in C++: keep the header usable
   from plain C too. */
#ifdef __cplusplus
extern "C" {
#endif

/* MD5 context. */
typedef struct MD5Context {
    unsigned int state[4];      /* state (ABCD) */
    unsigned int count[2];      /* number of bits, modulo 2^64 (lsb first) */
    unsigned char buffer[64];   /* input buffer */
} MD5_CTX;

/* Start a new computation */
void MD5Init (MD5_CTX *);
/* Add the given number of bytes of data to the computation */
void MD5Update (MD5_CTX *, const unsigned char *, unsigned int);
/* Terminate the computation, store the 16 bytes digest, wipe the context */
void MD5Final (unsigned char [16], MD5_CTX *);

#ifdef __cplusplus
}
#endif
#endif /* _MD5_H_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.5 2005-10-31 08:59:05 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.6 2005-11-06 11:16:53 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_MIMEPARSE
@ -10,13 +10,7 @@ static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.5 2005-10-31 08:59:05 dockes Ex
#include <ctype.h>
#include "mimeparse.h"
//#define DEBUG_MIMEPARSE
#ifdef DEBUG_MIMEPARSE
#define DPRINT(X) fprintf X
#else
#define DPRINT(X)
#endif
#include "base64.h"
using namespace std;
@ -251,131 +245,6 @@ bool qp_decode(const string& in, string &out)
}
// This is adapted from FreeBSD's code.
static const char Base64[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char Pad64 = '=';
bool base64_decode(const string& in, string& out)
{
int io = 0, state = 0, ch;
char *pos;
unsigned int ii = 0;
out.reserve(in.length());
for (ii = 0; ii < in.length(); ii++) {
ch = in[ii];
if (isspace((unsigned char)ch)) /* Skip whitespace anywhere. */
continue;
if (ch == Pad64)
break;
pos = strchr(Base64, ch);
if (pos == 0) {
/* A non-base64 character. */
DPRINT((stderr, "base64_dec: non-base64 char at pos %d\n", ii));
return false;
}
switch (state) {
case 0:
out += (pos - Base64) << 2;
state = 1;
break;
case 1:
out[io] |= (pos - Base64) >> 4;
out += ((pos - Base64) & 0x0f) << 4 ;
io++;
state = 2;
break;
case 2:
out[io] |= (pos - Base64) >> 2;
out += ((pos - Base64) & 0x03) << 6;
io++;
state = 3;
break;
case 3:
out[io] |= (pos - Base64);
io++;
state = 0;
break;
default:
DPRINT((stderr, "base64_dec: internal!bad state!\n"));
return false;
}
}
/*
* We are done decoding Base-64 chars. Let's see if we ended
* on a byte boundary, and/or with erroneous trailing characters.
*/
if (ch == Pad64) { /* We got a pad char. */
ch = in[ii++]; /* Skip it, get next. */
switch (state) {
case 0: /* Invalid = in first position */
case 1: /* Invalid = in second position */
DPRINT((stderr, "base64_dec: pad char in state 0/1\n"));
return false;
case 2: /* Valid, means one byte of info */
/* Skip any number of spaces. */
for (; ii < in.length(); ch = in[ii++])
if (!isspace((unsigned char)ch))
break;
/* Make sure there is another trailing = sign. */
if (ch != Pad64) {
DPRINT((stderr, "base64_dec: missing pad char!\n"));
// Well, there are bad encoders out there. Let it pass
// return false;
}
ch = in[ii++]; /* Skip the = */
/* Fall through to "single trailing =" case. */
/* FALLTHROUGH */
case 3: /* Valid, means two bytes of info */
/*
* We know this char is an =. Is there anything but
* whitespace after it?
*/
for ((void)NULL; ii < in.length(); ch = in[ii++])
if (!isspace((unsigned char)ch)) {
DPRINT((stderr, "base64_dec: non-white at eod: 0x%x\n",
(unsigned int)ch));
// Well, there are bad encoders out there. Let it pass
//return false;
}
/*
* Now make sure for cases 2 and 3 that the "extra"
* bits that slopped past the last full byte were
* zeros. If we don't check them, they become a
* subliminal channel.
*/
if (out[io] != 0) {
DPRINT((stderr, "base64_dec: bad extra bits!\n"));
// Well, there are bad encoders out there. Let it pass
out[io] = 0;
// return false;
}
}
} else {
/*
* We ended by seeing the end of the string. Make sure we
* have no partial bytes lying around.
*/
if (state != 0) {
DPRINT((stderr, "base64_dec: bad final state\n"));
return false;
}
}
DPRINT((stderr, "base64_dec: ret ok, io %d sz %d len %d value [%s]\n",
io, out.size(), out.length(), out.c_str()));
return true;
}
#include "transcode.h"
#include "smallut.h"