From 1dd66b5b1df44c99ce5b34a444df6f6f41c0e9d8 Mon Sep 17 00:00:00 2001
From: dockes
Date: Mon, 28 Jul 2008 10:20:20 +0000
Subject: [PATCH] *** empty log message ***

---
 src/utils/fileudi.cpp | 132 ++++++++++++++++++++++++++++++++++++++++++
 src/utils/fileudi.h   |  33 +++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 src/utils/fileudi.cpp
 create mode 100644 src/utils/fileudi.h

diff --git a/src/utils/fileudi.cpp b/src/utils/fileudi.cpp
new file mode 100644
index 00000000..79966037
--- /dev/null
+++ b/src/utils/fileudi.cpp
@@ -0,0 +1,132 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: fileudi.cpp,v 1.1 2008-07-28 10:20:20 dockes Exp $ (C) 2005 J.F.Dockes";
+#endif
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef TEST_FILEUDI
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "fileudi.h"
+#include "md5.h"
+#include "base64.h"
+
+using std::string;
+
+// Debug only: store the 32 lowercase hex digits of a 16-byte md5 into out
+#ifdef PATHHASH_HEX
+static void md5hexprint(const unsigned char hash[16], string &out)
+{
+    out.erase();
+    out.reserve(33);
+    static const char hex[]="0123456789abcdef";
+    for (int i = 0; i < 16; i++) {
+        out.append(1, hex[hash[i] >> 4]);
+        out.append(1, hex[hash[i] & 0x0f]);
+    }
+}
+#endif
+
+// Size of the hashed result (base64 of 16 bytes of md5, minus 2 pad chars)
+#define HASHLEN 22
+
+// Shorten an overlong path so that the result fits in maxlen chars.
+//  path:   the input string.
+//  phash:  output: path itself if path.length() <= maxlen, else the first
+//          (maxlen - HASHLEN) chars of path followed by the unpadded base64
+//          md5 (HASHLEN chars) of the rest of path: maxlen chars in all.
+//  maxlen: maximum output length. We abort() if it is less than HASHLEN.
+void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
+{
+    if (maxlen < HASHLEN) {
+        fprintf(stderr, "pathHash: internal error: requested len too small\n");
+        abort();
+    }
+    if (path.length() <= maxlen) {
+        phash = path;
+        return;
+    }
+
+    // Compute the md5 of the tail of path which gets replaced by the hash
+    const unsigned int keep = maxlen - HASHLEN;
+    unsigned char chash[16];
+    MD5_CTX ctx;
+    MD5Init(&ctx);
+    MD5Update(&ctx, reinterpret_cast<const unsigned char *>(path.c_str()) + keep,
+              path.length() - keep);
+    MD5Final(chash, &ctx);
+
+#ifdef PATHHASH_HEX
+    string hex;
+    md5hexprint(chash, hex);
+    printf("hex [%s]\n", hex.c_str());
+#endif
+
+    // Encode it to ascii. This shouldn't be strictly necessary as
+    // xapian terms can be binary
+    string hash;
+    base64_encode(string(reinterpret_cast<const char *>(chash), sizeof(chash)),
+                  hash);
+    // We happen to know there will be 2 pad chars in there, that we
+    // don't need as this won't ever be decoded. Resulting length is 22
+    hash.resize(hash.length() - 2);
+
+    // Truncate path and append hash
+    phash = path.substr(0, keep) + hash;
+}
+
+// Maximum length for path/unique terms stored for each document. We truncate
+// longer paths and uniquize them by appending a hashed value. This
+// is done to avoid xapian max term length limitations, not
+// to gain space (we gain very little even with very short maxlens
+// like 30). The xapian max key length seems to be around 250.
+// The value for PATHHASHLEN includes the length of the hash part.
+#define PATHHASHLEN 150
+
+// Compute the unique term used to link documents to their file-system source:
+// "fn|ipath", shortened by pathHash() if longer than PATHHASHLEN chars
+void make_udi(const string& fn, const string& ipath, string &udi)
+{
+    string s(fn);
+    s.append("|");
+    s.append(ipath);
+    pathHash(s, udi, PATHHASHLEN);
+}
+
+#else // TEST_FILEUDI
+#include <stdio.h>
+#include <string>
+#include "fileudi.h"
+
+using std::string;
+
+int main()
+{
+    string path="/usr/lib/toto.cpp";
+    string ipath = "1:2:3:4:5:10";
+    string udi;
+    make_udi(path, ipath, udi);
+    printf("udi [%s]\n", udi.c_str()); // short input: "path|ipath" as is
+    path = "/some/much/too/looooooooooooooong/path/bla/bla/bla"
+        "/looooooooooooooong/path/bla/bla/bla/llllllllllllllllll"
+        "/looooooooooooooong/path/bla/bla/bla/llllllllllllllllll";
+    ipath = "1:2:3:4:5:10" "1:2:3:4:5:10" "1:2:3:4:5:10";
+    make_udi(path, ipath, udi);
+    printf("udi [%s]\n", udi.c_str()); // long input: truncated, hash at end
+}
+#endif // TEST_FILEUDI
diff --git a/src/utils/fileudi.h b/src/utils/fileudi.h
new file mode 100644
index 00000000..368ae1d5
--- /dev/null
+++ b/src/utils/fileudi.h
@@ -0,0 +1,33 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef _FILEUDI_H_INCLUDED_
+#define _FILEUDI_H_INCLUDED_
+/* @(#$Id: fileudi.h,v 1.1 2008-07-28 10:20:20 dockes Exp $ (C) 2004 J.F.Dockes */
+#include <string>
+using std::string;
+
+// Unique Document Ids for the file-based indexer (main Recoll
+// indexer). make_udi() joins the file path and the internal path (ie:
+// email number inside folder/attachment number/etc.) as "fn|ipath".
+// As the size of Xapian terms is limited, a joined string longer than
+// the maximum length is truncated, and completed by a hash of the
+// part which was cut off. So a long unique id looks like:
+//     /some/truncated/paHASHVALUE
+
+extern void make_udi(const string& fn, const string& ipath, string &udi);
+
+#endif /* _FILEUDI_H_INCLUDED_ */