implemented a cache for mbox message header offsets

This commit is contained in:
dockes 2009-11-27 12:41:05 +00:00
parent d5c80cdf16
commit 2ad0326ed7
6 changed files with 210 additions and 8 deletions

View File

@ -67,8 +67,10 @@ namespace Dijon
* - DEFAULT_CHARSET is the charset preferred by the client application.
* The filter will convert document's content to this charset if possible.
* - OPERATING_MODE can be set to either view or index.
* - DJF_UDI Unique document identifier. This can be useful if the
* filter wants to manage a persistent cache.
*/
typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE } Properties;
typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties;
// Information.

View File

@ -183,6 +183,12 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
{
m_fn = f;
// This is used by filters which manage some kind of cache.
// Indexing by udi makes things easier (because they sometimes get a temp
// as input
string udi;
make_udi(f, "", udi);
cnf->setKeyDir(path_getfather(m_fn));
string l_mime;
@ -259,6 +265,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
string charset = m_cfg->getDefCharset();
df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
df->set_property(Dijon::Filter::DJF_UDI, udi);
#ifdef RCL_USE_XATTR
// Get fields computed from extended attributes. We use the

View File

@ -28,15 +28,175 @@ static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.5 2008-10-04 14:26:59 dockes Exp
#include <map>
#include <sstream>
#include <fstream>
#include "mimehandler.h"
#include "debuglog.h"
#include "readfile.h"
#include "mh_mbox.h"
#include "smallut.h"
#include "rclconfig.h"
#include "md5.h"
#include "conftree.h"
using namespace std;
/**
* Handles a cache for message numbers to offset translations. Permits direct
* accesses inside big folders instead of having to scan up to the right place
*
* Message offsets are saved to files stored under cfg(mboxcachedir), default
* confdir/mboxcache. Mbox files smaller than cfg(mboxcacheminmbs) are not
* cached.
* Cache files are named as the md5 of the file UDI, which is kept in
* the first block for possible collision detection. The 64 bits
* offsets for all message "From_" lines follow. The format is purely
* binary, values are not even byte-swapped to be proc-idependant.
*/
class MboxCache {
public:
typedef MimeHandlerMbox::mbhoff_type mbhoff_type;
MboxCache()
: m_ok(false), m_minfsize(0)
{
// Can't access rclconfig here, we're a static object, would
// have to make sure it's initialized.
}
~MboxCache() {}
mbhoff_type get_offset(const string& udi, int msgnum)
{
if (!ok())
return -1;
string fn = makefilename(udi);
ifstream input(fn.c_str(), ios::in | ios::binary);
if (!input.is_open())
return -1;
char blk1[o_b1size];
input.read(blk1, o_b1size);
if (!input)
return -1;
ConfSimple cf(string(blk1, o_b1size));
string fudi;
if (!cf.get("udi", fudi) || fudi.compare(udi)) {
LOGINFO(("MboxCache::get_offset:badudi fn %s udi [%s], fudi [%s]\n",
fn.c_str(), udi.c_str(), fudi.c_str()));
input.close();
return -1;
}
input.seekg(cacheoffset(msgnum));
if (!input) {
LOGINFO(("MboxCache::get_offset: fn %s, seek(%ld) failed\n",
fn.c_str(), cacheoffset(msgnum)));
input.close();
return -1;
}
mbhoff_type offset = -1;
input.read((char *)&offset, sizeof(mbhoff_type));
input.close();
return offset;
}
// Save array of offsets for a given file, designated by Udi
void put_offsets(const string& udi, mbhoff_type fsize,
vector<mbhoff_type>& offs)
{
LOGDEB0(("MboxCache::put_offsets: %u offsets\n", offs.size()));
if (!ok() || !maybemakedir())
return;
if (fsize < m_minfsize)
return;
string fn = makefilename(udi);
ofstream output(fn.c_str(), ios::out|ios::trunc|ios::binary);
if (!output.is_open())
return;
string blk1;
blk1.append("udi=");
blk1.append(udi);
blk1.append("\n");
blk1.resize(o_b1size, 0);
output << blk1;
if (!output.good())
return;
for (vector<mbhoff_type>::const_iterator it = offs.begin();
it != offs.end(); it++) {
mbhoff_type off = *it;
output.write((char*)&off, sizeof(mbhoff_type));
if (!output.good()) {
output.close();
return;
}
}
output.close();
}
// Check state, possibly initialize
bool ok() {
if (m_minfsize == -1)
return false;
if (!m_ok) {
RclConfig *config = RclConfig::getMainConfig();
if (config == 0)
return false;
int minmbs = 10;
config->getConfParam("mboxcacheminmbs", &minmbs);
if (minmbs < 0) {
// minmbs set to negative to disable cache
m_minfsize = -1;
return false;
}
m_minfsize = minmbs * 1000 * 1000;
config->getConfParam("mboxcachedir", m_dir);
if (m_dir.empty())
m_dir = "mboxcache";
m_dir = path_tildexpand(m_dir);
// If not an absolute path, compute relative to config dir
if (m_dir.at(0) != '/')
m_dir = path_cat(config->getConfDir(), m_dir);
m_ok = true;
}
return m_ok;
}
private:
bool m_ok;
// Place where we store things
string m_dir;
// Don't cache smaller files. If -1, don't do anything.
mbhoff_type m_minfsize;
static const int o_b1size;
// Create the cache directory if it does not exist
bool maybemakedir()
{
struct stat st;
if (stat(m_dir.c_str(), &st) != 0 && mkdir(m_dir.c_str(), 0700) != 0) {
return false;
}
return true;
}
// Compute file name from udi
string makefilename(const string& udi)
{
string digest, xdigest;
MD5String(udi, digest);
MD5HexPrint(digest, xdigest);
return path_cat(m_dir, xdigest);
}
// Compute offset in cache file for the mbox offset of msgnum
mbhoff_type cacheoffset(int msgnum)
{// Msgnums are from 1
return o_b1size + (msgnum-1) * sizeof(mbhoff_type);
}
};
const int MboxCache::o_b1size = 1024;
static class MboxCache mcache;
MimeHandlerMbox::~MimeHandlerMbox()
{
clear();
@ -51,6 +211,7 @@ void MimeHandlerMbox::clear()
}
m_msgnum = m_lineno = 0;
m_ipath.erase();
m_offsets.clear();
RecollFilter::clear();
}
@ -70,7 +231,11 @@ bool MimeHandlerMbox::set_document_file(const string &fn)
fn.c_str()));
return false;
}
fseek((FILE *)m_vfp, 0, SEEK_END);
m_fsize = ftell((FILE*)m_vfp);
fseek((FILE*)m_vfp, 0, SEEK_SET);
m_havedoc = true;
m_offsets.clear();
return true;
}
@ -186,8 +351,22 @@ bool MimeHandlerMbox::next_document()
// we're ever used in this way (multiple retrieves on same
// object). So:
if (mtarg > 0) {
fseek(fp, 0, SEEK_SET);
m_msgnum = 0;
mbhoff_type off;
line_type line;
LOGDEB0(("MimeHandlerMbox::next_doc: mtarg %d m_udi[%s]\n",
mtarg, m_udi.c_str()));
if (!m_udi.empty() &&
(off = mcache.get_offset(m_udi, mtarg)) >= 0 &&
fseeko(fp, (off_t)off, SEEK_SET) >= 0 &&
fgets(line, LL, fp) &&
!regexec(&fromregex, line, 0, 0, 0)) {
LOGDEB0(("MimeHandlerMbox: Cache: From_ Ok\n"));
fseeko(fp, (off_t)off, SEEK_SET);
m_msgnum = mtarg -1;
} else {
fseek(fp, 0, SEEK_SET);
m_msgnum = 0;
}
}
off_t start, end;
@ -200,6 +379,7 @@ bool MimeHandlerMbox::next_document()
// line after this
line_type line;
for (;;) {
mbhoff_type off_From = ftello(fp);
if (!fgets(line, LL, fp)) {
// Eof hit while looking for 'From ' -> file done. We'd need
// another return code here
@ -217,9 +397,10 @@ bool MimeHandlerMbox::next_document()
continue;
}
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
LOGDEB0(("MimeHandlerMbox: From_ at line %d: [%s]\n",
m_lineno, line));
LOGDEB0(("MimeHandlerMbox: msgnum %d, From_ at line %d: [%s]\n",
m_msgnum, m_lineno, line));
start = ftello(fp);
m_offsets.push_back(off_From);
m_msgnum++;
break;
}
@ -267,6 +448,9 @@ bool MimeHandlerMbox::next_document()
if (iseof) {
LOGDEB2(("MimeHandlerMbox::next: eof hit\n"));
m_havedoc = false;
if (!m_udi.empty()) {
mcache.put_offsets(m_udi, m_fsize, m_offsets);
}
}
return msgtxt.empty() ? false : true;
}

View File

@ -19,7 +19,9 @@
/* @(#$Id: mh_mbox.h,v 1.3 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <vector>
using std::string;
using std::vector;
#include "mimehandler.h"
@ -31,7 +33,7 @@ using std::string;
class MimeHandlerMbox : public RecollFilter {
public:
MimeHandlerMbox(const string& mime)
: RecollFilter(mime), m_vfp(0), m_msgnum(0), m_lineno(0)
: RecollFilter(mime), m_vfp(0), m_msgnum(0), m_lineno(0), m_fsize(0)
{}
virtual ~MimeHandlerMbox();
virtual bool set_document_file(const string &file_path);
@ -41,12 +43,15 @@ class MimeHandlerMbox : public RecollFilter {
return true;
}
virtual void clear();
typedef long long mbhoff_type;
private:
string m_fn; // File name
void *m_vfp; // File pointer for folder
int m_msgnum; // Current message number in folder. Starts at 1
string m_ipath;
int m_lineno; // debug
mbhoff_type m_fsize;
vector<mbhoff_type> m_offsets;
};
#endif /* _MBOX_H_INCLUDED_ */

View File

@ -36,6 +36,9 @@ public:
virtual ~RecollFilter() {}
virtual bool set_property(Properties p, const string &v) {
switch (p) {
case DJF_UDI:
m_udi = v;
break;
case DEFAULT_CHARSET:
m_defcharset = v;
break;
@ -94,6 +97,7 @@ protected:
string m_defcharset;
string m_reason;
bool m_havedoc;
string m_udi; // May be set by creator as a hint
};
/**

View File

@ -79,7 +79,7 @@ bool RclDHistoryEntry::decode(const string &value)
// Old style entry found, make an udi, using the fs udi maker
make_udi(fn, ipath, udi);
}
LOGDEB(("RclDHistoryEntry::decode: udi [%s]\n", udi.c_str()));
LOGDEB1(("RclDHistoryEntry::decode: udi [%s]\n", udi.c_str()));
return true;
}
@ -91,7 +91,7 @@ bool RclDHistoryEntry::equal(const DynConfEntry& other)
bool historyEnterDoc(RclDynConf *dncf, const string& udi)
{
LOGDEB(("historyEnterDoc: [%s] into %s\n",
LOGDEB1(("historyEnterDoc: [%s] into %s\n",
udi.c_str(), dncf->getFilename().c_str()));
RclDHistoryEntry ne(time(0), udi);
RclDHistoryEntry scratch;