diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index 846eff7d..be1e897f 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -67,8 +67,10 @@ namespace Dijon * - DEFAULT_CHARSET is the charset preferred by the client application. * The filter will convert document's content to this charset if possible. * - OPERATING_MODE can be set to either view or index. + * - DJF_UDI Unique document identifier. This can be useful if the + * filter wants to manage a persistent cache. */ - typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE } Properties; + typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE, DJF_UDI } Properties; // Information. diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index c12d9d4e..6854f96b 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -183,6 +183,12 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, { m_fn = f; + // This is used by filters which manage some kind of cache. + // Indexing by udi makes things easier (because they sometimes get a temp + // as input + string udi; + make_udi(f, "", udi); + cnf->setKeyDir(path_getfather(m_fn)); string l_mime; @@ -259,6 +265,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, string charset = m_cfg->getDefCharset(); df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); + df->set_property(Dijon::Filter::DJF_UDI, udi); #ifdef RCL_USE_XATTR // Get fields computed from extended attributes. We use the diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp index 2e4f7d66..3e6033ab 100644 --- a/src/internfile/mh_mbox.cpp +++ b/src/internfile/mh_mbox.cpp @@ -28,15 +28,175 @@ static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.5 2008-10-04 14:26:59 dockes Exp #include #include +#include #include "mimehandler.h" #include "debuglog.h" #include "readfile.h" #include "mh_mbox.h" #include "smallut.h" +#include "rclconfig.h" +#include "md5.h" +#include "conftree.h" using namespace std; +/** + * Handles a cache for message numbers to offset translations. Permits direct + * accesses inside big folders instead of having to scan up to the right place + * + * Message offsets are saved to files stored under cfg(mboxcachedir), default + * confdir/mboxcache. Mbox files smaller than cfg(mboxcacheminmbs) are not + * cached. + * Cache files are named as the md5 of the file UDI, which is kept in + * the first block for possible collision detection. The 64 bits + * offsets for all message "From_" lines follow. The format is purely + * binary, values are not even byte-swapped to be proc-idependant. + */ +class MboxCache { +public: + typedef MimeHandlerMbox::mbhoff_type mbhoff_type; + MboxCache() + : m_ok(false), m_minfsize(0) + { + // Can't access rclconfig here, we're a static object, would + // have to make sure it's initialized. + } + + ~MboxCache() {} + + mbhoff_type get_offset(const string& udi, int msgnum) + { + if (!ok()) + return -1; + string fn = makefilename(udi); + ifstream input(fn.c_str(), ios::in | ios::binary); + if (!input.is_open()) + return -1; + char blk1[o_b1size]; + input.read(blk1, o_b1size); + if (!input) + return -1; + ConfSimple cf(string(blk1, o_b1size)); + string fudi; + if (!cf.get("udi", fudi) || fudi.compare(udi)) { + LOGINFO(("MboxCache::get_offset:badudi fn %s udi [%s], fudi [%s]\n", + fn.c_str(), udi.c_str(), fudi.c_str())); + input.close(); + return -1; + } + input.seekg(cacheoffset(msgnum)); + if (!input) { + LOGINFO(("MboxCache::get_offset: fn %s, seek(%ld) failed\n", + fn.c_str(), cacheoffset(msgnum))); + input.close(); + return -1; + } + mbhoff_type offset = -1; + input.read((char *)&offset, sizeof(mbhoff_type)); + input.close(); + return offset; + } + + // Save array of offsets for a given file, designated by Udi + void put_offsets(const string& udi, mbhoff_type fsize, + vector& offs) + { + LOGDEB0(("MboxCache::put_offsets: %u offsets\n", offs.size())); + if (!ok() || !maybemakedir()) + return; + if (fsize < m_minfsize) + return; + string fn = makefilename(udi); + ofstream output(fn.c_str(), ios::out|ios::trunc|ios::binary); + if (!output.is_open()) + return; + string blk1; + blk1.append("udi="); + blk1.append(udi); + blk1.append("\n"); + blk1.resize(o_b1size, 0); + output << blk1; + if (!output.good()) + return; + for (vector::const_iterator it = offs.begin(); + it != offs.end(); it++) { + mbhoff_type off = *it; + output.write((char*)&off, sizeof(mbhoff_type)); + if (!output.good()) { + output.close(); + return; + } + } + output.close(); + } + + // Check state, possibly initialize + bool ok() { + if (m_minfsize == -1) + return false; + if (!m_ok) { + RclConfig *config = RclConfig::getMainConfig(); + if (config == 0) + return false; + int minmbs = 10; + config->getConfParam("mboxcacheminmbs", &minmbs); + if (minmbs < 0) { + // minmbs set to negative to disable cache + m_minfsize = -1; + return false; + } + m_minfsize = minmbs * 1000 * 1000; + + config->getConfParam("mboxcachedir", m_dir); + if (m_dir.empty()) + m_dir = "mboxcache"; + m_dir = path_tildexpand(m_dir); + // If not an absolute path, compute relative to config dir + if (m_dir.at(0) != '/') + m_dir = path_cat(config->getConfDir(), m_dir); + m_ok = true; + } + return m_ok; + } + +private: + bool m_ok; + + // Place where we store things + string m_dir; + // Don't cache smaller files. If -1, don't do anything. + mbhoff_type m_minfsize; + static const int o_b1size; + + // Create the cache directory if it does not exist + bool maybemakedir() + { + struct stat st; + if (stat(m_dir.c_str(), &st) != 0 && mkdir(m_dir.c_str(), 0700) != 0) { + return false; + } + return true; + } + // Compute file name from udi + string makefilename(const string& udi) + { + string digest, xdigest; + MD5String(udi, digest); + MD5HexPrint(digest, xdigest); + return path_cat(m_dir, xdigest); + } + + // Compute offset in cache file for the mbox offset of msgnum + mbhoff_type cacheoffset(int msgnum) + {// Msgnums are from 1 + return o_b1size + (msgnum-1) * sizeof(mbhoff_type); + } +}; + +const int MboxCache::o_b1size = 1024; +static class MboxCache mcache; + MimeHandlerMbox::~MimeHandlerMbox() { clear(); @@ -51,6 +211,7 @@ void MimeHandlerMbox::clear() } m_msgnum = m_lineno = 0; m_ipath.erase(); + m_offsets.clear(); RecollFilter::clear(); } @@ -70,7 +231,11 @@ bool MimeHandlerMbox::set_document_file(const string &fn) fn.c_str())); return false; } + fseek((FILE *)m_vfp, 0, SEEK_END); + m_fsize = ftell((FILE*)m_vfp); + fseek((FILE*)m_vfp, 0, SEEK_SET); m_havedoc = true; + m_offsets.clear(); return true; } @@ -186,8 +351,22 @@ bool MimeHandlerMbox::next_document() // we're ever used in this way (multiple retrieves on same // object). So: if (mtarg > 0) { - fseek(fp, 0, SEEK_SET); - m_msgnum = 0; + mbhoff_type off; + line_type line; + LOGDEB0(("MimeHandlerMbox::next_doc: mtarg %d m_udi[%s]\n", + mtarg, m_udi.c_str())); + if (!m_udi.empty() && + (off = mcache.get_offset(m_udi, mtarg)) >= 0 && + fseeko(fp, (off_t)off, SEEK_SET) >= 0 && + fgets(line, LL, fp) && + !regexec(&fromregex, line, 0, 0, 0)) { + LOGDEB0(("MimeHandlerMbox: Cache: From_ Ok\n")); + fseeko(fp, (off_t)off, SEEK_SET); + m_msgnum = mtarg -1; + } else { + fseek(fp, 0, SEEK_SET); + m_msgnum = 0; + } } off_t start, end; @@ -200,6 +379,7 @@ bool MimeHandlerMbox::next_document() // line after this line_type line; for (;;) { + mbhoff_type off_From = ftello(fp); if (!fgets(line, LL, fp)) { // Eof hit while looking for 'From ' -> file done. We'd need // another return code here @@ -217,9 +397,10 @@ bool MimeHandlerMbox::next_document() continue; } if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { - LOGDEB0(("MimeHandlerMbox: From_ at line %d: [%s]\n", - m_lineno, line)); + LOGDEB0(("MimeHandlerMbox: msgnum %d, From_ at line %d: [%s]\n", + m_msgnum, m_lineno, line)); start = ftello(fp); + m_offsets.push_back(off_From); m_msgnum++; break; } @@ -267,6 +448,9 @@ bool MimeHandlerMbox::next_document() if (iseof) { LOGDEB2(("MimeHandlerMbox::next: eof hit\n")); m_havedoc = false; + if (!m_udi.empty()) { + mcache.put_offsets(m_udi, m_fsize, m_offsets); + } } return msgtxt.empty() ? false : true; } diff --git a/src/internfile/mh_mbox.h b/src/internfile/mh_mbox.h index 72cd4940..8c29e6b3 100644 --- a/src/internfile/mh_mbox.h +++ b/src/internfile/mh_mbox.h @@ -19,7 +19,9 @@ /* @(#$Id: mh_mbox.h,v 1.3 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include using std::string; +using std::vector; #include "mimehandler.h" @@ -31,7 +33,7 @@ using std::string; class MimeHandlerMbox : public RecollFilter { public: MimeHandlerMbox(const string& mime) - : RecollFilter(mime), m_vfp(0), m_msgnum(0), m_lineno(0) + : RecollFilter(mime), m_vfp(0), m_msgnum(0), m_lineno(0), m_fsize(0) {} virtual ~MimeHandlerMbox(); virtual bool set_document_file(const string &file_path); @@ -41,12 +43,15 @@ class MimeHandlerMbox : public RecollFilter { return true; } virtual void clear(); + typedef long long mbhoff_type; private: string m_fn; // File name void *m_vfp; // File pointer for folder int m_msgnum; // Current message number in folder. Starts at 1 string m_ipath; int m_lineno; // debug + mbhoff_type m_fsize; + vector m_offsets; }; #endif /* _MBOX_H_INCLUDED_ */ diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index f4480a3f..7866872b 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -36,6 +36,9 @@ public: virtual ~RecollFilter() {} virtual bool set_property(Properties p, const string &v) { switch (p) { + case DJF_UDI: + m_udi = v; + break; case DEFAULT_CHARSET: m_defcharset = v; break; @@ -94,6 +97,7 @@ protected: string m_defcharset; string m_reason; bool m_havedoc; + string m_udi; // May be set by creator as a hint }; /** diff --git a/src/query/docseqhist.cpp b/src/query/docseqhist.cpp index 8b309b01..dbf173f3 100644 --- a/src/query/docseqhist.cpp +++ b/src/query/docseqhist.cpp @@ -79,7 +79,7 @@ bool RclDHistoryEntry::decode(const string &value) // Old style entry found, make an udi, using the fs udi maker make_udi(fn, ipath, udi); } - LOGDEB(("RclDHistoryEntry::decode: udi [%s]\n", udi.c_str())); + LOGDEB1(("RclDHistoryEntry::decode: udi [%s]\n", udi.c_str())); return true; } @@ -91,7 +91,7 @@ bool RclDHistoryEntry::equal(const DynConfEntry& other) bool historyEnterDoc(RclDynConf *dncf, const string& udi) { - LOGDEB(("historyEnterDoc: [%s] into %s\n", + LOGDEB1(("historyEnterDoc: [%s] into %s\n", udi.c_str(), dncf->getFilename().c_str())); RclDHistoryEntry ne(time(0), udi); RclDHistoryEntry scratch;