intermediary checkpoint (things work, no index, no compression)

This commit is contained in:
dockes 2009-11-10 17:34:52 +00:00
parent 645af1aa55
commit 66d9896d81
2 changed files with 225 additions and 75 deletions

View File

@ -1,3 +1,19 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes"; static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes";
#endif #endif
@ -17,19 +33,20 @@ static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes";
#include "circache.h" #include "circache.h"
#include "conftree.h" #include "conftree.h"
#include "debuglog.h"
using namespace std; using namespace std;
/* /*
* File structure: * File structure:
* - Starts with a 1-KB header block, with a param dictionary, ascii-space * - Starts with a 1-KB header block, with a param dictionary.
* filled. * - Stored items follow. Each item has a header and 2 segments for
* - Stored items follow. Each item has 2 segments for the metadata and the * the metadata and the data.
* data. The segment sizes are stored in an ascii header/marker. * The segment sizes are stored in the ascii header/marker:
* circacheSizes = xxx yyy zzz * circacheSizes = xxx yyy zzz
* xxx bytes of metadata * xxx bytes of metadata
* yyy bytes of data * yyy bytes of data
* zzz bytes of padding up to next object * zzz bytes of padding up to next object (only one entry has non zero)
* *
* There is a write position, which can be at eof while * There is a write position, which can be at eof while
* the file is growing, or inside the file if we are recycling. This is stored * the file is growing, or inside the file if we are recycling. This is stored
@ -40,7 +57,7 @@ using namespace std;
* pad it with neutral data and store the size in the new header. * pad it with neutral data and store the size in the new header.
*/ */
// First block in file. // First block size
#define CIRCACHE_FIRSTBLOCK_SIZE 1024 #define CIRCACHE_FIRSTBLOCK_SIZE 1024
// Entry header. // Entry header.
@ -111,7 +128,7 @@ public:
// Name for the cache file // Name for the cache file
string datafn(const string& d) string datafn(const string& d)
{ {
return path_cat(d, "circache"); return path_cat(d, "circache.crch");
} }
bool writefirstblock() bool writefirstblock()
@ -228,17 +245,33 @@ public:
return true; return true;
} }
CCScanHook::status scan(off_t startoffset, CCScanHook *user) CCScanHook::status scan(off_t startoffset, CCScanHook *user,
bool fold = false)
{ {
assert(m_fd >= 0); assert(m_fd >= 0);
off_t so0 = startoffset;
bool already_folded = false;
while (true) { while (true) {
if (already_folded && startoffset == so0)
return CCScanHook::Eof;
EntryHeaderData d; EntryHeaderData d;
CCScanHook::status st; CCScanHook::status st;
if ((st = readentryheader(startoffset, d)) != switch ((st = readentryheader(startoffset, d))) {
CCScanHook::Continue) { case CCScanHook::Continue: break;
case CCScanHook::Eof:
if (fold && !already_folded) {
already_folded = true;
startoffset = CIRCACHE_FIRSTBLOCK_SIZE;
continue;
}
/* FALLTHROUGH */
default:
return st; return st;
} }
char *bf; char *bf;
if ((bf = buf(d.dicsize+1)) == 0) { if ((bf = buf(d.dicsize+1)) == 0) {
return CCScanHook::Error; return CCScanHook::Error;
@ -296,8 +329,8 @@ bool CirCache::create(off_t m_maxsize)
struct stat st; struct stat st;
if (stat(m_dir.c_str(), &st) < 0) { if (stat(m_dir.c_str(), &st) < 0) {
if (mkdir(m_dir.c_str(), 0777) < 0) { if (mkdir(m_dir.c_str(), 0777) < 0) {
m_d->m_reason << "CirCache::create: mkdir(" << m_dir << ") failed" << m_d->m_reason << "CirCache::create: mkdir(" << m_dir <<
" errno " << errno; ") failed" << " errno " << errno;
return false; return false;
} }
} }
@ -317,23 +350,13 @@ bool CirCache::create(off_t m_maxsize)
memset(buf, 0, CIRCACHE_FIRSTBLOCK_SIZE); memset(buf, 0, CIRCACHE_FIRSTBLOCK_SIZE);
if (::write(m_d->m_fd, buf, CIRCACHE_FIRSTBLOCK_SIZE) != if (::write(m_d->m_fd, buf, CIRCACHE_FIRSTBLOCK_SIZE) !=
CIRCACHE_FIRSTBLOCK_SIZE) { CIRCACHE_FIRSTBLOCK_SIZE) {
m_d->m_reason << "CirCache::create: write header failed, errno " << errno; m_d->m_reason << "CirCache::create: write header failed, errno "
<< errno;
return false; return false;
} }
return m_d->writefirstblock(); return m_d->writefirstblock();
} }
class CCScanHookDump : public CCScanHook {
public:
virtual status takeone(off_t offs, const string& udi, unsigned int dicsize,
unsigned int datasize, unsigned int padsize)
{
cout << "udi [" << udi << "] dicsize " << dicsize << " datasize "
<< datasize << " padsize " << padsize << endl;
return Continue;
}
};
bool CirCache::open(OpMode mode) bool CirCache::open(OpMode mode)
{ {
assert(m_d != 0); assert(m_d != 0);
@ -346,31 +369,121 @@ bool CirCache::open(OpMode mode)
") failed " << "errno " << errno; ") failed " << "errno " << errno;
return false; return false;
} }
bool ret = m_d->readfirstblock(); return m_d->readfirstblock();
if (mode == CC_OPREAD) {
CCScanHookDump dumper;
switch (m_d->scan(CIRCACHE_FIRSTBLOCK_SIZE, &dumper)) {
case CCScanHook::Stop:
cerr << "Scan returns Stop" << endl;
break;
case CCScanHook::Continue:
cerr << "Scan returns Continue ?? " << CCScanHook::Continue << " " <<
getReason() << endl;
break;
case CCScanHook::Error:
cerr << "Scan returns Error: " << getReason() << endl;
break;
case CCScanHook::Eof:
cerr << "Scan returns Eof" << endl;
break;
}
}
return ret;
} }
bool CirCache::get(const string& udi, string dic, string data) class CCScanHookDump : public CCScanHook {
public:
virtual status takeone(off_t offs, const string& udi, unsigned int dicsize,
unsigned int datasize, unsigned int padsize)
{
cout << "Scan: offs " << offs << " dicsize " << dicsize
<< " datasize " << datasize << " padsize " << padsize <<
" udi [" << udi << "]" << endl;
return Continue;
}
};
bool CirCache::dump()
{ {
CCScanHookDump dumper;
off_t start = m_d->m_nheadoffs > CIRCACHE_FIRSTBLOCK_SIZE ?
m_d->m_nheadoffs : CIRCACHE_FIRSTBLOCK_SIZE;
switch (m_d->scan(start, &dumper, true)) {
case CCScanHook::Stop:
cout << "Scan returns Stop??" << endl;
return false;
case CCScanHook::Continue:
cout << "Scan returns Continue ?? " << CCScanHook::Continue << " " <<
getReason() << endl;
return false;
case CCScanHook::Error:
cout << "Scan returns Error: " << getReason() << endl;
return false;
case CCScanHook::Eof:
cout << "Scan returns Eof" << endl;
return true;
default:
cout << "Scan returns Unknown ??" << endl;
return false;
}
}
class CCScanHookGetter : public CCScanHook {
public:
string m_udi;
int m_targinstance;
int m_instance;
off_t m_offs;
EntryHeaderData m_hd;
CCScanHookGetter(const string &udi, int ti)
: m_udi(udi), m_targinstance(ti), m_instance(0), m_offs(0){}
virtual status takeone(off_t offs, const string& udi, unsigned int dicsize,
unsigned int datasize, unsigned int padsize)
{
cerr << "offs " << offs << " udi [" << udi << "] dicsize " << dicsize
<< " datasize " << datasize << " padsize " << padsize << endl;
if (!m_udi.compare(udi)) {
m_instance++;
m_offs = offs;
m_hd.dicsize = dicsize;
m_hd.datasize = datasize;
m_hd.padsize = padsize;
if (m_instance == m_targinstance)
return Stop;
}
return Continue;
}
};
// instance == -1 means get latest. Otherwise specify from 1+
bool CirCache::get(const string& udi, string& dict, string& data, int instance)
{
assert(m_d != 0);
if (m_d->m_fd < 0) {
m_d->m_reason << "CirCache::get: not open";
return false;
}
LOGDEB(("CirCache::get: udi [%s], instance\n", udi.c_str(), instance));
CCScanHookGetter getter(udi, instance);
off_t start = m_d->m_nheadoffs > CIRCACHE_FIRSTBLOCK_SIZE ?
m_d->m_nheadoffs : CIRCACHE_FIRSTBLOCK_SIZE;
CCScanHook::status ret = m_d->scan(start, &getter, true);
if (ret == CCScanHook::Eof) {
if (getter.m_instance == 0)
return false;
} else if (ret != CCScanHook::Stop) {
return false;
}
off_t offs = getter.m_offs + CIRCACHE_HEADER_SIZE;
if (lseek(m_d->m_fd, offs, 0) != offs) {
m_d->m_reason << "CirCache::get: lseek(" << offs << ") failed: " <<
errno;
return false;
}
char *bf = m_d->buf(getter.m_hd.dicsize);
if (bf == 0)
return false;
if (read(m_d->m_fd, bf, getter.m_hd.dicsize) != int(getter.m_hd.dicsize)) {
m_d->m_reason << "CirCache::get: read() failed: errno " << errno;
return false;
}
dict.assign(bf, getter.m_hd.dicsize);
bf = m_d->buf(getter.m_hd.datasize);
if (bf == 0)
return false;
if (read(m_d->m_fd, bf, getter.m_hd.datasize) != int(getter.m_hd.datasize)){
m_d->m_reason << "CirCache::get: read() failed: errno " << errno;
return false;
}
data.assign(bf, getter.m_hd.datasize);
return true; return true;
} }
@ -385,8 +498,8 @@ public:
virtual status takeone(off_t offs, const string& udi, unsigned int dicsize, virtual status takeone(off_t offs, const string& udi, unsigned int dicsize,
unsigned int datasize, unsigned int padsize) unsigned int datasize, unsigned int padsize)
{ {
cout << "udi [" << udi << "] dicsize " << dicsize << " datasize " LOGDEB(("ScanSpacer: offs %u dicsz %u datasz %u padsz %u udi[%s]\n",
<< datasize << " padsize " << padsize << endl; (unsigned int)offs, dicsize, datasize, padsize, udi.c_str()));
sizeseen += CIRCACHE_HEADER_SIZE + dicsize + datasize + padsize; sizeseen += CIRCACHE_HEADER_SIZE + dicsize + datasize + padsize;
if (sizeseen >= sizewanted) if (sizeseen >= sizewanted)
return Stop; return Stop;
@ -425,8 +538,8 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
int npadsize = 0; int npadsize = 0;
bool extending = false; bool extending = false;
cerr << "CirCache::PUT: nsize " << nsize << LOGDEB2(("CirCache::put: nsize %d oheadoffs %d\n",
" oheadoffs " << m_d->m_oheadoffs << endl; nsize, m_d->m_oheadoffs));
if (st.st_size < m_d->m_maxsize) { if (st.st_size < m_d->m_maxsize) {
// If we are still growing the file, things are simple // If we are still growing the file, things are simple
@ -450,8 +563,8 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
return false; return false;
} }
assert(int(pd.padsize) == m_d->m_npadsize); assert(int(pd.padsize) == m_d->m_npadsize);
cerr << "CirCache::put: recovering previous padsize " << LOGDEB2(("CirCache::put: recovering previous padsize %d\n",
pd.padsize << endl; pd.padsize));
pd.padsize = 0; pd.padsize = 0;
if (!m_d->writeentryheader(m_d->m_nheadoffs, pd)) { if (!m_d->writeentryheader(m_d->m_nheadoffs, pd)) {
return false; return false;
@ -463,19 +576,20 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
if (nsize <= recovpadsize) { if (nsize <= recovpadsize) {
// If the new entry fits entirely in the pad area from the // If the new entry fits entirely in the pad area from the
// latest one, no need to recycle the oldest entries. // latest one, no need to recycle the oldest entries.
cerr << "CirCache::put: new fits in old padsize " << LOGDEB2(("CirCache::put: new fits in old padsize %d\n,"
recovpadsize << endl; recovpadsize));
npadsize = recovpadsize - nsize; npadsize = recovpadsize - nsize;
} else { } else {
// Scan the file until we have enough space for the new entry, // Scan the file until we have enough space for the new entry,
// and determine the pad size up to the 1st preserved entry // and determine the pad size up to the 1st preserved entry
int scansize = nsize - recovpadsize; int scansize = nsize - recovpadsize;
cerr << "CirCache::put: scanning for size " << scansize << LOGDEB2(("CirCache::put: scanning for size %d from offs %u\n",
" from offset " << m_d->m_oheadoffs << endl; scansize, (unsigned int)m_d->m_oheadoffs));
CCScanHookSpacer spacer(scansize); CCScanHookSpacer spacer(scansize);
switch (m_d->scan(m_d->m_oheadoffs, &spacer)) { switch (m_d->scan(m_d->m_oheadoffs, &spacer)) {
case CCScanHook::Stop: case CCScanHook::Stop:
cerr << "put: Scan ok, sizeseen " << spacer.sizeseen << endl; LOGDEB2(("CirCache::put: Scan ok, sizeseen %d\n",
spacer.sizeseen));
npadsize = spacer.sizeseen - scansize; npadsize = spacer.sizeseen - scansize;
break; break;
case CCScanHook::Eof: case CCScanHook::Eof:
@ -489,8 +603,8 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
} }
} }
cerr << "CirCache::put: writing " << nsize << " at " << nwriteoffs << LOGDEB2(("CirCache::put: writing %d at %d padsize %d\n",
" padsize " << npadsize << endl; nsize, nwriteoffs, npadsize));
if (lseek(m_d->m_fd, nwriteoffs, 0) != nwriteoffs) { if (lseek(m_d->m_fd, nwriteoffs, 0) != nwriteoffs) {
m_d->m_reason << "CirCache::put: lseek failed: " << errno; m_d->m_reason << "CirCache::put: lseek failed: " << errno;
return false; return false;
@ -551,6 +665,8 @@ static char *thisprog;
static char usage [] = static char usage [] =
" -c <dirname> : create\n" " -c <dirname> : create\n"
" -p <dirname> <apath> [apath ...] : put files\n" " -p <dirname> <apath> [apath ...] : put files\n"
" -d <dirname> : dump\n"
" -g [-i instance] <dirname> <udi>: get\n"
; ;
static void static void
Usage(FILE *fp = stderr) Usage(FILE *fp = stderr)
@ -562,14 +678,15 @@ Usage(FILE *fp = stderr)
static int op_flags; static int op_flags;
#define OPT_MOINS 0x1 #define OPT_MOINS 0x1
#define OPT_c 0x2 #define OPT_c 0x2
#define OPT_b 0x4
#define OPT_p 0x8 #define OPT_p 0x8
#define OPT_g 0x10 #define OPT_g 0x10
#define OPT_d 0x20
#define OPT_i 0x40
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int count = 10; int instance = -1;
thisprog = argv[0]; thisprog = argv[0];
argc--; argv++; argc--; argv++;
@ -583,8 +700,9 @@ int main(int argc, char **argv)
case 'c': op_flags |= OPT_c; break; case 'c': op_flags |= OPT_c; break;
case 'p': op_flags |= OPT_p; break; case 'p': op_flags |= OPT_p; break;
case 'g': op_flags |= OPT_g; break; case 'g': op_flags |= OPT_g; break;
case 'b': op_flags |= OPT_b; if (argc < 2) Usage(); case 'd': op_flags |= OPT_d; break;
if ((sscanf(*(++argv), "%d", &count)) != 1) case 'i': op_flags |= OPT_i; if (argc < 2) Usage();
if ((sscanf(*(++argv), "%d", &instance)) != 1)
Usage(); Usage();
argc--; argc--;
goto b1; goto b1;
@ -633,10 +751,23 @@ int main(int argc, char **argv)
} }
cc.open(CirCache::CC_OPREAD); cc.open(CirCache::CC_OPREAD);
} else if (op_flags & OPT_g) { } else if (op_flags & OPT_g) {
string udi = *argv++;argc--;
if (!cc.open(CirCache::CC_OPREAD)) { if (!cc.open(CirCache::CC_OPREAD)) {
cerr << "Open failed: " << cc.getReason() << endl; cerr << "Open failed: " << cc.getReason() << endl;
exit(1); exit(1);
} }
string dic, data;
if (!cc.get(udi, dic, data, instance)) {
cerr << "Get failed: " << cc.getReason() << endl;
exit(1);
}
cout << "Dict: [" << dic << "]" << endl;
} else if (op_flags & OPT_d) {
if (!cc.open(CirCache::CC_OPREAD)) {
cerr << "Open failed: " << cc.getReason() << endl;
exit(1);
}
cc.dump();
} else } else
Usage(); Usage();

View File

@ -1,13 +1,28 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _circache_h_included_ #ifndef _circache_h_included_
#define _circache_h_included_ #define _circache_h_included_
/* @(#$Id: $ (C) 2009 J.F.Dockes */ /* @(#$Id: $ (C) 2009 J.F.Dockes */
/** /**
* A data cache implemented as a circularly managed file * A data cache implemented as a circularly managed file
* *
* This is used to store cached remote pages for recoll. A single file is used * A single file is used to stored objects. The file grows to a
* to store the compressed pages and the associated metadata. The file * specified maximum size, then is rewritten from the start,
* grows to a specified maximum size, then is rewritten from the * overwriting older entries.
* start, overwriting older entries.
* *
* Data objects inside the cache each have two parts: a data segment and an * Data objects inside the cache each have two parts: a data segment and an
* attribute (metadata) dictionary. * attribute (metadata) dictionary.
@ -30,20 +45,24 @@ class CirCacheInternal;
class CirCache { class CirCache {
public: public:
CirCache(const string& dir); CirCache(const string& dir);
~CirCache(); virtual ~CirCache();
string getReason(); virtual string getReason();
bool create(off_t maxsize); virtual bool create(off_t maxsize);
enum OpMode {CC_OPREAD, CC_OPWRITE}; enum OpMode {CC_OPREAD, CC_OPWRITE};
bool open(OpMode mode); virtual bool open(OpMode mode);
bool get(const string& udi, string dic, string data); virtual bool get(const string& udi, string& dic, string& data,
int instance = -1);
bool put(const string& udi, const string& dic, const string& data); virtual bool put(const string& udi, const string& dic, const string& data);
private: /* Debug */
virtual bool dump();
protected:
CirCacheInternal *m_d; CirCacheInternal *m_d;
string m_dir; string m_dir;
}; };