circache ok

2009-11-17 14:52:01 +00:00 · 2009-11-17 14:52:01 +00:00 · 1a767213a0
commit 1a767213a0
parent 120cf441ba
2 changed files with 291 additions and 56 deletions
--- a/src/utils/circache.cpp
+++ b/src/utils/circache.cpp
@ -32,11 +32,13 @@ static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes";
 #include <sstream>
 #include <iostream>
 #include <map>
 #include "circache.h"
 #include "conftree.h"
 #include "debuglog.h"
 #include "smallut.h"
 #include "md5.h"
 using namespace std;
@ -53,13 +55,19 @@ using namespace std;
 *
 * There is a write position, which can be at eof while 
 * the file is growing, or inside the file if we are recycling. This is stored
- * in the header, together with the maximum size
+ * in the header (oheadoffs), together with the maximum size
 *
 * If we are recycling, we have to take care to compute the size of the 
 * possible remaining area from the last object invalidated by the write, 
- * pad it with neutral data and store the size in the new header.
+ * pad it with neutral data and store the size in the new header. To help with
 * this, the address for the last object written is also kept in the header
 * (nheadoffs, npadsize)
 * 
 */
 typedef unsigned long ULONG;
 typedef unsigned int  UINT;
 // First block size
 #define CIRCACHE_FIRSTBLOCK_SIZE 1024
@ -71,9 +79,9 @@ const char *headerformat = "circacheSizes = %x %x %x";
 class EntryHeaderData {
 public:
    EntryHeaderData() : dicsize(0), datasize(0), padsize(0) {}
-    unsigned int dicsize;
+    UINT dicsize;
-    unsigned int datasize;
+    UINT datasize;
-    unsigned int padsize;
+    UINT padsize;
 };
 // A callback class for the header-hopping function.
@ -81,10 +89,61 @@ class CCScanHook {
 public:
    virtual ~CCScanHook() {}
    enum status {Stop, Continue, Error, Eof};
-    virtual status takeone(off_t offs, const string& udi, unsigned int dicsize, 
+    virtual status takeone(off_t offs, const string& udi, UINT dicsize, 
-                           unsigned int datasize, unsigned int padsize) = 0;
+                           UINT datasize, UINT padsize) = 0;
 };
 // We have an auxiliary in-memory multimap of hashed-udi -> offset to
 // speed things up. This is created the first time the file is scanned
 // (on the first get), and not saved to disk. 
 // The map key: hashed udi. As a very short hash seems sufficient,
 // maybe we could find something faster/simpler than md5?
 #define UDIHLEN 4
 class UdiH {
 public:
    unsigned char h[UDIHLEN];
    UdiH(const string& udi)
    {
        MD5_CTX ctx;
        MD5Init(&ctx);
        MD5Update(&ctx, (const unsigned char*)udi.c_str(), udi.length());
        unsigned char md[16];
        MD5Final(md, &ctx);
        memcpy(h, md, UDIHLEN);
    }
    string asHexString() const {
        static const char hex[]="0123456789abcdef";
        string out;
        for (int i = 0; i < UDIHLEN; i++) {
            out.append(1, hex[h[i] >> 4]);
            out.append(1, hex[h[i] & 0x0f]);
        }
        return out;
    }
    bool operator==(const UdiH& r) const
    {
        for (int i = 0; i < UDIHLEN; i++)
            if (h[i] != r.h[i])
                return false;
        return true;
    }
    bool operator<(const UdiH& r) const
    {
        for (int i = 0; i < UDIHLEN; i++) {
            if (h[i] < r.h[i])
                return true;
            if (h[i] > r.h[i])
                return false;
        }
        return false;
    }
 };
 typedef multimap<UdiH, off_t> kh_type;
 typedef multimap<UdiH, off_t>::value_type kh_value_type;
 class CirCacheInternal {
 public:
    int m_fd;
@ -102,16 +161,87 @@ public:
    // A place to hold data when reading
    char *m_buffer;
    size_t m_bufsiz;
    // Error messages
    ostringstream m_reason;
-    // State for rewind/next/getcurrent operation
+    // State for rewind/next/getcurrent operation. This could/should
    // be moved to a separate iterator.
    off_t  m_itoffs;
    EntryHeaderData m_ithd;
    // Offset cache
    kh_type m_ofskh;
    bool    m_ofskhcplt; // Has cache been fully read since open?
    // Add udi->offset translation to map
    bool khEnter(const string& udi, off_t ofs)
    {
        UdiH h(udi);
        LOGDEB2(("Circache::khEnter: h %s offs %lu udi [%s]\n", 
                h.asHexString().c_str(), (ULONG)ofs, udi.c_str()));
        pair<kh_type::iterator, kh_type::iterator> p = m_ofskh.equal_range(h);
        if (p.first != m_ofskh.end() && p.first->first == h) {
            for (kh_type::iterator it = p.first; it != p.second; it++) {
                LOGDEB2(("Circache::khEnter: col h %s, ofs %lu\n", 
                        it->first.asHexString().c_str(),
                        (ULONG)it->second));
                if (it->second == ofs) {
                    // (h,offs) already there. Happens
                    LOGDEB2(("Circache::khEnter: already there\n"));
                    return true;
                }
            }
        }
        m_ofskh.insert(kh_value_type(h, ofs));
        LOGDEB2(("Circache::khEnter: inserted\n"));
        return true;
    }
    void khDump()
    {
        for (kh_type::const_iterator it = m_ofskh.begin(); 
             it != m_ofskh.end(); it++) {
            LOGDEB(("Circache::KHDUMP: %s %d\n", 
                    it->first.asHexString().c_str(), (ULONG)it->second));
        }
    }
    bool khFind(const string& udi, vector<off_t>& ofss)
    {
        ofss.clear();
        UdiH h(udi);
        LOGDEB2(("Circache::khFind: h %s udi [%s]\n", 
                h.asHexString().c_str(), udi.c_str()));
        pair<kh_type::iterator, kh_type::iterator> p = 
            m_ofskh.equal_range(h);
 #if 0
        if (p.first == m_ofskh.end()) LOGDEB(("KHFIND: FIRST END()\n"));
        if (p.second == m_ofskh.end()) LOGDEB(("KHFIND: SECOND END()\n"));
        if (!(p.first->first == h))
            LOGDEB(("KHFIND: NOKEY: %s %s\n", 
                    p.first->first.asHexString().c_str(),
                    p.second->first.asHexString().c_str()));
 #endif 
        if (p.first == m_ofskh.end() || !(p.first->first == h))
            return false;
        for (kh_type::iterator it = p.first; it != p.second; it++) {
            ofss.push_back(it->second);
        }
        return true;
    }
    CirCacheInternal()
        : m_fd(-1), m_maxsize(-1), m_oheadoffs(-1), 
-          m_nheadoffs(0), m_npadsize(0), m_buffer(0), m_bufsiz(0)
+          m_nheadoffs(0), m_npadsize(0), m_buffer(0), m_bufsiz(0), 
          m_ofskhcplt(false)
    {}
    ~CirCacheInternal()
@ -153,6 +283,7 @@ public:
            "npadsize = " << m_npadsize << "\n" <<
            "                                                              " <<
            "                                                              " <<
            "                                                              " <<
            "\0";
        int sz = int(s.str().size());
@ -264,8 +395,10 @@ public:
        bool already_folded = false;
        while (true) {
-            if (already_folded && startoffset == so0)
+            if (already_folded && startoffset == so0) {
                m_ofskhcplt = true;
                return CCScanHook::Eof;
            }
            EntryHeaderData d;
            CCScanHook::status st;
@ -299,7 +432,8 @@ public:
                m_reason << "scan: no udi in dic";
                return CCScanHook::Error;
            }
-                
+            khEnter(udi, startoffset);
            // Call callback
            CCScanHook::status a = 
                user->takeone(startoffset, udi, d.dicsize, d.datasize, 
@ -315,10 +449,21 @@ public:
        }
    }
-    bool readDicData(off_t hoffs, unsigned int dicsize, string& dict, 
+    bool readHDicData(off_t hoffs, EntryHeaderData& d, string& dic, 
-                     unsigned int datasize, string* data)
+                      string* data)
    {
        if (readentryheader(hoffs, d) != CCScanHook::Continue)
            return false;
        return readDicData(hoffs, d.dicsize, dic, d.datasize, data);
    }
    bool readDicData(off_t hoffs, UINT dicsize, string& dic, 
                     UINT datasize, string* data)
    {
        off_t offs = hoffs + CIRCACHE_HEADER_SIZE;
        // This syscall could be avoided in some cases if we saved the offset
        // at each seek. In most cases, we just read the header and we are
        // at the right position
        if (lseek(m_fd, offs, 0) != offs) {
            m_reason << "CirCache::get: lseek(" << offs << ") failed: " << 
                errno;
@ -331,7 +476,8 @@ public:
            m_reason << "CirCache::get: read() failed: errno " << errno;
            return false;
        }
-        dict.assign(bf, dicsize);
+        dic.assign(bf, dicsize);
        if (data == 0)
            return true;
@ -353,7 +499,7 @@ CirCache::CirCache(const string& dir)
    : m_dir(dir)
 {
    m_d = new CirCacheInternal;
-    LOGDEB(("CirCache: [%s]\n", m_dir.c_str()));
+    LOGDEB0(("CirCache: [%s]\n", m_dir.c_str()));
 }
 CirCache::~CirCache()
@ -422,8 +568,8 @@ bool CirCache::open(OpMode mode)
 class CCScanHookDump : public  CCScanHook {
 public:
-    virtual status takeone(off_t offs, const string& udi, unsigned int dicsize, 
+    virtual status takeone(off_t offs, const string& udi, UINT dicsize, 
-                           unsigned int datasize, unsigned int padsize) 
+                           UINT datasize, UINT padsize) 
    {
        cout << "Scan: offs " << offs << " dicsize " << dicsize 
             << " datasize " << datasize << " padsize " << padsize << 
@ -435,8 +581,11 @@ public:
 bool CirCache::dump()
 {
    CCScanHookDump dumper;
-    off_t start = m_d->m_nheadoffs > CIRCACHE_FIRSTBLOCK_SIZE ? 
+
-        m_d->m_nheadoffs : CIRCACHE_FIRSTBLOCK_SIZE;
+    // Start at oldest header. This is eof while the file is growing, scan will
    // fold to bot at once.
    off_t start = m_d->m_oheadoffs;
    switch (m_d->scan(start, &dumper, true)) {
    case CCScanHook::Stop: 
        cout << "Scan returns Stop??" << endl;
@ -449,7 +598,7 @@ bool CirCache::dump()
        cout << "Scan returns Error: " << getReason() << endl;
        return false;
    case CCScanHook::Eof:
-        cout << "Scan returns Eof" << endl;
+        cout << "Scan returns Eof (ok)" << endl;
        return true;
    default:
        cout << "Scan returns Unknown ??" << endl;
@ -468,11 +617,12 @@ public:
    CCScanHookGetter(const string &udi, int ti)
        : m_udi(udi), m_targinstance(ti), m_instance(0), m_offs(0){}
-    virtual status takeone(off_t offs, const string& udi, unsigned int dicsize, 
+    virtual status takeone(off_t offs, const string& udi, UINT dicsize, 
-                           unsigned int datasize, unsigned int padsize) 
+                           UINT datasize, UINT padsize) 
    {
-//        cerr << "offs " << offs << " udi [" << udi << "] dicsize " << dicsize 
+        LOGDEB2(("Circache:Scan: off %ld udi [%s] dcsz %u dtsz %u pdsz %u\n",
-//             << " datasize " << datasize << " padsize " << padsize << endl;
+                 long(offs), udi.c_str(), (UINT)dicsize, 
                 (UINT)datasize, (UINT)padsize));
        if (!m_udi.compare(udi)) {
            m_instance++;
            m_offs = offs;
@ -487,19 +637,68 @@ public:
 };
 // instance == -1 means get latest. Otherwise specify from 1+
-bool CirCache::get(const string& udi, string& dict, string& data, int instance)
+bool CirCache::get(const string& udi, string& dic, string& data, int instance)
 {
    Chrono chron;
    assert(m_d != 0);
    if (m_d->m_fd < 0) {
        m_d->m_reason << "CirCache::get: not open";
        return false;
    }
-    LOGDEB(("CirCache::get: udi [%s], instance\n", udi.c_str(), instance));
+    LOGDEB0(("CirCache::get: udi [%s], instance %d\n", udi.c_str(), instance));
    // If memory map is up to date, use it:
    if (m_d->m_ofskhcplt) {
        LOGDEB1(("CirCache::get: using ofskh\n"));
        //m_d->khDump();
        vector<off_t> ofss;
        if (m_d->khFind(udi, ofss)) {
            LOGDEB1(("Circache::get: h found, colls %d\n", ofss.size()));
            int finst = 1;
            EntryHeaderData d_good;
            off_t           o_good = 0;
            for (vector<off_t>::iterator it = ofss.begin();
                 it != ofss.end(); it++) {
                LOGDEB1(("Circache::get: trying offs %lu\n", (ULONG)*it));
                string fdic;
                EntryHeaderData d;
                if (!m_d->readHDicData(*it, d, fdic, 0)) 
                    return false;
                ConfSimple conf(fdic);
                string fudi;
                if (!conf.get("udi", fudi, "")) {
                    m_d->m_reason << "get: bad file: no udi in dic";
                    return false;
                }
                if (!fudi.compare(udi)) {
                    // Found one, memorize offset. Done if instance
                    // matches, else go on. If instance is -1 need to
                    // go to the end of the list anyway
                    d_good = d;
                    o_good = *it;
                    if (finst == instance) {
                        break;
                    } else {
                        finst++;
                    }
                }
            }
            // Did we read an appropriate entry ?
            if (o_good != 0 && (instance == -1 || instance == finst)) {
                bool ret = m_d->readDicData(o_good, d_good.dicsize, dic,
                                            d_good.datasize, &data);
                LOGDEB0(("Circache::get: hfound, %d mS\n", 
                         chron.millis()));
                return ret;
            }
            // Else try to scan anyway.
        }
    }
    CCScanHookGetter getter(udi, instance);
-    off_t start = m_d->m_nheadoffs > CIRCACHE_FIRSTBLOCK_SIZE ? 
+    off_t start = m_d->m_oheadoffs;
        m_d->m_nheadoffs : CIRCACHE_FIRSTBLOCK_SIZE;
    CCScanHook::status ret = m_d->scan(start, &getter, true);
    if (ret == CCScanHook::Eof) {
@ -508,24 +707,29 @@ bool CirCache::get(const string& udi, string& dict, string& data, int instance)
    } else if (ret != CCScanHook::Stop) {
        return false;
    }
-    return m_d->readDicData(getter.m_offs, getter.m_hd.dicsize, dict,
+    bool bret = 
-                            getter.m_hd.datasize, &data);
+        m_d->readDicData(getter.m_offs, getter.m_hd.dicsize, dic,
                         getter.m_hd.datasize, &data);
    LOGDEB0(("Circache::get: scanfound, %d mS\n", chron.millis()));
    return bret;
 }
-
+// Used to scan the file ahead until we accumulated enough space for the new
 // entry. 
 class CCScanHookSpacer : public  CCScanHook {
 public:
-    unsigned int sizewanted;
+    UINT sizewanted;
-    unsigned int sizeseen;
+    UINT sizeseen;
    CCScanHookSpacer(int sz)
        : sizewanted(sz), sizeseen(0) {assert(sz > 0);}
-    virtual status takeone(off_t offs, const string& udi, unsigned int dicsize, 
+    virtual status takeone(off_t offs, const string& udi, UINT dicsize, 
-                           unsigned int datasize, unsigned int padsize) 
+                           UINT datasize, UINT padsize) 
    {
-        LOGDEB(("ScanSpacer: offs %u dicsz %u datasz %u padsz %u udi[%s]\n",
+        LOGDEB2(("Circache:ScanSpacer:off %u dcsz %u dtsz %u pdsz %u udi[%s]\n",
-                (unsigned int)offs, dicsize, datasize, padsize, udi.c_str()));
+                (UINT)offs, dicsize, datasize, padsize, udi.c_str()));
        sizeseen += CIRCACHE_HEADER_SIZE + dicsize + datasize + padsize;
        if (sizeseen >= sizewanted)
            return Stop;
@ -610,7 +814,7 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
            // and determine the pad size up to the 1st preserved entry
            int scansize = nsize - recovpadsize;
            LOGDEB2(("CirCache::put: scanning for size %d from offs %u\n",
-                    scansize, (unsigned int)m_d->m_oheadoffs));
+                    scansize, (UINT)m_d->m_oheadoffs));
            CCScanHookSpacer spacer(scansize);
            switch (m_d->scan(m_d->m_oheadoffs, &spacer)) {
            case CCScanHook::Stop: 
@ -654,9 +858,12 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
        return false;
    }
    m_d->khEnter(udi, nwriteoffs);
    // Update first block information
    m_d->m_nheadoffs = nwriteoffs;
    m_d->m_npadsize  = npadsize;
    // New oldest header is the one just after the one we just wrote.
    m_d->m_oheadoffs = nwriteoffs + nsize + npadsize;
    if (nwriteoffs + nsize >= m_d->m_maxsize) {
        // If we are at the biggest allowed size or we are currently
@ -670,9 +877,13 @@ bool CirCache::put(const string& udi, const string& idic, const string& data)
 bool CirCache::rewind(bool& eof)
 {
    assert(m_d != 0);
    eof = false;
    // Read oldest header
    m_d->m_itoffs = m_d->m_oheadoffs;
    CCScanHook::status st = m_d->readentryheader(m_d->m_itoffs, m_d->m_ithd);
    switch(st) {
    case CCScanHook::Eof:
        eof = true;
@ -690,43 +901,49 @@ bool CirCache::next(bool& eof)
    eof = false;
    // Skip to next header, using values stored from previous one
    m_d->m_itoffs += CIRCACHE_HEADER_SIZE + m_d->m_ithd.dicsize + 
        m_d->m_ithd.datasize + m_d->m_ithd.padsize;
    // Looped back ?
    if (m_d->m_itoffs == m_d->m_oheadoffs) {
        eof = true;
        return false;
    }
    // Read. If we hit physical eof, fold.
    CCScanHook::status st = m_d->readentryheader(m_d->m_itoffs, m_d->m_ithd);
    if (st == CCScanHook::Eof) {
        m_d->m_itoffs = CIRCACHE_FIRSTBLOCK_SIZE;
        if (m_d->m_itoffs == m_d->m_oheadoffs) {
            // Then the file is not folded yet (still growing)
            eof = true;
            return false;
        }
        st = m_d->readentryheader(m_d->m_itoffs, m_d->m_ithd);
    }
    if (st == CCScanHook::Continue)
        return true;
    return false;
 }
-bool CirCache::getcurrentdict(string& dict)
+bool CirCache::getcurrentdict(string& dic)
 {
    assert(m_d != 0);
-    if (!m_d->readDicData(m_d->m_itoffs, m_d->m_ithd.dicsize, dict, 0, 0))
+    if (!m_d->readDicData(m_d->m_itoffs, m_d->m_ithd.dicsize, dic, 0, 0))
        return false;
    return true;
 }
-bool CirCache::getcurrent(string& udi, string& dict, string& data)
+bool CirCache::getcurrent(string& udi, string& dic, string& data)
 {
    assert(m_d != 0);
-    if (!m_d->readDicData(m_d->m_itoffs, m_d->m_ithd.dicsize, dict,
+    if (!m_d->readDicData(m_d->m_itoffs, m_d->m_ithd.dicsize, dic,
                          m_d->m_ithd.datasize, &data))
        return false;
-    ConfSimple conf(dict, 1);
+    ConfSimple conf(dic, 1);
    conf.get("udi", udi, "");
    return true;
 }
@ -841,17 +1058,19 @@ int main(int argc, char **argv)
      }
      cc.open(CirCache::CC_OPREAD);
  } else if (op_flags & OPT_g) {
      string udi = *argv++;argc--;
      if (!cc.open(CirCache::CC_OPREAD)) {
          cerr << "Open failed: " << cc.getReason() << endl;
          exit(1);
      }
-      string dic, data;
+      while (argc) {
-      if (!cc.get(udi, dic, data, instance)) {
+          string udi = *argv++;argc--;
-          cerr << "Get failed: " << cc.getReason() << endl;
+          string dic, data;
-          exit(1);
+          if (!cc.get(udi, dic, data, instance)) {
              cerr << "Get failed: " << cc.getReason() << endl;
              exit(1);
          }
          cout << "Dict: [" << dic << "]" << endl;
      }
      cout << "Dict: [" << dic << "]" << endl;
  } else if (op_flags & OPT_d) {
      if (!cc.open(CirCache::CC_OPREAD)) {
          cerr << "Open failed: " << cc.getReason() << endl;
--- a/src/utils/circache.h
+++ b/src/utils/circache.h
@ -17,6 +17,7 @@
 #ifndef _circache_h_included_
 #define _circache_h_included_
 /* @(#$Id: $  (C) 2009 J.F.Dockes */
 /**
 * A data cache implemented as a circularly managed file
 *
@ -27,10 +28,18 @@
 * Data objects inside the cache each have two parts: a data segment and an 
 * attribute (metadata) dictionary.
 * They are named using the same identifiers that are used inside the Recoll
- * index, but any unique identifier scheme would work.
+ * index (the UDI).
 *
- * The names are stored in an auxiliary index for fast access. This index can
+ * Inside the file. the UDIs are stored inside the entry dictionary
- * be rebuilt from the main file.
+ * under the key "udi", which will appear when the dictionary is read
 * back even if not there when wrote.
 * 
 * It is assumed that the dictionary are small (they are routinely read/parsed)
 *
 * A problem with this approach is that repetitively storing the same
 * object will evict all others. This could be somewhat optimized by reusing 
 * the last entry if it has the same udi as the one written, but not done 
 * currently.
 */
 #include <sys/types.h>
@ -59,15 +68,22 @@ public:
    virtual bool put(const string& udi, const string& dic, const string& data);
-    /* Maybe we'll have separate iterators one day, but this is good enough for
+    /** Walk the archive.
-     * now. No put() operations should be performed while using these.
+     *
     * Maybe we'll have separate iterators one day, but this is good
     * enough for now. No put() operations should be performed while
     * using these.
     */
    /** Back to oldest */
    virtual bool rewind(bool& eof);
-    virtual bool next(bool& eof);
+    /** Get entry under cursor */
    virtual bool getcurrent(string& udi, string& dic, string& data);
    /** Get current entry dict only (ie: udi is in dict */
    virtual bool getcurrentdict(string& dict);
    /** Skip to next. (false && !eof) -> error, (false&&eof)->EOF. */
    virtual bool next(bool& eof);
-    /* Debug */
+    /* Debug. This writes the entry headers to stdout */
    virtual bool dump();
 protected: