circache: append: resize dest to avoid recycling while appending. Clarifications

This commit is contained in:
Jean-Francois Dockes 2021-03-24 09:31:37 +01:00
parent 5e82af9798
commit 9eac638bb9
5 changed files with 144 additions and 82 deletions

View File

@ -1289,7 +1289,6 @@ void RclMain::applyStyleSheet()
{ {
::applyStyleSheet(prefs.qssFile); ::applyStyleSheet(prefs.qssFile);
if (m_source) { if (m_source) {
std::cerr << "applyStyleSheet emit docSourceChanged\n";
emit docSourceChanged(m_source); emit docSourceChanged(m_source);
emit sortDataChanged(m_sortspec); emit sortDataChanged(m_sortspec);
} else { } else {

View File

@ -146,9 +146,10 @@ string WebcacheModel::getData(unsigned int idx)
return string(); return string();
} }
string udi = m->all[allidx].udi; string udi = m->all[allidx].udi;
// Compute the instance for this udi (in case we are not erasing older instances). // Compute the instance for this udi (in case we are configured to
// not erase older instances). Valid instance values begin at 1
int instance = 0; int instance = 0;
for (int i = 0; i < allidx; i++) { for (unsigned int i = 0; i <= idx; i++) {
if (m->all[i].udi == udi) { if (m->all[i].udi == udi) {
instance++; instance++;
} }
@ -345,6 +346,8 @@ void WebcacheEdit::saveToFile()
return; return;
string data = m_model->getData(selection[0].row()); string data = m_model->getData(selection[0].row());
QString qfn = myGetFileName(false, "Saving webcache data"); QString qfn = myGetFileName(false, "Saving webcache data");
if (qfn.isEmpty())
return;
string reason; string reason;
if (!stringtofile(data, qs2utf8s(qfn).c_str(), reason)) { if (!stringtofile(data, qs2utf8s(qfn).c_str(), reason)) {
QMessageBox::warning(0, "Recoll", tr("File creation failed: ") + u8s2qs(reason)); QMessageBox::warning(0, "Recoll", tr("File creation failed: ") + u8s2qs(reason));

View File

@ -23,13 +23,15 @@ using namespace std;
static char *thisprog; static char *thisprog;
static char usage [] = static char usage [] =
" -c [-u] <dirname> <sizekbs>: create\n" " -c [-u] <dirname> <sizekbs>: create new store or possibly resize existing one\n"
" -u: set the 'unique' flag (else unset it)\n"
" None of this changes the existing data\n"
" -p <dirname> <apath> [apath ...] : put files\n" " -p <dirname> <apath> [apath ...] : put files\n"
" -d <dirname> : dump\n" " -d <dirname> : dump\n"
" -g [-i instance] [-D] <dirname> <udi>: get\n" " -g [-i instance] [-D] <dirname> <udi>: get\n"
" -D: also dump data\n" " -D: also dump data\n"
" -e <dirname> <udi> : erase\n" " -e <dirname> <udi> : erase\n"
" -a <targetdir> <dir> [<dir> ...]: append old content to target\n" " -a <targetdir> <dir> [<dir> ...]: append content from existing cache(s) to target\n"
" The target should be first resized to hold all the data, else only\n" " The target should be first resized to hold all the data, else only\n"
" as many entries as capacity permit will be retained\n" " as many entries as capacity permit will be retained\n"
; ;
@ -53,6 +55,8 @@ static int op_flags;
#define OPT_e 0x200 #define OPT_e 0x200
#define OPT_a 0x800 #define OPT_a 0x800
bool storeFile(CirCache& cc, const std::string fn);
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int instance = -1; int instance = -1;
@ -143,7 +147,7 @@ b1:
} }
while (argc) { while (argc) {
string reason; string reason;
if (CirCache::append(dir, *argv++, &reason) < 0) { if (CirCache::appendCC(dir, *argv++, &reason) < 0) {
cerr << reason << endl; cerr << reason << endl;
return 1; return 1;
} }
@ -160,42 +164,8 @@ b1:
while (argc) { while (argc) {
string fn = *argv++; string fn = *argv++;
argc--; argc--;
char dic[1000]; if (!storeFile(cc, fn)) {
string data, reason; return 1;
if (!file_to_string(fn, data, &reason)) {
cerr << "File_to_string: " << reason << endl;
exit(1);
}
string udi;
make_udi(fn, "", udi);
string cmd("xdg-mime query filetype ");
// Should do more quoting here...
cmd += "'" + fn + "'";
FILE *fp = popen(cmd.c_str(), "r");
char* buf=0;
size_t sz = 0;
if (::getline(&buf, &sz, fp) -1) {
cerr << "Could not read from xdg-mime output\n";
exit(1);
}
pclose(fp);
string mimetype(buf);
free(buf);
trimstring(mimetype, "\n\r");
cout << "Got [" << mimetype << "]\n";
string s;
ConfSimple conf(s);
conf.set("udi", udi);
conf.set("mimetype", mimetype);
//ostringstream str; conf.write(str); cout << str.str() << endl;
if (!cc.put(udi, &conf, data, 0)) {
cerr << "Put failed: " << cc.getReason() << endl;
cerr << "conf: [";
conf.write(cerr);
cerr << "]" << endl;
exit(1);
} }
} }
cc.open(CirCache::CC_OPREAD); cc.open(CirCache::CC_OPREAD);
@ -243,3 +213,46 @@ b1:
exit(0); exit(0);
} }
// Store the contents of the file named fn as a new entry in cache cc.
//
// The entry's udi is computed from the file name by make_udi(). Its MIME
// type is the first line printed by "xdg-mime query filetype <fn>", with
// trailing end-of-line characters removed. The dictionary stored with the
// data holds exactly these two values, under "udi" and "mimetype".
//
// @param cc destination cache (NOTE(review): presumably already opened for
//     writing by the caller; the open call is not visible from here).
// @param fn path of the file to store.
// @return true if the entry was stored. On any failure a message is
//     printed on stderr and false is returned.
bool storeFile(CirCache& cc, const std::string fn)
{
    string data, reason;
    if (!file_to_string(fn, data, &reason)) {
        std::cerr << "File_to_string: " << reason << endl;
        return false;
    }

    string udi;
    make_udi(fn, "", udi);

    // The file name is passed to the shell inside single quotes. The only
    // character which needs care in there is the single quote itself:
    // close the quoted string, emit a backslash-escaped quote, reopen.
    string cmd("xdg-mime query filetype '");
    for (const char c : fn) {
        if (c == '\'') {
            cmd += "'\\''";
        } else {
            cmd += c;
        }
    }
    cmd += "'";

    FILE *fp = popen(cmd.c_str(), "r");
    if (fp == nullptr) {
        std::cerr << "Could not execute xdg-mime\n";
        return false;
    }

    // getline() returns -1 on error or EOF. It may have allocated buf even
    // when it fails, so the buffer is released on every path, and the pipe
    // is always closed.
    char *buf = nullptr;
    size_t sz = 0;
    const bool gotline = ::getline(&buf, &sz, fp) != -1;
    pclose(fp);
    string mimetype;
    if (gotline) {
        mimetype = buf;
    }
    free(buf);
    if (!gotline) {
        std::cerr << "Could not read from xdg-mime output\n";
        return false;
    }
    trimstring(mimetype, "\n\r");

    // Build the entry dictionary from an empty configuration.
    string s;
    ConfSimple conf(s);
    conf.set("udi", udi);
    conf.set("mimetype", mimetype);

    if (!cc.put(udi, &conf, data, 0)) {
        // Show the dictionary along with the error to help diagnosis.
        std::cerr << "Put failed: " << cc.getReason() << endl;
        std::cerr << "conf: [";
        conf.write(std::cerr);
        std::cerr << "]" << endl;
        return false;
    }
    return true;
}

View File

@ -14,6 +14,7 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/ */
#define LOGGER_LOCAL_LOGINC 4
#include "autoconfig.h" #include "autoconfig.h"
@ -34,6 +35,7 @@
#include "chrono.h" #include "chrono.h"
#include "zlibut.h" #include "zlibut.h"
#include "smallut.h"
#ifndef _WIN32 #ifndef _WIN32
#include <sys/uio.h> #include <sys/uio.h>
@ -255,10 +257,8 @@ public:
return true; return true;
} }
void khDump() { void khDump() {
for (kh_type::const_iterator it = m_ofskh.begin(); for (const auto& e : m_ofskh) {
it != m_ofskh.end(); it++) { LOGDEB("Circache::KHDUMP: " << e.first.asHexString() << " " << e.second << "\n");
LOGDEB("Circache::KHDUMP: " << it->first.asHexString() << " " <<
it->second << "\n");
} }
} }
@ -270,8 +270,7 @@ public:
UdiH h(udi); UdiH h(udi);
LOGDEB2("Circache::khFind: h " << h.asHexString() << " udi [" << udi << LOGDEB2("Circache::khFind: h " << h.asHexString() << " udi [" << udi << "]\n");
"]\n");
pair<kh_type::iterator, kh_type::iterator> p = m_ofskh.equal_range(h); pair<kh_type::iterator, kh_type::iterator> p = m_ofskh.equal_range(h);
@ -312,9 +311,8 @@ public:
} }
// Clear entries for vector of udi/offs // Clear entries for vector of udi/offs
bool khClear(const vector<pair<string, int64_t> >& udis) { bool khClear(const vector<pair<string, int64_t> >& udis) {
for (vector<pair<string, int64_t> >::const_iterator it = udis.begin(); for (const auto& udioffs : udis) {
it != udis.end(); it++) { khClear(udioffs);
khClear(*it);
} }
return true; return true;
} }
@ -589,15 +587,13 @@ public:
return true; return true;
} }
bool readDicData(int64_t hoffs, EntryHeaderData& hd, string& dic, bool readDicData(int64_t hoffs, EntryHeaderData& hd, string& dic, string* data) {
string* data) {
int64_t offs = hoffs + CIRCACHE_HEADER_SIZE; int64_t offs = hoffs + CIRCACHE_HEADER_SIZE;
// This syscall could be avoided in some cases if we saved the offset // This syscall could be avoided in some cases if we saved the offset
// at each seek. In most cases, we just read the header and we are // at each seek. In most cases, we just read the header and we are
// at the right position // at the right position
if (lseek(m_fd, offs, 0) != offs) { if (lseek(m_fd, offs, 0) != offs) {
m_reason << "CirCache::get: lseek(" << offs << ") failed: " << m_reason << "CirCache::get: lseek(" << offs << ") failed: " << errno;
errno;
return false; return false;
} }
char *bf = 0; char *bf = 0;
@ -863,9 +859,9 @@ public:
virtual status takeone(int64_t offs, const string& udi, virtual status takeone(int64_t offs, const string& udi,
const EntryHeaderData& d) { const EntryHeaderData& d) {
LOGDEB2("Circache:Scan: off " << offs << " udi [" << udi << "] dcsz " << LOGDEB1("Circache:Scan: off " << offs << " udi [" << udi << "] dcsz " <<
d.dicsize << " dtsz " << d.datasize << d.dicsize << " dtsz " << d.datasize << " pdsz " << d.padsize <<
" pdsz " << d.padsize << " flgs " << d.flags << "\n"); " flgs " << d.flags << " previnst " << m_instance << "\n");
if (!m_udi.compare(udi)) { if (!m_udi.compare(udi)) {
m_instance++; m_instance++;
m_offs = offs; m_offs = offs;
@ -878,7 +874,6 @@ public:
} }
}; };
// instance == -1 means get latest. Otherwise specify from 1+
bool CirCache::get(const string& udi, string& dic, string *data, int instance) bool CirCache::get(const string& udi, string& dic, string *data, int instance)
{ {
Chrono chron; Chrono chron;
@ -899,12 +894,11 @@ bool CirCache::get(const string& udi, string& dic, string *data, int instance)
int finst = 1; int finst = 1;
EntryHeaderData d_good; EntryHeaderData d_good;
int64_t o_good = 0; int64_t o_good = 0;
for (vector<int64_t>::iterator it = ofss.begin(); for (const auto& offset : ofss) {
it != ofss.end(); it++) { LOGDEB1("Circache::get: trying offs " << offset << "\n");
LOGDEB1("Circache::get: trying offs " << *it << "\n");
EntryHeaderData d; EntryHeaderData d;
string fudi; string fudi;
if (!m_d->readHUdi(*it, d, fudi)) { if (!m_d->readHUdi(offset, d, fudi)) {
return false; return false;
} }
if (!fudi.compare(udi)) { if (!fudi.compare(udi)) {
@ -912,7 +906,7 @@ bool CirCache::get(const string& udi, string& dic, string *data, int instance)
// matches, else go on. If instance is -1 need to // matches, else go on. If instance is -1 need to
// go to the end anyway // go to the end anyway
d_good = d; d_good = d;
o_good = *it; o_good = offset;
if (finst == instance) { if (finst == instance) {
break; break;
} else { } else {
@ -946,6 +940,10 @@ bool CirCache::get(const string& udi, string& dic, string *data, int instance)
return bret; return bret;
} }
// It would be possible to have an option to only erase if this is the
// last entry in the file, by comparing the offsets from khFind() with
// m_oheadoffs. Read the last offset < m_oheadoffs and check that
// offset+sizes == oheadoffs
bool CirCache::erase(const string& udi, bool reallyclear) bool CirCache::erase(const string& udi, bool reallyclear)
{ {
if (m_d == 0) { if (m_d == 0) {
@ -977,22 +975,22 @@ bool CirCache::erase(const string& udi, bool reallyclear)
return true; return true;
} }
for (vector<int64_t>::iterator it = ofss.begin(); it != ofss.end(); it++) { for (const auto& offset : ofss) {
LOGDEB2("CirCache::erase: reading at " << *it << "\n"); LOGDEB2("CirCache::erase: reading at " << offset << "\n");
EntryHeaderData d; EntryHeaderData d;
string fudi; string fudi;
if (!m_d->readHUdi(*it, d, fudi)) { if (!m_d->readHUdi(offset, d, fudi)) {
return false; return false;
} }
LOGDEB2("CirCache::erase: found fudi [" << fudi << "]\n"); LOGDEB2("CirCache::erase: found fudi [" << fudi << "]\n");
if (!fudi.compare(udi)) { if (!fudi.compare(udi)) {
EntryHeaderData nd; EntryHeaderData nd;
nd.padsize = d.dicsize + d.datasize + d.padsize; nd.padsize = d.dicsize + d.datasize + d.padsize;
LOGDEB2("CirCache::erase: rewrite at " << *it << "\n"); LOGDEB2("CirCache::erase: rewrite at " << offset << "\n");
if (*it == m_d->m_nheadoffs) { if (offset == m_d->m_nheadoffs) {
m_d->m_npadsize = nd.padsize; m_d->m_npadsize = nd.padsize;
} }
if (!m_d->writeEntryHeader(*it, nd, reallyclear)) { if (!m_d->writeEntryHeader(offset, nd, reallyclear)) {
LOGERR("CirCache::erase: write header failed\n"); LOGERR("CirCache::erase: write header failed\n");
return false; return false;
} }
@ -1339,26 +1337,60 @@ static bool copyall(std::shared_ptr<CirCache> occ,
return true; return true;
} }
// Append all entries from sdir to ddir int CirCache::appendCC(const string ddir, const string& sdir, string *reason)
int CirCache::append(const string ddir, const string& sdir, string *reason)
{ {
ostringstream msg; ostringstream msg;
// Open source file // Open source file
std::shared_ptr<CirCache> occ(new CirCache(sdir)); std::shared_ptr<CirCache> occ(new CirCache(sdir));
if (!occ->open(CirCache::CC_OPREAD)) { if (!occ->open(CirCache::CC_OPREAD)) {
if (reason) { if (reason) {
msg << "Open failed in " << sdir << " : " << msg << "Open failed in " << sdir << " : " << occ->getReason() << endl;
occ->getReason() << endl;
*reason = msg.str(); *reason = msg.str();
} }
return -1; return -1;
} }
// Possibly resize dest. If the dest is currently recycling, it
// will keep on doing so. Resizing only avoids erasing entries in
// dest if it is currently writing at EOF (growing), which will be
// the case if we are using this to compact an existing file (the
// dest was just created for the purpose).
int64_t dstavail{0}, dstmaxsize{0};
bool isunique;
// Check dest size
{
std::shared_ptr<CirCache> ncc(new CirCache(ddir));
if (!ncc->open(CirCache::CC_OPREAD)) {
if (reason) {
msg << "Open failed in " << ddir << " : " << ncc->getReason() << endl;
*reason = msg.str();
}
return -1;
}
dstmaxsize = ncc->m_d->m_maxsize;
dstavail = dstmaxsize - ncc->m_d->m_nheadoffs;
isunique = ncc->m_d->m_uniquentries;
}
if (dstavail < occ->size()) {
std::shared_ptr<CirCache> ncc(new CirCache(ddir));
auto nsize = dstmaxsize + (occ->size() - dstavail) + 5*1000*1000;
LOGDEB1("CirCache::appendCC: Dstmaxsize " << displayableBytes(dstmaxsize) << " dstavail "<<
displayableBytes(dstavail) << " org size " << displayableBytes(occ->size()) <<
" nsize " << displayableBytes(nsize) << "\n");
if (!ncc->create(nsize, isunique ? CC_CRUNIQUE : CC_CRNONE)) {
if (reason) {
msg << "Open failed in " << ddir << " : " << ncc->getReason() << endl;
*reason = msg.str();
}
return -1;
}
}
// Open dest file // Open dest file
std::shared_ptr<CirCache> ncc(new CirCache(ddir)); std::shared_ptr<CirCache> ncc(new CirCache(ddir));
if (!ncc->open(CirCache::CC_OPWRITE)) { if (!ncc->open(CirCache::CC_OPWRITE)) {
if (reason) { if (reason) {
msg << "Open failed in " << ddir << " : " << msg << "Open failed in " << ddir << " : " << ncc->getReason() << endl;
ncc->getReason() << endl;
*reason = msg.str(); *reason = msg.str();
} }
return -1; return -1;

View File

@ -67,11 +67,14 @@ public:
virtual std::string getpath(); virtual std::string getpath();
// Set data to 0 if you just want the header // Set data to 0 if you just want the header
// instance == -1 means get latest. Otherwise specify from 1+
virtual bool get(const std::string& udi, std::string& dic, virtual bool get(const std::string& udi, std::string& dic,
std::string *data = 0, int instance = -1); std::string *data = 0, int instance = -1);
// Note: the dicp MUST have an udi entry // Note: the dicp MUST have an udi entry
enum PutFlags {NoCompHint = 1}; enum PutFlags {
NoCompHint = 1, // Do not attempt compression.
};
virtual bool put(const std::string& udi, const ConfSimple *dicp, virtual bool put(const std::string& udi, const ConfSimple *dicp,
const std::string& data, unsigned int flags = 0); const std::string& data, unsigned int flags = 0);
@ -99,15 +102,27 @@ public:
/* Utility: append all entries from sdir to ddir. /* Utility: append all entries from sdir to ddir.
* *
* This does not need to be a member at all, just using the namespace here. * ddir must already exist. It will be appropriately resized if
* needed to avoid recycling while writing the new entries.
* ** Note that if dest is not currently growing, this action
* will recycle old dest entries between the current write
* point and EOF (or up to wherever we need to write to store
* the source data) **
* Also note that if the objective is just to compact (reuse the erased
* entries space) you should first create the new circache with the
* same maxsize as the old one, else the new maxsize will be the
* current file size (current erased+active entries, with
* available space corresponding to the old erased entries).
* *
* @param ddir destination circache (must be previously created * This method does not need to be a member at all, just using the
* with appropriate size) * namespace here.
*
* @param ddir destination circache (must exist)
* @param sdir source circache * @param sdir source circache
* @ret number of entries copied or -a * @ret number of entries copied or -a
*/ */
static int append(const std::string ddir, const std::string& sdir, static int appendCC(const std::string ddir, const std::string& sdir,
std::string *reason = 0); std::string *reason = 0);
protected: protected:
CirCacheInternal *m_d; CirCacheInternal *m_d;