result storing experiments

This commit is contained in:
Jean-Francois Dockes 2020-12-15 19:33:32 +01:00
parent 5c2f62ae00
commit b7f0654526
4 changed files with 289 additions and 29 deletions

View File

@ -80,5 +80,5 @@ wipedir_LDADD = ../librecoll.la
x11mon_SOURCES = trx11mon.cpp
x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11
rclqdocmem_SOURCES = rclqdocmem.cpp
rclqdocmem_SOURCES = rclqdocmem.cpp qresultstore.cpp
rclqdocmem_LDADD = ../librecoll.la

View File

@ -0,0 +1,187 @@
/* Copyright (C) 2017-2020 J.F.Dockes
*
* License: GPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "qresultstore.h"
#include <string>
#include <iostream>
#include <map>
#include <vector>
#include <malloc.h>
#include <unistd.h>
#include <string.h>
#include "rcldoc.h"
#include "rclquery.h"
/**
 * Private data for QResultStore.
 *
 * Holds the field name to array index mapping, shared by all stored
 * documents, and one docoffs entry per document.
 */
class QResultStore::Internal {
public:
    /**
     * Tell if a metadata entry should be stored: its value must not be
     * empty and its name must not belong to the excluded fields set.
     *
     * This is a template so that the entry is always bound by reference,
     * whatever the exact pair type used by the container. With a fixed
     * std::pair<std::string,std::string> parameter, an entry with a const
     * key (the value_type of the standard maps) would be converted
     * through a temporary, copying both strings on every call.
     *
     * @param entry (field name, field value) pair.
     * @return true if the entry should be stored.
     */
    template <class Entry> bool testentry(const Entry& entry) const {
        return !entry.second.empty() &&
            excludedfields.find(entry.first) == excludedfields.end();
    }

    // Field name to index in the docoffs::offsets arrays.
    std::map<std::string, int> keyidx;
    // Number of stored documents.
    int ndocs{0};

    // Storage for one document: a single malloc'd buffer holding the
    // field values, and the offset of each value inside the buffer.
    // Notes: offsets[0] is always 0, not really useful, simpler this
    // way. Also could use simple C array instead of c++ vector...
    struct docoffs {
        docoffs() = default;
        ~docoffs() {
            free(base);
        }
        // The object owns 'base': copying it would result in the buffer
        // being freed twice. Only allow moves, which std::vector uses
        // when it needs to reallocate its storage.
        docoffs(const docoffs&) = delete;
        docoffs& operator=(const docoffs&) = delete;
        docoffs(docoffs&& other) noexcept
            : base(other.base) {
            other.base = nullptr;
            offsets.swap(other.offsets);
        }
        docoffs& operator=(docoffs&& other) noexcept {
            if (this != &other) {
                free(base);
                base = other.base;
                other.base = nullptr;
                offsets.swap(other.offsets);
                other.offsets.clear();
            }
            return *this;
        }
        char *base{nullptr};
        std::vector<int> offsets;
    };
    std::vector<struct docoffs> docs;

    // Names of the fields which should not be stored.
    std::set<std::string> excludedfields;
};
// Create an empty store by allocating the private data object.
QResultStore::QResultStore()
    : m(new Internal)
{
}
// Release the private data object. This destroys the stored documents,
// each of which frees its own value buffer.
QResultStore::~QResultStore()
{
delete m;
}
// Example of a set of excluded field names which can be passed to
// storeQuery():
// {"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption",
//  "filename", "origcharset", "sig"};
/**
 * Read all the results from a query and store them in compact form.
 *
 * The query results are walked twice: first to assign an array index to
 * every field name and count the documents, then to copy the field
 * values of each document into one malloc'd buffer per document, while
 * recording the offset of each value.
 *
 * Any data stored by a previous call is released first.
 *
 * A field which has no stored value for a given document gets the
 * offset of an empty string (and not offset 0, which is where the url
 * is stored).
 *
 * The process is aborted if a buffer can't be allocated.
 *
 * @param query the query to read the results from.
 * @param excl names of the metadata fields which should not be stored.
 * @return true for success. false if a document counted by the first
 *   pass could not be read back by the second one, in which case only
 *   the documents which come before it are kept.
 */
bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
{
    // The url, mimetype, etc. fields are always stored, and they use the
    // first entries in the offsets arrays.
    const int nfixed = 6;

    // excl is our own copy, no need to duplicate it once more.
    m->excludedfields.swap(excl);

    // Release the buffers from a previous call, if any. Overwriting the
    // base pointers would leak them.
    m->docs.clear();

    /////////////
    // Enumerate all existing keys and assign array indexes for
    // them. Count documents while we are at it.
    m->keyidx = {{"url",0},
                 {"mimetype", 1},
                 {"fmtime", 2},
                 {"dmtime", 3},
                 {"fbytes", 4},
                 {"dbytes", 5}
    };
    m->ndocs = 0;
    for (;;m->ndocs++) {
        Rcl::Doc doc;
        if (!query.getDoc(m->ndocs, doc, false)) {
            break;
        }
        for (const auto& entry : doc.meta) {
            if (m->testentry(entry) &&
                m->keyidx.find(entry.first) == m->keyidx.end()) {
                // Compute the index before inserting: the new key gets
                // the next free slot.
                int idx = static_cast<int>(m->keyidx.size());
                m->keyidx.insert({entry.first, idx});
            }
        }
    }

    ///////
    // Populate the main array with doc-equivalent structures.
    m->docs.resize(m->ndocs);
    for (int i = 0; i < m->ndocs; i++) {
        Rcl::Doc doc;
        if (!query.getDoc(i, doc, false)) {
            // Should not happen as the first pass succeeded in reading
            // this document. Only keep what was actually stored, and
            // report the problem.
            m->ndocs = i;
            m->docs.resize(i);
            return false;
        }
        auto& vdoc = m->docs[i];
        // All offsets are initialized to 0 by the resize.
        vdoc.offsets.resize(m->keyidx.size());

        // The fixed fields, in index order.
        const std::string *fixed[nfixed] = {
            &doc.url, &doc.mimetype, &doc.fmtime,
            &doc.dmtime, &doc.fbytes, &doc.dbytes
        };

        // Compute the buffer size: each value is stored with its
        // terminating null byte. This must count exactly the values
        // which are copied further down.
        size_t nbytes = 0;
        for (const std::string *value : fixed) {
            nbytes += value->size() + 1;
        }
        for (const auto& entry : doc.meta) {
            if (!m->testentry(entry)) {
                continue;
            }
            auto it = m->keyidx.find(entry.first);
            if (it == m->keyidx.end() || it->second < nfixed) {
                continue;
            }
            nbytes += entry.second.size() + 1;
        }

        char *cp = static_cast<char*>(malloc(nbytes));
        if (nullptr == cp) {
            abort();
        }
        vdoc.base = cp;

        // Record the current position as the offset for field idx, then
        // copy the value, including the final null byte.
        auto store = [&vdoc, &cp](int idx, const std::string& value) {
            vdoc.offsets[idx] = static_cast<int>(cp - vdoc.base);
            memcpy(cp, value.c_str(), value.size() + 1);
            cp += value.size() + 1;
        };

        for (int idx = 0; idx < nfixed; idx++) {
            store(idx, *fixed[idx]);
        }
        for (const auto& entry : doc.meta) {
            if (!m->testentry(entry)) {
                continue;
            }
            auto it = m->keyidx.find(entry.first);
            if (it == m->keyidx.end()) {
                // No space was reserved for this value, and there is no
                // index to store it at: skip it.
                std::cerr << "Unknown key: " << entry.first << "\n";
                continue;
            }
            if (it->second < nfixed) {
                // Already done ! Storing another address would be
                // wasteful.
                continue;
            }
            store(it->second, entry.second);
        }

        // Fields for which this document has no value still have a 0
        // offset, which would designate the url. Have them point to the
        // last byte in the buffer instead. This is always a null byte,
        // so that they read as empty strings. Only index 0 can
        // legitimately hold a 0 offset.
        const int emptyoff = static_cast<int>(cp - 1 - vdoc.base);
        for (size_t idx = nfixed; idx < vdoc.offsets.size(); idx++) {
            if (vdoc.offsets[idx] == 0) {
                vdoc.offsets[idx] = emptyoff;
            }
        }
    }
    return true;
}
/**
 * Retrieve the stored value of a field for a given document.
 *
 * @param docindex index of the document in the stored results.
 * @param fldname name of the field.
 * @return a pointer to the value inside the document buffer, or nullptr
 *   if docindex is out of range, if no index was assigned to fldname, or
 *   if the index is outside of the document offsets array.
 */
const char *QResultStore::fieldvalue(int docindex, const std::string& fldname)
{
    const bool inrange = docindex >= 0 && docindex < m->ndocs;
    if (!inrange) {
        return nullptr;
    }

    const auto keypos = m->keyidx.find(fldname);
    if (keypos == m->keyidx.end()) {
        return nullptr;
    }

    const int slot = keypos->second;
    auto& stored = m->docs[docindex];
    const int nslots = int(stored.offsets.size());
    if (slot >= 0 && slot < nslots) {
        return stored.base + stored.offsets[slot];
    }
    // The offsets array does not cover this field index.
    return nullptr;
}

View File

@ -0,0 +1,46 @@
/* Copyright (C) 2017-2020 J.F.Dockes
*
* License: GPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef _QRESULTSTORE_H_INCLUDED_
#define _QRESULTSTORE_H_INCLUDED_
#include <string>
#include <set>
namespace Rcl {
class Query;
}

/**
 * Storage for the field values of query results.
 *
 * The data itself is held by a private Internal object, defined in the
 * implementation file. Objects of this class can't be copied.
 */
class QResultStore {
public:
    /** Private data holder, defined with the implementation. */
    class Internal;

    QResultStore();
    ~QResultStore();
    QResultStore(const QResultStore&) = delete;
    QResultStore& operator=(const QResultStore&) = delete;

    /**
     * Read and store the results from a query.
     * @param q the query to read the results from.
     * @param excluded names of the fields which should not be stored.
     *   The default is to exclude nothing.
     * @return success indicator.
     */
    bool storeQuery(Rcl::Query& q, std::set<std::string> excluded = {});

    /**
     * Retrieve a stored field value.
     * @param docindex index of the document in the stored results.
     * @param fldname name of the field.
     * @return pointer to the value, or nullptr if docindex is out of
     *   range or fldname is not the name of a stored field.
     */
    const char *fieldvalue(int docindex, const std::string& fldname);

private:
    Internal *m{nullptr};
};
#endif /* _QRESULTSTORE_H_INCLUDED_ */

View File

@ -44,7 +44,7 @@
#include "plaintorich.h"
#include "hldata.h"
#include "smallut.h"
#include "qresultstore.h"
//const std::string confdir{"/home/dockes/.recoll-prod"};
const std::string confdir{"/var/cache/upmpdcli/uprcl"};
@ -265,7 +265,6 @@ int main(int argc, char *argv[])
}
#elif defined(STORE_ARRAYS)
//
// Each result stored as a vector<const char*> with a shared
// key->intidx map to store the key name to index mapping, and
@ -302,6 +301,26 @@ int main(int argc, char *argv[])
// performance impact which should be quite modest.
// ** This supposes that we don't use obstack though, as obstack
// placement is unpredictable.
//
// This is the solution now implemented: no obstack, use a struct with offsets.
// This uses 19 MB of storage for the audio index, and 72 MB for
// the main one (fewer keys -> less gain)
{
#if 1
QResultStore store;
bool result = store.storeQuery(
query, {"author", "ipath", "rcludi", "relevancyrating",
"sig","abstract", "caption", "filename", "origcharset", "sig"});
if (!result) {
std::cerr << "storeQuery failed\n";
return 1;
}
meminfo("After storing");
std::cerr << "url 20 " << store.fieldvalue(20, "url") << "\n";
#else
/////////////
// Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it.
std::map<std::string, int> keyidx {
{"url",0},
{"mimetype", 1},
@ -310,7 +329,6 @@ int main(int argc, char *argv[])
{"fbytes", 4},
{"dbytes", 5},
};
int ndocs = 0;
for (;;ndocs++) {
Rcl::Doc doc;
@ -327,10 +345,22 @@ int main(int argc, char *argv[])
}
}
}
// 49 keys !
// The audio db has 49 keys !
std::cerr << "Found " << keyidx.size() << " different keys\n";
std::vector<std::vector<char*>> docs;
///////
// Populate the main array with doc-equivalent structures.
// Notes: offsets[0] is always 0, not really useful, simpler this way. Also
// could use simple C array instead of c++ vector...
struct docoffs {
~docoffs() {
free(base);
}
char *base{nullptr};
std::vector<int> offsets;
};
std::vector<struct docoffs> docs;
docs.resize(ndocs);
meminfo("After resize");
@ -340,7 +370,7 @@ int main(int argc, char *argv[])
break;
}
auto& vdoc = docs[i];
vdoc.resize(keyidx.size());
vdoc.offsets.resize(keyidx.size());
int nbytes =
doc.url.size() + 1 +
doc.mimetype.size() + 1 +
@ -362,21 +392,25 @@ int main(int argc, char *argv[])
if (nullptr == cp) {
abort();
}
if (i < 2) {
std::cerr << "malloc returned " << (void*)cp << "\n";
}
#define STRINGCPCOPY(CHARP, S) do { \
memcpy(CHARP, S.c_str(), S.size()+1); \
CHARP += S.size()+1; \
} while (false);
vdoc[0] = cp; STRINGCPCOPY(cp, doc.url);
vdoc[1] = cp; STRINGCPCOPY(cp, doc.mimetype);
vdoc[2] = cp; STRINGCPCOPY(cp, doc.fmtime);
vdoc[3] = cp; STRINGCPCOPY(cp, doc.dmtime);
vdoc[4] = cp; STRINGCPCOPY(cp, doc.fbytes);
vdoc[5] = cp; STRINGCPCOPY(cp, doc.dbytes);
vdoc.base = cp;
vdoc.offsets[0] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.url);
vdoc.offsets[1] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.mimetype);
vdoc.offsets[2] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fmtime);
vdoc.offsets[3] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dmtime);
vdoc.offsets[4] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fbytes);
vdoc.offsets[5] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dbytes);
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
@ -385,26 +419,19 @@ int main(int argc, char *argv[])
abort();
}
if (it->second <= 5) {
// Already done !
// Already done ! Storing another address would be
// wasteful and crash when freeing...
continue;
}
vdoc[it->second] = cp; STRINGCPCOPY(cp, entry.second);
vdoc.offsets[it->second] = cp - vdoc.base;
STRINGCPCOPY(cp, entry.second);
}
}
if (i < 2) {
std::cerr << "vdoc[0] " << (void*)vdoc[0] << "\n";
}
}
meminfo("After storing");
for (auto& vdoc : docs) {
if (!vdoc.empty()) {
//std::cerr << "Freeing " << (void*)(vdoc[0]) << "\n";
free(vdoc[0]);
}
}
meminfo("After free");
#endif
}
#elif defined(STORE_ALLOBSTACK)
//