resultstore: 1st working. Interface needs improvements

This commit is contained in:
Jean-Francois Dockes 2020-12-17 11:15:07 +01:00
parent 3479e7cd85
commit ea9b5ab9eb
10 changed files with 445 additions and 159 deletions

View File

@ -156,6 +156,8 @@ query/filtseq.cpp \
query/filtseq.h \ query/filtseq.h \
query/plaintorich.cpp \ query/plaintorich.cpp \
query/plaintorich.h \ query/plaintorich.h \
query/qresultstore.cpp \
query/qresultstore.h \
query/recollq.cpp \ query/recollq.cpp \
query/recollq.h \ query/recollq.h \
query/reslistpager.cpp \ query/reslistpager.cpp \

View File

@ -828,19 +828,6 @@ typedef struct recoll_DbObject {
std::shared_ptr<RclConfig> rclconfig; std::shared_ptr<RclConfig> rclconfig;
} recoll_DbObject; } recoll_DbObject;
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
int rowcount; // Number of records returned by last execute
string *sortfield; // Need to allocate in here, main program is C.
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
PyDoc_STRVAR(doc_Query_close, PyDoc_STRVAR(doc_Query_close,
"close(). Deallocate query. Object is unusable after the call." "close(). Deallocate query. Object is unusable after the call."
); );
@ -1521,7 +1508,7 @@ PyDoc_STRVAR(doc_QueryObject,
"Recoll Query objects are used to execute index searches. \n" "Recoll Query objects are used to execute index searches. \n"
"They must be created by the Db.query() method.\n" "They must be created by the Db.query() method.\n"
); );
static PyTypeObject recoll_QueryType = { PyTypeObject recoll_QueryType = {
PyVarObject_HEAD_INIT(NULL, 0) PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.Query", /*tp_name*/ "_recoll.Query", /*tp_name*/
sizeof(recoll_QueryObject), /*tp_basicsize*/ sizeof(recoll_QueryObject), /*tp_basicsize*/
@ -2195,6 +2182,12 @@ PyInit__recoll(void)
Py_INCREF(&rclx_ExtractorType); Py_INCREF(&rclx_ExtractorType);
PyModule_AddObject(module, "Extractor", (PyObject *)&rclx_ExtractorType); PyModule_AddObject(module, "Extractor", (PyObject *)&rclx_ExtractorType);
if (PyType_Ready(&recoll_QResultStoreType) < 0)
INITERROR;
Py_INCREF(&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore", (PyObject *)&recoll_QResultStoreType);
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
return module; return module;
#endif #endif

View File

@ -22,8 +22,13 @@
#include <Python.h> #include <Python.h>
#include <memory> #include <memory>
#include <string>
class RclConfig; class RclConfig;
namespace Rcl {
class Doc;
class Query;
};
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
@ -33,7 +38,24 @@ typedef struct {
std::shared_ptr<RclConfig> rclconfig; std::shared_ptr<RclConfig> rclconfig;
} recoll_DocObject; } recoll_DocObject;
extern PyTypeObject rclx_ExtractorType; struct recoll_DbObject;
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
int rowcount; // Number of records returned by last execute
std::string *sortfield; // Need to allocate in here, main program is C.
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
extern PyTypeObject recoll_DocType; extern PyTypeObject recoll_DocType;
extern PyTypeObject recoll_QueryType;
extern PyTypeObject rclx_ExtractorType;
extern PyTypeObject recoll_QResultStoreType;
#endif // _PYRECOLL_H_INCLUDED_ #endif // _PYRECOLL_H_INCLUDED_

View File

@ -0,0 +1,321 @@
/* Copyright (C) 2007-2020 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>
#include <string>
#include <iostream>
#include <set>
#include "qresultstore.h"
#include "pyrecoll.h"
#include "log.h"
using namespace std;
#if PY_MAJOR_VERSION >=3
# define Py_TPFLAGS_HAVE_ITER 0
#else
#define PyLong_FromLong PyInt_FromLong
#endif
// Instance layout of the Python QResultStore object: the standard
// Python object header followed by a pointer to the wrapped C++ store.
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
// Created in QResultStore_new() and deleted in QResultStore_dealloc().
Rcl::QResultStore *store;
} recoll_QResultStoreObject;
// tp_dealloc slot: destroy the wrapped C++ store, then hand the object
// memory back through the type's tp_free.
static void
QResultStore_dealloc(recoll_QResultStoreObject *self)
{
    LOGDEB("QResultStore_dealloc.\n");
    Rcl::QResultStore *owned = self->store;
    delete owned;
    PyObject *asobject = (PyObject*)self;
    Py_TYPE(asobject)->tp_free(asobject);
}
// tp_new slot: allocate the Python object and create the C++ store it wraps.
//
// Returns the new object, or NULL with a Python exception set if either
// the object or the store could not be allocated.
static PyObject *
QResultStore_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    LOGDEB("QResultStore_new\n");
    recoll_QResultStoreObject *self =
        (recoll_QResultStoreObject *)type->tp_alloc(type, 0);
    if (nullptr == self)
        return nullptr;
    self->store = nullptr;
    // A C++ exception must not unwind through the interpreter's C frames:
    // catch it here and report it as a Python error instead.
    try {
        self->store = new Rcl::QResultStore();
    } catch (...) {
        // store is still null, so the dealloc triggered here is harmless.
        Py_DECREF((PyObject *)self);
        return PyErr_NoMemory();
    }
    return (PyObject *)self;
}
// Docstring for the QResultStore type.
PyDoc_STRVAR(qrs_doc_QResultStoreObject,
             "QResultStore()\n"
             "\n"
             "A QResultStore can efficiently store query result documents.\n"
    );

// tp_init slot. Takes no arguments into account and has nothing to set
// up: it only logs the call and reports success.
static int
QResultStore_init(recoll_QResultStoreObject *self, PyObject *args,
                  PyObject *kwargs)
{
    (void)self;
    (void)args;
    (void)kwargs;
    LOGDEB("QResultStore_init\n");
    return 0;
}
PyDoc_STRVAR(
qrs_doc_storeQuery,
"storeQuery(query, fieldspec=[], isinc=False)\n"
"\n"
"Stores the results from the input query object, possibly "
"excluding/including the specified fields.\n"
);
static PyObject *
QResultStore_storeQuery(recoll_QResultStoreObject* self, PyObject *args,
PyObject *kwargs)
{
static const char* kwlist[] = {"query", "fieldspec", "isinc", NULL};
PyObject *q{nullptr};
PyObject *fieldspec{nullptr};
PyObject *isinco = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OO", (char**)kwlist,
&recoll_QueryType, &q, &fieldspec, &isinco))
return nullptr;
recoll_QueryObject *query = (recoll_QueryObject*)q;
if (nullptr == query->query) {
PyErr_SetString(PyExc_ValueError,
"query not initialised (null query ?)");
return nullptr;
}
bool isinc{false};
if (nullptr != isinco && PyObject_IsTrue(isinco))
isinc = true;
std::set<std::string> fldspec;
if (nullptr != fieldspec) {
// fieldspec must be either single string or list of strings
if (PyUnicode_Check(fieldspec)) {
PyObject *utf8o = PyUnicode_AsUTF8String(fieldspec);
if (nullptr == utf8o) {
PyErr_SetString(PyExc_AttributeError,
"storeQuery: can't encode field name??");
return nullptr;
}
fldspec.insert(PyBytes_AsString(utf8o));
Py_DECREF(utf8o);
} else if (PySequence_Check(fieldspec)) {
for (Py_ssize_t i = 0; i < PySequence_Size(fieldspec); i++) {
PyObject *utf8o =
PyUnicode_AsUTF8String(PySequence_GetItem(fieldspec, i));
if (nullptr == utf8o) {
PyErr_SetString(PyExc_AttributeError,
"storeQuery: can't encode field name??");
return nullptr;
}
fldspec.insert(PyBytes_AsString(utf8o));
Py_DECREF(utf8o);
}
} else {
PyErr_SetString(PyExc_TypeError,
"fieldspec arg must be str or sequence of str");
return nullptr;
}
}
self->store->storeQuery(*(query->query), fldspec, isinc);
Py_RETURN_NONE;
}
PyDoc_STRVAR(
    qrs_doc_getCount,
    "getCount()\n"
    "\n"
    "Return the stored results count.\n"
    );

// Python method getCount(): return the value of the store's getCount()
// as a Python integer. The args parameter is not used.
static PyObject *
QResultStore_getCount(recoll_QResultStoreObject* self, PyObject *args)
{
    (void)args;
    const long nresults = self->store->getCount();
    return PyLong_FromLong(nresults);
}
PyDoc_STRVAR(
    qrs_doc_getField,
    "getField(index, fieldname)\n"
    "\n"
    "Retrieve the value of field <fieldname> from result at index <index>.\n"
    );

// Python method getField(index, fieldname).
//
// Parses an int index and a string field name, then asks the store for
// the value. Returns a bytes object built from the stored C string, or
// None when the store returns a null pointer for this index/field.
// Returns NULL with a Python exception set if the arguments are invalid.
static PyObject *
QResultStore_getField(recoll_QResultStoreObject* self, PyObject *args)
{
    int index;
    const char *fieldname;
    if (!PyArg_ParseTuple(args, "is", &index, &fieldname)) {
        return nullptr;
    }
    const char *result = self->store->fieldValue(index, fieldname);
    if (nullptr == result) {
        Py_RETURN_NONE;
    }
    return PyBytes_FromString(result);
}
// Method table for QResultStore objects. The calling-convention flags
// must match each C function's actual signature:
//  - storeQuery takes (self, args, kwargs): METH_VARARGS|METH_KEYWORDS
//  - getCount takes no Python arguments: METH_NOARGS
//  - getField takes (self, args): METH_VARARGS
static PyMethodDef QResultStore_methods[] = {
    {"storeQuery", (PyCFunction)QResultStore_storeQuery,
     METH_VARARGS|METH_KEYWORDS, qrs_doc_storeQuery},
    {"getCount", (PyCFunction)QResultStore_getCount,
     METH_NOARGS, qrs_doc_getCount},
    {"getField", (PyCFunction)QResultStore_getField,
     METH_VARARGS, qrs_doc_getField},
    {NULL}  /* Sentinel */
};
// Python type object for "_recoll.QResultStore".
// This is a positional initializer: the entries must stay in the slot
// order of PyTypeObject; the trailing comment on each line names the slot.
// The slots actually set are: name, instance size, dealloc, flags, doc,
// methods, init and new. All others are left null.
PyTypeObject recoll_QResultStoreType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QResultStore", /*tp_name*/
sizeof(recoll_QResultStoreObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QResultStore_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QResultStoreObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QResultStore_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QResultStore_init, /* tp_init */
0, /* tp_alloc */
QResultStore_new, /* tp_new */
};
//////////////////////////////////////////////////////////////////////////
// Module methods
// The module defines no module-level functions: the table only holds
// the terminating sentinel entry.
static PyMethodDef rclrstore_methods[] = {
    {nullptr, nullptr, 0, nullptr}  /* Sentinel */
};

// Text stored as the module's __doc__ by the module init function.
PyDoc_STRVAR(
    pyrclrstore_doc_string,
    "Utility module for efficiently storing many query results.\n");
// Per-module state: holds the exception object created by the module
// init function.
struct module_state {
PyObject *error;
};
// GETSTATE(m) yields a pointer to the module state for module m.
// With Python 3 this is the storage returned by PyModule_GetState();
// otherwise a single file-static instance is used and m is ignored.
#if PY_MAJOR_VERSION >= 3
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
#if PY_MAJOR_VERSION >= 3
// Module traverse hook: visit the exception object kept in the module
// state. Py_VISIT needs the parameters to be named visit and arg.
static int rclrstore_traverse(PyObject *m, visitproc visit, void *arg)
{
    struct module_state *state = GETSTATE(m);
    Py_VISIT(state->error);
    return 0;
}
// Module clear hook: drop the reference to the exception object kept in
// the module state.
static int rclrstore_clear(PyObject *m)
{
    struct module_state *state = GETSTATE(m);
    Py_CLEAR(state->error);
    return 0;
}
// Python 3 module definition for "_rclrstore". Positional initializer:
// the entries must stay in PyModuleDef field order.
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_rclrstore", // module name
NULL, // no docstring here: __doc__ is added by the init function
sizeof(struct module_state), // size of the per-module state
rclrstore_methods, // module-level functions (sentinel only)
NULL, // unused slot
rclrstore_traverse, // traverse hook
rclrstore_clear, // clear hook
NULL // no free hook
};
#define INITERROR return NULL
// PyModule_Create() returns a new reference, to be dropped on failure.
#define QRS_DROP_MODULE(m) Py_DECREF(m)
extern "C" PyObject *
PyInit__rclrstore(void)
#else
#define INITERROR return
// Py_InitModule() returns a borrowed reference: nothing to drop on failure.
#define QRS_DROP_MODULE(m) ((void)0)
PyMODINIT_FUNC
init__rclrstore(void)
#endif
{
    // Module initialisation: create the module, its Error exception, and
    // register the QResultStore type and the module docstring.
    // On any failure, the function returns through INITERROR with the
    // Python exception set by the failing call.

    // Note: we can't call recollinit here, because the confdir is only really
    // known when the first db object is created (it is an optional parameter).
    // Using a default here may end up with variables such as stripchars being
    // wrong
#if PY_MAJOR_VERSION >= 3
    PyObject *module = PyModule_Create(&moduledef);
#else
    PyObject *module = Py_InitModule("_rclrstore", rclrstore_methods);
#endif
    if (module == NULL)
        INITERROR;
    struct module_state *st = GETSTATE(module);

    // The first parameter is a non-const char * with Python 2. Use a
    // static writable buffer instead of a strdup() copy which would never
    // be freed.
    static char excname[] = "_rclrstore.Error";
    st->error = PyErr_NewException(excname, NULL, NULL);
    if (st->error == NULL) {
        QRS_DROP_MODULE(module);
        INITERROR;
    }

    if (PyType_Ready(&recoll_QResultStoreType) < 0) {
        QRS_DROP_MODULE(module);
        INITERROR;
    }
    // PyModule_AddObject() only steals the reference on success.
    Py_INCREF((PyObject*)&recoll_QResultStoreType);
    if (PyModule_AddObject(module, "QResultStore",
                           (PyObject *)&recoll_QResultStoreType) < 0) {
        Py_DECREF((PyObject*)&recoll_QResultStoreType);
        QRS_DROP_MODULE(module);
        INITERROR;
    }

    if (PyModule_AddStringConstant(module, "__doc__",
                                   pyrclrstore_doc_string) < 0) {
        QRS_DROP_MODULE(module);
        INITERROR;
    }

#if PY_MAJOR_VERSION >= 3
    return module;
#endif
}

View File

@ -0,0 +1,23 @@
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# We used to have two C extensions: recoll and rclextract, which was a really
# bad idea. They are now merged into the _recoll C extension module. The two
# python modules recoll.py and rclextract.py only exist for compatibility (for
# now: maybe we'll do something with them in the future).
from ._recoll import QResultStore

View File

@ -38,13 +38,15 @@ module1 = Extension('_recoll',
os.path.join(top, 'internfile'), os.path.join(top, 'internfile'),
os.path.join(top, 'rcldb'), os.path.join(top, 'rcldb'),
os.path.join(top, 'query'), os.path.join(top, 'query'),
os.path.join(top, 'unac') os.path.join(top, 'unac'),
os.path.join(top, 'testmains')
], ],
extra_compile_args = extra_compile_args, extra_compile_args = extra_compile_args,
libraries = libraries, libraries = libraries,
library_dirs = library_dirs, library_dirs = library_dirs,
runtime_library_dirs = runtime_library_dirs, runtime_library_dirs = runtime_library_dirs,
sources = [os.path.join(pytop, 'pyrecoll.cpp'), sources = [os.path.join(pytop, 'pyrecoll.cpp'),
os.path.join(pytop, 'pyresultstore.cpp'),
os.path.join(pytop, 'pyrclextract.cpp') os.path.join(pytop, 'pyrclextract.cpp')
]) ])

View File

@ -32,15 +32,17 @@
#include "rcldoc.h" #include "rcldoc.h"
#include "rclquery.h" #include "rclquery.h"
namespace Rcl {
class QResultStore::Internal { class QResultStore::Internal {
public: public:
bool testentry(const std::pair<std::string,std::string>& entry) { bool testentry(const std::pair<std::string,std::string>& entry) {
return !entry.second.empty() && return !entry.second.empty() &&
excludedfields.find(entry.first) == excludedfields.end(); (isinc ? fieldspec.find(entry.first) != fieldspec.end() :
fieldspec.find(entry.first) == fieldspec.end());
} }
std::map<std::string, int> keyidx; std::map<std::string, int> keyidx;
int ndocs{0};
// Notes: offsets[0] is always 0, not really useful, simpler this // Notes: offsets[0] is always 0, not really useful, simpler this
// way. Also could use simple C array instead of c++ vector... // way. Also could use simple C array instead of c++ vector...
struct docoffs { struct docoffs {
@ -51,7 +53,8 @@ public:
std::vector<int> offsets; std::vector<int> offsets;
}; };
std::vector<struct docoffs> docs; std::vector<struct docoffs> docs;
std::set<std::string> excludedfields; std::set<std::string> fieldspec;
bool isinc{false};
}; };
QResultStore::QResultStore() QResultStore::QResultStore()
@ -63,14 +66,17 @@ QResultStore::~QResultStore()
delete m; delete m;
} }
// For reference : Fields normally excluded by uprcl:
//{"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption", // {"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption",
// "filename", "origcharset", "sig"}; // "filename", "origcharset", "sig"};
bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl) bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> fldspec,
bool isinc)
{ {
m->excludedfields = excl; m->fieldspec = fldspec;
m->isinc = isinc;
///////////// /////////////
// Enumerate all existing keys and assign array indexes for // Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it. // them. Count documents while we are at it.
@ -81,10 +87,11 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
{"fbytes", 4}, {"fbytes", 4},
{"dbytes", 5} {"dbytes", 5}
}; };
m->ndocs = 0;
for (;;m->ndocs++) { int count = 0;
for (;;count++) {
Rcl::Doc doc; Rcl::Doc doc;
if (!query.getDoc(m->ndocs, doc, false)) { if (!query.getDoc(count, doc, false)) {
break; break;
} }
for (const auto& entry : doc.meta) { for (const auto& entry : doc.meta) {
@ -101,9 +108,9 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
/////// ///////
// Populate the main array with doc-equivalent structures. // Populate the main array with doc-equivalent structures.
m->docs.resize(m->ndocs); m->docs.resize(count);
for (int i = 0; i < m->ndocs; i++) { for (int i = 0; i < count; i++) {
Rcl::Doc doc; Rcl::Doc doc;
if (!query.getDoc(i, doc, false)) { if (!query.getDoc(i, doc, false)) {
break; break;
@ -168,20 +175,25 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
return true; return true;
} }
const char *QResultStore::fieldvalue(int docindex, const std::string& fldname) int QResultStore::getCount()
{ {
if (docindex < 0 || docindex >= m->ndocs) { return int(m->docs.size());
}
const char *QResultStore::fieldValue(int docindex, const std::string& fldname)
{
if (docindex < 0 || docindex >= int(m->docs.size())) {
return nullptr; return nullptr;
} }
auto& vdoc = m->docs[docindex]; auto& vdoc = m->docs[docindex];
auto it = m->keyidx.find(fldname); auto it = m->keyidx.find(fldname);
if (it == m->keyidx.end()) { if (it == m->keyidx.end() ||
return nullptr; it->second < 0 || it->second >= int(vdoc.offsets.size())) {
}
if (it->second < 0 || it->second >= int(vdoc.offsets.size())) {
//?? //??
return nullptr; return nullptr;
} }
return vdoc.base + vdoc.offsets[it->second]; return vdoc.base + vdoc.offsets[it->second];
} }
} // namespace Rcl

View File

@ -17,24 +17,49 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/ */
#ifndef _QRESULTSTORE_H_INCLUDED_ #ifndef _QRESULTSTORE_H_INCLUDED_
#define _QRESULTSTORE_H_INCLUDED_ #define _QRESULTSTORE_H_INCLUDED_
/**
* Implement an efficient way to store the whole or part of a query result set.
* This would naturally be done as a vector<Rcl::Doc>, but the natural
* way leads to a huge space waste (8-10x), which may be a problem in
* some cases. This is mostly used by the uprcl Media Server.
*/
#include <string> #include <string>
#include <set> #include <set>
namespace Rcl { namespace Rcl {
class Query; class Query;
}
class QResultStore { class QResultStore {
public: public:
QResultStore(); QResultStore();
~QResultStore(); ~QResultStore();
bool storeQuery(Rcl::Query& q, std::set<std::string> excluded = {}); /**
const char *fieldvalue(int docindex, const std::string& fldname); * Fetch and store the results of the input query.
*
* @param q the executed query object to use for fetching results.
* @param fldspec list of fields to be excluded or included.
* @param isinc if true, the field list defines the fields to be stored,
* else, those to be excluded.
*/
bool storeQuery(Rcl::Query& q, std::set<std::string> fldspec = {},
bool isinc = false);
/** Retrieve count of stored results */
int getCount();
/**
* Retrieve field value.
*
* @param docindex index in query results.
* @param fldname field name.
*/
const char *fieldValue(int docindex, const std::string& fldname);
QResultStore(const QResultStore&) = delete; QResultStore(const QResultStore&) = delete;
QResultStore& operator=(const QResultStore&) = delete; QResultStore& operator=(const QResultStore&) = delete;
@ -43,4 +68,5 @@ private:
Internal *m{nullptr}; Internal *m{nullptr};
}; };
}
#endif /* _QRESULTSTORE_H_INCLUDED_ */ #endif /* _QRESULTSTORE_H_INCLUDED_ */

View File

@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
$(DEFS) $(DEFS)
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata unac mbox \ noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata unac mbox \
circache wipedir mimetype pathut fileudi x11mon rclqdocmem circache wipedir mimetype pathut fileudi x11mon trqrstore
circache_SOURCES = trcircache.cpp circache_SOURCES = trcircache.cpp
circache_LDADD = ../librecoll.la circache_LDADD = ../librecoll.la
@ -80,5 +80,5 @@ wipedir_LDADD = ../librecoll.la
x11mon_SOURCES = trx11mon.cpp x11mon_SOURCES = trx11mon.cpp
x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11 x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11
rclqdocmem_SOURCES = rclqdocmem.cpp qresultstore.cpp trqrstore_SOURCES = trqrstore.cpp
rclqdocmem_LDADD = ../librecoll.la trqrstore_LDADD = ../librecoll.la

View File

@ -306,8 +306,7 @@ int main(int argc, char *argv[])
// This uses 19 MB of storage for the audio index, and 72 MB for // This uses 19 MB of storage for the audio index, and 72 MB for
// the main one (less keys->less gain) // the main one (less keys->less gain)
{ {
#if 1 Rcl::QResultStore store;
QResultStore store;
bool result = store.storeQuery( bool result = store.storeQuery(
query, {"author", "ipath", "rcludi", "relevancyrating", query, {"author", "ipath", "rcludi", "relevancyrating",
"sig","abstract", "caption", "filename", "origcharset", "sig"}); "sig","abstract", "caption", "filename", "origcharset", "sig"});
@ -316,121 +315,7 @@ int main(int argc, char *argv[])
return 1; return 1;
} }
meminfo("After storing"); meminfo("After storing");
std::cerr << "url 20 " << store.fieldvalue(20, "url") << "\n"; std::cerr << "url 20 " << store.fieldValue(20, "url") << "\n";
#else
/////////////
// Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it.
std::map<std::string, int> keyidx {
{"url",0},
{"mimetype", 1},
{"fmtime", 2},
{"dmtime", 3},
{"fbytes", 4},
{"dbytes", 5},
};
int ndocs = 0;
for (;;ndocs++) {
Rcl::Doc doc;
if (!query.getDoc(ndocs, doc, false)) {
break;
}
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
if (it == keyidx.end()) {
int idx = keyidx.size();
keyidx.insert({entry.first, idx});
};
}
}
}
// The audio db has 49 keys !
std::cerr << "Found " << keyidx.size() << " different keys\n";
///////
// Populate the main array with doc-equivalent structures.
// Notes: offsets[0] is always 0, not really useful, simpler this way. Also
// could use simple C array instead of c++ vector...
struct docoffs {
~docoffs() {
free(base);
}
char *base{nullptr};
std::vector<int> offsets;
};
std::vector<struct docoffs> docs;
docs.resize(ndocs);
meminfo("After resize");
for (int i = 0; i < ndocs; i++) {
Rcl::Doc doc;
if (!query.getDoc(i, doc, false)) {
break;
}
auto& vdoc = docs[i];
vdoc.offsets.resize(keyidx.size());
int nbytes =
doc.url.size() + 1 +
doc.mimetype.size() + 1 +
doc.fmtime.size() + 1 +
doc.dmtime.size() + 1 +
doc.fbytes.size() + 1 +
doc.dbytes.size() + 1;
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
if (keyidx.find(entry.first) == keyidx.end()) {
std::cerr << "Unknown key: " << entry.first << "\n";
abort();
}
nbytes += entry.second.size() + 1;
}
}
char *cp = (char*)malloc(nbytes);
if (nullptr == cp) {
abort();
}
#define STRINGCPCOPY(CHARP, S) do { \
memcpy(CHARP, S.c_str(), S.size()+1); \
CHARP += S.size()+1; \
} while (false);
vdoc.base = cp;
vdoc.offsets[0] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.url);
vdoc.offsets[1] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.mimetype);
vdoc.offsets[2] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fmtime);
vdoc.offsets[3] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dmtime);
vdoc.offsets[4] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fbytes);
vdoc.offsets[5] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dbytes);
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
if (it == keyidx.end()) {
std::cerr << "Unknown key: " << entry.first << "\n";
abort();
}
if (it->second <= 5) {
// Already done ! Storing another address would be
// wasteful and crash when freeing...
continue;
}
vdoc.offsets[it->second] = cp - vdoc.base;
STRINGCPCOPY(cp, entry.second);
}
}
}
meminfo("After storing");
#endif
} }
#elif defined(STORE_ALLOBSTACK) #elif defined(STORE_ALLOBSTACK)