Merge branch 'resultstore'

This commit is contained in:
Jean-Francois Dockes 2020-12-26 12:23:08 +01:00
commit 1fc5e0db1d
10 changed files with 628 additions and 271 deletions

View File

@ -156,6 +156,8 @@ query/filtseq.cpp \
query/filtseq.h \
query/plaintorich.cpp \
query/plaintorich.h \
query/qresultstore.cpp \
query/qresultstore.h \
query/recollq.cpp \
query/recollq.h \
query/reslistpager.cpp \

View File

@ -327,11 +327,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
return 0;
}
PyDoc_STRVAR(doc_Doc_getbinurl,
"getbinurl(none) -> binary url\n"
"\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n"
"file system path, without encoding\n"
PyDoc_STRVAR(
doc_Doc_getbinurl,
"getbinurl(none) -> binary url\n"
"\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n"
"file system path, without encoding\n"
);
static PyObject *
@ -339,17 +340,18 @@ Doc_getbinurl(recoll_DocObject *self)
{
LOGDEB0("Doc_getbinurl\n");
if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc");
return 0;
PyErr_SetString(PyExc_AttributeError, "doc is NULL");
Py_RETURN_NONE;
}
return PyBytes_FromStringAndSize(self->doc->url.c_str(),
self->doc->url.size());
self->doc->url.size());
}
PyDoc_STRVAR(doc_Doc_setbinurl,
"setbinurl(url) -> binary url\n"
"\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
PyDoc_STRVAR(
doc_Doc_setbinurl,
"setbinurl(url) -> binary url\n"
"\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
);
static PyObject *
@ -367,6 +369,8 @@ Doc_setbinurl(recoll_DocObject *self, PyObject *value)
self->doc->url = string(PyByteArray_AsString(value),
PyByteArray_Size(value));
printableUrl(self->rclconfig->getDefCharset(), self->doc->url,
self->doc->meta[Rcl::Doc::keyurl]);
Py_RETURN_NONE;
}
@ -387,7 +391,7 @@ Doc_keys(recoll_DocObject *self)
return 0;
for (const auto& entry : self->doc->meta) {
PyList_Append(pkeys,
PyUnicode_Decode(entry.first.c_str(),entry.first.size(),
PyUnicode_Decode(entry.first.c_str(), entry.first.size(),
"UTF-8", "replace"));
}
return pkeys;
@ -537,6 +541,23 @@ static PyMethodDef Doc_methods[] = {
{NULL} /* Sentinel */
};
// Copy the text of a Python str or bytes object into a C++ string.
//
// A unicode object is encoded to UTF-8 first; a bytes object is copied
// as is. Returns 0 on success, -1 if the object is of neither type or if
// the UTF-8 encoding fails. The output string is only written on success.
int pys2cpps(PyObject *pyval, std::string& out)
{
    if (!PyUnicode_Check(pyval)) {
        // Not unicode: the only other accepted input is a bytes object.
        if (!PyBytes_Check(pyval)) {
            return -1;
        }
        out = PyBytes_AsString(pyval);
        return 0;
    }

    PyObject *encoded = PyUnicode_AsUTF8String(pyval);
    if (encoded == 0) {
        return -1;
    }
    out = PyBytes_AsString(encoded);
    // The temporary bytes object is ours to release.
    Py_DECREF(encoded);
    return 0;
}
// Note that this returns None if the attribute is not found instead of raising
// an exception as would be standard. We don't change it to keep existing code
// working.
@ -560,18 +581,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
PyErr_Clear();
string name;
if (PyUnicode_Check(nameobj)) {
PyObject* utf8o = PyUnicode_AsUTF8String(nameobj);
if (utf8o == 0) {
LOGERR("Doc_getattro: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(nameobj)) {
name = PyBytes_AsString(nameobj);
} else {
if (pys2cpps(nameobj, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
Py_RETURN_NONE;
}
@ -588,7 +598,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
}
static int
Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
Doc_setattro(recoll_DocObject *self, PyObject *nameobj, PyObject *value)
{
if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??");
@ -599,84 +609,78 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
"Configuration not initialized");
return -1;
}
if (name == 0) {
PyErr_SetString(PyExc_AttributeError, "name??");
string name;
if (pys2cpps(nameobj, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
return -1;
}
if (PyBytes_Check(value)) {
value = PyUnicode_FromEncodedObject(value, "UTF-8", "strict");
if (value == 0)
return -1;
}
if (!PyUnicode_Check(value)) {
PyErr_SetString(PyExc_AttributeError, "value not unicode??");
string uvalue;
if (pys2cpps(value, uvalue) < 0) {
PyErr_SetString(PyExc_AttributeError, "value neither bytes nor str");
return -1;
}
PyObject* putf8 = PyUnicode_AsUTF8String(value);
if (putf8 == 0) {
LOGERR("Doc_setmeta: encoding to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "value??");
return -1;
}
string uvalue = PyBytes_AsString(putf8);
Py_DECREF(putf8);
string key = self->rclconfig->fieldQCanon(name);
LOGDEB0("Doc_setattr: doc " << self->doc << " [" << key << "] (" << name <<
") -> [" << uvalue << "]\n");
// We set the value in the meta array in all cases. Good idea ? or do it
// only for fields without a dedicated Doc:: entry?
self->doc->meta[key] = uvalue;
// Note that some attributes are set both as struct fields and
// meta members, keep compat with movedocfields() used when
// fetching from query.
switch (key.at(0)) {
case 't':
if (!key.compare("text")) {
if (key == "text") {
self->doc->text.swap(uvalue);
}
break;
case 'u':
if (!key.compare(Rcl::Doc::keyurl)) {
if (key == Rcl::Doc::keyurl) {
self->doc->url.swap(uvalue);
printableUrl(self->rclconfig->getDefCharset(), self->doc->url,
self->doc->meta[Rcl::Doc::keyurl]);
}
break;
case 'f':
if (!key.compare(Rcl::Doc::keyfs)) {
if (key == Rcl::Doc::keyfs) {
self->doc->fbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keyfmt)) {
self->doc->meta[Rcl::Doc::keyfs] = self->doc->fbytes;
} else if (key == Rcl::Doc::keyfmt) {
self->doc->fmtime.swap(uvalue);
}
break;
case 'd':
if (!key.compare(Rcl::Doc::keyds)) {
if (key == Rcl::Doc::keyds) {
self->doc->dbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keydmt)) {
self->doc->meta[Rcl::Doc::keyds] = self->doc->dbytes;
} else if (key == Rcl::Doc::keydmt) {
self->doc->dmtime.swap(uvalue);
}
break;
case 'i':
if (!key.compare(Rcl::Doc::keyipt)) {
if (key == Rcl::Doc::keyipt) {
self->doc->ipath.swap(uvalue);
self->doc->meta[Rcl::Doc::keyipt] = self->doc->ipath;
}
break;
case 'm':
if (!key.compare(Rcl::Doc::keytp)) {
if (key == Rcl::Doc::keytp) {
self->doc->mimetype.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keymt)) {
self->doc->meta[Rcl::Doc::keytp] = self->doc->mimetype;
} else if (key == Rcl::Doc::keymt) {
self->doc->dmtime.swap(uvalue);
}
break;
case 'o':
if (!key.compare(Rcl::Doc::keyoc)) {
if (key == Rcl::Doc::keyoc) {
self->doc->origcharset.swap(uvalue);
}
break;
case 's':
if (!key.compare(Rcl::Doc::keysig)) {
if (key == Rcl::Doc::keysig) {
self->doc->sig.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keysz)) {
} else if (key == Rcl::Doc::keysz) {
self->doc->dbytes.swap(uvalue);
}
break;
@ -697,6 +701,7 @@ Doc_length(recoll_DocObject *self)
static PyObject *
Doc_subscript(recoll_DocObject *self, PyObject *key)
{
// Can't just return getattro because this first checks for a method name
if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??");
return NULL;
@ -707,18 +712,7 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
return NULL;
}
string name;
if (PyUnicode_Check(key)) {
PyObject* utf8o = PyUnicode_AsUTF8String(key);
if (utf8o == 0) {
LOGERR("Doc_getitemo: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(key)) {
name = PyBytes_AsString(key);
} else {
if (pys2cpps(key, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??");
Py_RETURN_NONE;
}
@ -726,54 +720,61 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
string skey = self->rclconfig->fieldQCanon(name);
string value;
if (idocget(self, skey, value)) {
return PyUnicode_Decode(value.c_str(), value.size(), "UTF-8","replace");
return PyUnicode_Decode(value.c_str(), value.size(),
"UTF-8", "backslashreplace");
}
Py_RETURN_NONE;
}
// Mapping assignment slot (doc[key] = val): same processing as attribute
// assignment, so this just forwards to Doc_setattro().
//
// The interpreter calls this slot with val == NULL for "del doc[key]".
// Doc_setattro() converts the value without a NULL check, so deletion is
// rejected here with a TypeError instead of being forwarded.
//
// Returns 0 on success, -1 with a Python exception set on error.
static int
Doc_ass_subscript(recoll_DocObject *self, PyObject *key, PyObject *val)
{
    if (val == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Doc object does not support item deletion");
        return -1;
    }
    return Doc_setattro(self, key, val);
}
static PyMappingMethods doc_as_mapping = {
(lenfunc)Doc_length, /*mp_length*/
(binaryfunc)Doc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/
(objobjargproc)Doc_ass_subscript, /*mp_ass_subscript*/
};
PyDoc_STRVAR(doc_DocObject,
"Doc()\n"
"\n"
"A Doc object contains index data for a given document.\n"
"The data is extracted from the index when searching, or set by the\n"
"indexer program when updating. The Doc object has no useful methods but\n"
"many attributes to be read or set by its user. It matches exactly the\n"
"Rcl::Doc c++ object. Some of the attributes are predefined, but, \n"
"especially when indexing, others can be set, the name of which will be\n"
"processed as field names by the indexing configuration.\n"
"Inputs can be specified as unicode or strings.\n"
"Outputs are unicode objects.\n"
"All dates are specified as unix timestamps, printed as strings\n"
"Predefined attributes (index/query/both):\n"
" text (index): document plain text\n"
" url (both)\n"
" fbytes (both) optional) file size in bytes\n"
" filename (both)\n"
" fmtime (both) optional file modification date. Unix time printed \n"
" as string\n"
" dbytes (both) document text bytes\n"
" dmtime (both) document creation/modification date\n"
" ipath (both) value private to the app.: internal access path\n"
" inside file\n"
" mtype (both) mime type for original document\n"
" mtime (query) dmtime if set else fmtime\n"
" origcharset (both) charset the text was converted from\n"
" size (query) dbytes if set, else fbytes\n"
" sig (both) app-defined file modification signature. \n"
" For up to date checks\n"
" relevancyrating (query)\n"
" abstract (both)\n"
" author (both)\n"
" title (both)\n"
" keywords (both)\n"
PyDoc_STRVAR(
doc_DocObject,
"Doc()\n"
"\n"
"A Doc object contains index data for a given document.\n"
"The data is extracted from the index when searching, or set by the\n"
"indexer program when updating. The Doc object has no useful methods but\n"
"many attributes to be read or set by its user. It matches exactly the\n"
"Rcl::Doc c++ object. Some of the attributes are predefined, but, \n"
"especially when indexing, others can be set, the name of which will be\n"
"processed as field names by the indexing configuration.\n"
"Inputs can be specified as unicode or strings.\n"
"Outputs are unicode objects.\n"
"All dates are specified as unix timestamps, printed as strings\n"
"Predefined attributes (index/query/both):\n"
" text (index): document plain text\n"
" url (both)\n"
" fbytes (both) optional) file size in bytes\n"
" filename (both)\n"
" fmtime (both) optional file modification date. Unix time printed \n"
" as string\n"
" dbytes (both) document text bytes\n"
" dmtime (both) document creation/modification date\n"
" ipath (both) value private to the app.: internal access path\n"
" inside file\n"
" mtype (both) mime type for original document\n"
" mtime (query) dmtime if set else fmtime\n"
" origcharset (both) charset the text was converted from\n"
" size (query) dbytes if set, else fbytes\n"
" sig (both) app-defined file modification signature. \n"
" For up to date checks\n"
" relevancyrating (query)\n"
" abstract (both)\n"
" author (both)\n"
" title (both)\n"
" keywords (both)\n"
);
PyTypeObject recoll_DocType = {
@ -784,7 +785,7 @@ PyTypeObject recoll_DocType = {
(destructor)Doc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
(setattrfunc)Doc_setattr, /*tp_setattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
@ -794,7 +795,7 @@ PyTypeObject recoll_DocType = {
0, /*tp_call*/
0, /*tp_str*/
(getattrofunc)Doc_getattro,/*tp_getattro*/
0, /*tp_setattro*/
(setattrofunc)Doc_setattro,/*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
doc_DocObject, /* tp_doc */
@ -828,19 +829,6 @@ typedef struct recoll_DbObject {
std::shared_ptr<RclConfig> rclconfig;
} recoll_DbObject;
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
int rowcount; // Number of records returned by last execute
string *sortfield; // Need to allocate in here, main program is C.
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
PyDoc_STRVAR(doc_Query_close,
"close(). Deallocate query. Object is unusable after the call."
);
@ -1521,7 +1509,7 @@ PyDoc_STRVAR(doc_QueryObject,
"Recoll Query objects are used to execute index searches. \n"
"They must be created by the Db.query() method.\n"
);
static PyTypeObject recoll_QueryType = {
PyTypeObject recoll_QueryType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.Query", /*tp_name*/
sizeof(recoll_QueryObject), /*tp_basicsize*/
@ -2195,6 +2183,17 @@ PyInit__recoll(void)
Py_INCREF(&rclx_ExtractorType);
PyModule_AddObject(module, "Extractor", (PyObject *)&rclx_ExtractorType);
if (PyType_Ready(&recoll_QResultStoreType) < 0)
INITERROR;
Py_INCREF(&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore", (PyObject *)&recoll_QResultStoreType);
if (PyType_Ready(&recoll_QRSDocType) < 0)
INITERROR;
Py_INCREF((PyObject*)&recoll_QRSDocType);
PyModule_AddObject(module, "QRSDoc",
(PyObject *)&recoll_QRSDocType);
#if PY_MAJOR_VERSION >= 3
return module;
#endif

View File

@ -22,8 +22,13 @@
#include <Python.h>
#include <memory>
#include <string>
class RclConfig;
namespace Rcl {
class Doc;
class Query;
};
typedef struct {
PyObject_HEAD
@ -33,7 +38,27 @@ typedef struct {
std::shared_ptr<RclConfig> rclconfig;
} recoll_DocObject;
extern PyTypeObject rclx_ExtractorType;
struct recoll_DbObject;
// Instance layout for the _recoll.Query Python type (recoll_QueryType).
// Declared in this header so that other compilation units of the extension
// can access the wrapped Rcl::Query through the 'query' member.
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
int rowcount; // Number of records returned by last execute
std::string *sortfield; // Need to allocate in here, main program is C.
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
extern PyTypeObject recoll_DocType;
extern PyTypeObject recoll_QueryType;
extern PyTypeObject rclx_ExtractorType;
extern PyTypeObject recoll_QResultStoreType;
extern PyTypeObject recoll_QRSDocType;
extern int pys2cpps(PyObject *pyval, std::string& out);
#endif // _PYRECOLL_H_INCLUDED_

View File

@ -0,0 +1,378 @@
/* Copyright (C) 2007-2020 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>
#include <string>
#include <iostream>
#include <set>
#include "qresultstore.h"
#include "pyrecoll.h"
#include "log.h"
#include "rclutil.h"
using namespace std;
#if PY_MAJOR_VERSION >=3
# define Py_TPFLAGS_HAVE_ITER 0
#else
#define PyLong_FromLong PyInt_FromLong
#endif
struct recoll_QRSDocObject;
// Instance data for the _recoll.QResultStore Python type.
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
// C++ result store. Allocated in QResultStore_new() and deleted in
// QResultStore_dealloc().
Rcl::QResultStore *store;
} recoll_QResultStoreObject;
// tp_dealloc for QResultStore: destroy the C++ store, then hand the
// Python object memory back to the type's allocator.
static void
QResultStore_dealloc(recoll_QResultStoreObject *self)
{
    LOGDEB1("QResultStore_dealloc.\n");
    Rcl::QResultStore *owned = self->store;
    delete owned;
    PyObject *pyself = (PyObject*)self;
    Py_TYPE(pyself)->tp_free(pyself);
}
// tp_new for QResultStore: allocate the Python object and attach a fresh,
// empty C++ store to it. Returns 0 if the allocation fails.
static PyObject *
QResultStore_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    LOGDEB1("QResultStore_new\n");
    PyObject *obj = type->tp_alloc(type, 0);
    if (obj != 0) {
        ((recoll_QResultStoreObject *)obj)->store = new Rcl::QResultStore();
    }
    return obj;
}
// Docstring for the QResultStore type (used as tp_doc in
// recoll_QResultStoreType).
PyDoc_STRVAR(qrs_doc_QResultStoreObject,
"QResultStore()\n"
"\n"
"A QResultStore can efficiently store query result documents.\n"
);
// tp_init for QResultStore. The store itself is created in
// QResultStore_new(), so there is nothing to do here: the arguments are
// ignored and the call always succeeds.
static int
QResultStore_init(recoll_QResultStoreObject *self,
                  PyObject *args, PyObject *kwargs)
{
    LOGDEB("QResultStore_init\n");
    const int status = 0;
    return status;
}
PyDoc_STRVAR(
qrs_doc_storeQuery,
"storeQuery(query, fieldspec=[], isinc=False)\n"
"\n"
"Stores the results from the input query object, possibly "
"excluding/including the specified fields.\n"
);
static PyObject *
QResultStore_storeQuery(recoll_QResultStoreObject* self, PyObject *args,
PyObject *kwargs)
{
LOGDEB0("QResultStore_storeQuery\n");
static const char* kwlist[] = {"query", "fieldspec", "isinc", NULL};
PyObject *q{nullptr};
PyObject *fieldspec{nullptr};
PyObject *isinco = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OO", (char**)kwlist,
&recoll_QueryType, &q, &fieldspec, &isinco))
return nullptr;
recoll_QueryObject *query = (recoll_QueryObject*)q;
if (nullptr == query->query) {
PyErr_SetString(PyExc_ValueError,
"query not initialised (null query ?)");
return nullptr;
}
bool isinc{false};
if (nullptr != isinco && PyObject_IsTrue(isinco))
isinc = true;
std::set<std::string> fldspec;
if (nullptr != fieldspec) {
// fieldspec must be either single string or list of strings
if (PyUnicode_Check(fieldspec)) {
PyObject *utf8o = PyUnicode_AsUTF8String(fieldspec);
if (nullptr == utf8o) {
PyErr_SetString(PyExc_AttributeError,
"storeQuery: can't encode field name??");
return nullptr;
}
fldspec.insert(PyBytes_AsString(utf8o));
Py_DECREF(utf8o);
} else if (PySequence_Check(fieldspec)) {
for (Py_ssize_t i = 0; i < PySequence_Size(fieldspec); i++) {
PyObject *utf8o =
PyUnicode_AsUTF8String(PySequence_GetItem(fieldspec, i));
if (nullptr == utf8o) {
PyErr_SetString(PyExc_AttributeError,
"storeQuery: can't encode field name??");
return nullptr;
}
fldspec.insert(PyBytes_AsString(utf8o));
Py_DECREF(utf8o);
}
} else {
PyErr_SetString(PyExc_TypeError,
"fieldspec arg must be str or sequence of str");
return nullptr;
}
}
self->store->storeQuery(*(query->query), fldspec, isinc);
Py_RETURN_NONE;
}
// Docstring for QResultStore.getField() (referenced from
// QResultStore_methods).
PyDoc_STRVAR(
    qrs_doc_getField,
    "getField(index, fieldname)\n"
    "\n"
    "Retrieve the value of field <fieldname> from result at index <index>.\n"
    );
// QResultStore.getField(index, fieldname)
//
// Look up one field of one stored result. Returns the value as a bytes
// object, or None when the store has no value for this index/field
// (fieldValue() returned a null pointer). Returns NULL with an exception
// set if the arguments are not (int, str).
static PyObject *
QResultStore_getField(recoll_QResultStoreObject* self, PyObject *args)
{
    int docidx;
    const char *fldname;
    if (!PyArg_ParseTuple(args, "is", &docidx, &fldname))
        return nullptr;

    const char *fldval = self->store->fieldValue(docidx, fldname);
    if (fldval != nullptr)
        return PyBytes_FromString(fldval);
    Py_RETURN_NONE;
}
// Method table for the QResultStore type: storeQuery() accepts positional
// and keyword arguments, getField() positional arguments only.
static PyMethodDef QResultStore_methods[] = {
{"storeQuery", (PyCFunction)QResultStore_storeQuery,
METH_VARARGS|METH_KEYWORDS, qrs_doc_storeQuery},
{"getField", (PyCFunction)QResultStore_getField,
METH_VARARGS, qrs_doc_getField},
{NULL} /* Sentinel */
};
// sq_length slot: number of results currently held by the store.
static Py_ssize_t QResultStore_Size(PyObject *o)
{
    recoll_QResultStoreObject *pystore = (recoll_QResultStoreObject*)o;
    return pystore->store->getCount();
}
// sq_item slot: store[i] builds a QRSDoc object referencing this store and
// the requested index.
//
// An out-of-range index raises IndexError. Returning NULL without setting an
// exception would be reported by the interpreter as a SystemError, and
// iterating over the store relies on IndexError to detect the end.
// Returns a new reference, or NULL with an exception set.
static PyObject* QResultStore_GetItem(PyObject *o, Py_ssize_t i)
{
    if (i < 0 || i >= ((recoll_QResultStoreObject*)o)->store->getCount()) {
        PyErr_SetString(PyExc_IndexError, "QResultStore index out of range");
        return nullptr;
    }
    // The "i" format unit reads a C int from the argument list, so convert
    // explicitly. The range check above guarantees that the value fits
    // (getCount() returns an int).
    PyObject *args = Py_BuildValue("Oi", o, static_cast<int>(i));
    if (nullptr == args) {
        return nullptr;
    }
    PyObject *res = PyObject_CallObject((PyObject *)&recoll_QRSDocType, args);
    Py_DECREF(args);
    return res;
}
// Sequence protocol for QResultStore: only length (len(store)) and integer
// indexing (store[i]) are implemented, all other slots are left empty.
static PySequenceMethods resultstore_as_sequence = {
(lenfunc)QResultStore_Size, // sq_length
(binaryfunc)0, // sq_concat
(ssizeargfunc)0, // sq_repeat
(ssizeargfunc)QResultStore_GetItem, // sq_item
0, // was sq_slice
(ssizeobjargproc)0, // sq_ass_item
0, // was sq_ass_slice
(objobjproc)0, // sq_contains
(binaryfunc)0, // sq_inplace_concat
(ssizeargfunc)0, // sq_inplace_repeat
};
// Type object for _recoll.QResultStore. Not static: it is declared extern
// in pyrecoll.h and registered by the module init function.
// The type provides the sequence protocol (resultstore_as_sequence), the
// storeQuery()/getField() methods, and can be subclassed
// (Py_TPFLAGS_BASETYPE). The initializer is positional: entries must stay
// in PyTypeObject member order.
PyTypeObject recoll_QResultStoreType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QResultStore", /*tp_name*/
sizeof(recoll_QResultStoreObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QResultStore_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
&resultstore_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QResultStoreObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QResultStore_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QResultStore_init, /* tp_init */
0, /* tp_alloc */
QResultStore_new, /* tp_new */
};
////////////////////////////////////////////////////////////////////////
// QRSDoc: accessor object for a single result held by a QResultStore
// Instance data for the _recoll.QRSDoc Python type.
typedef struct recoll_QRSDocObject {
PyObject_HEAD
/* Type-specific fields go here. */
// The store Python object. QRSDoc_init() takes a reference on it, which
// QRSDoc_dealloc() releases.
recoll_QResultStoreObject *pystore;
// Result index, passed to the store's fieldValue() by QRSDoc_subscript().
int index;
} recoll_QRSDocObject;
// tp_dealloc for QRSDoc: release the reference on the store object and
// free the Python object.
//
// pystore is only set by QRSDoc_init(). If init failed or never ran
// (e.g. bad constructor arguments) it is still NULL, so it must be
// released with Py_XDECREF, which accepts NULL.
static void
QRSDoc_dealloc(recoll_QRSDocObject *self)
{
    LOGDEB1("QRSDoc_dealloc\n");
    Py_XDECREF((PyObject*)self->pystore);
    self->pystore = nullptr;
    Py_TYPE(self)->tp_free((PyObject*)self);
}
// tp_new for QRSDoc: plain allocation through the type's allocator. The
// members are set later by QRSDoc_init(). Returns 0 if allocation fails.
static PyObject *
QRSDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *obj = type->tp_alloc(type, 0);
    return obj;
}
// Docstring for the QRSDoc type (used as tp_doc in recoll_QRSDocType).
PyDoc_STRVAR(qrs_doc_QRSDocObject,
"QRSDoc(resultstore, index)\n"
"\n"
"A QRSDoc gives access to one result from a qresultstore.\n"
);
// tp_init for QRSDoc: QRSDoc(resultstore, index).
//
// Checks that the first argument is a QResultStore instance, then records
// a new reference to it along with the result index.
// Returns 0 on success, -1 with an exception set on bad arguments.
static int
QRSDoc_init(
    recoll_QRSDocObject *self, PyObject *args, PyObject *kwargs)
{
    recoll_QResultStoreObject *pystore;
    int index;
    if (!PyArg_ParseTuple(args, "O!i",
                          &recoll_QResultStoreType, &pystore, &index)) {
        return -1;
    }

    Py_INCREF(pystore);
    // __init__ can be called again on an existing object: release the
    // store reference held from a previous call instead of leaking it.
    // The member is updated before the old reference is dropped so that
    // the object never points to a released store.
    recoll_QResultStoreObject *previous = self->pystore;
    self->pystore = pystore;
    self->index = index;
    Py_XDECREF((PyObject*)previous);
    return 0;
}
// mp_subscript slot: qrsdoc[fieldname].
//
// Returns the field value for this object's result index as a str (decoded
// from UTF-8, undecodable bytes being backslash-escaped), or None if the
// store has no value for the field. The "url" field goes through
// printableUrl() before decoding.
// Returns NULL with an exception set if the object was not initialised or
// if the key is neither str nor bytes.
static PyObject *
QRSDoc_subscript(recoll_QRSDocObject *self, PyObject *key)
{
    if (self->pystore == 0) {
        PyErr_SetString(PyExc_AttributeError, "store??");
        return NULL;
    }
    string name;
    if (pys2cpps(key, name) < 0) {
        // An error return must be NULL: returning None with an exception
        // pending violates the C API contract.
        PyErr_SetString(PyExc_AttributeError, "name??");
        return NULL;
    }

    const char *value = self->pystore->store->fieldValue(self->index, name);
    if (nullptr == value) {
        Py_RETURN_NONE;
    }
    string urlstring;
    if (name == "url") {
        printableUrl("UTF-8", value, urlstring);
        value = urlstring.c_str();
    }
    // Decode directly from the C string: no intermediate bytes object to
    // allocate, check and release.
    return PyUnicode_Decode(
        value,
        static_cast<Py_ssize_t>(std::char_traits<char>::length(value)),
        "UTF-8", "backslashreplace");
}
// Mapping protocol for QRSDoc: read access by field name only
// (qrsdoc[name]); no length and no item assignment.
static PyMappingMethods qrsdoc_as_mapping = {
(lenfunc)0, /*mp_length*/
(binaryfunc)QRSDoc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/
};
// QRSDoc defines no methods of its own: the table only holds the sentinel.
static PyMethodDef QRSDoc_methods[] = {
{NULL} /* Sentinel */
};
// Type object for _recoll.QRSDoc. Not static: it is declared extern in
// pyrecoll.h, registered by the module init function, and instantiated by
// QResultStore_GetItem().
// The type provides the mapping protocol (qrsdoc_as_mapping) and can be
// subclassed (Py_TPFLAGS_BASETYPE). The initializer is positional: entries
// must stay in PyTypeObject member order.
PyTypeObject recoll_QRSDocType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QRSDoc", /*tp_name*/
sizeof(recoll_QRSDocObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QRSDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
&qrsdoc_as_mapping, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QRSDocObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QRSDoc_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QRSDoc_init, /* tp_init */
0, /* tp_alloc */
QRSDoc_new, /* tp_new */
};

View File

@ -0,0 +1,23 @@
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# We used to have two C extensions: recoll and rclextract, which was a really
# bad idea. They are now merged into the _recoll C extension module. The two
# python modules recoll.py and rclextract.py only exist for compatibility (for
# now: maybe we'll do something with them in the future).
from ._recoll import QResultStore, QRSDoc

View File

@ -38,13 +38,15 @@ module1 = Extension('_recoll',
os.path.join(top, 'internfile'),
os.path.join(top, 'rcldb'),
os.path.join(top, 'query'),
os.path.join(top, 'unac')
os.path.join(top, 'unac'),
os.path.join(top, 'testmains')
],
extra_compile_args = extra_compile_args,
libraries = libraries,
library_dirs = library_dirs,
runtime_library_dirs = runtime_library_dirs,
sources = [os.path.join(pytop, 'pyrecoll.cpp'),
os.path.join(pytop, 'pyresultstore.cpp'),
os.path.join(pytop, 'pyrclextract.cpp')
])

View File

@ -32,15 +32,17 @@
#include "rcldoc.h"
#include "rclquery.h"
namespace Rcl {
class QResultStore::Internal {
public:
bool testentry(const std::pair<std::string,std::string>& entry) {
return !entry.second.empty() &&
excludedfields.find(entry.first) == excludedfields.end();
(isinc ? fieldspec.find(entry.first) != fieldspec.end() :
fieldspec.find(entry.first) == fieldspec.end());
}
std::map<std::string, int> keyidx;
int ndocs{0};
// Notes: offsets[0] is always 0, not really useful, simpler this
// way. Also could use simple C array instead of c++ vector...
struct docoffs {
@ -51,7 +53,8 @@ public:
std::vector<int> offsets;
};
std::vector<struct docoffs> docs;
std::set<std::string> excludedfields;
std::set<std::string> fieldspec;
bool isinc{false};
};
QResultStore::QResultStore()
@ -63,14 +66,17 @@ QResultStore::~QResultStore()
delete m;
}
//{"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption",
// "filename", "origcharset", "sig"};
// For reference : Fields normally excluded by uprcl:
// {"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption",
// "filename", "origcharset", "sig"};
bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> fldspec,
bool isinc)
{
m->excludedfields = excl;
m->fieldspec = fldspec;
m->isinc = isinc;
/////////////
// Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it.
@ -81,10 +87,11 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
{"fbytes", 4},
{"dbytes", 5}
};
m->ndocs = 0;
for (;;m->ndocs++) {
int count = 0;
for (;;count++) {
Rcl::Doc doc;
if (!query.getDoc(m->ndocs, doc, false)) {
if (!query.getDoc(count, doc, false)) {
break;
}
for (const auto& entry : doc.meta) {
@ -101,9 +108,9 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
///////
// Populate the main array with doc-equivalent structures.
m->docs.resize(m->ndocs);
m->docs.resize(count);
for (int i = 0; i < m->ndocs; i++) {
for (int i = 0; i < count; i++) {
Rcl::Doc doc;
if (!query.getDoc(i, doc, false)) {
break;
@ -164,24 +171,34 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
STRINGCPCOPY(cp, entry.second);
}
}
// Point all empty entries to the final null byte
for (unsigned int i = 1; i < vdoc.offsets.size(); i++) {
if (vdoc.offsets[i] == 0) {
vdoc.offsets[i] = cp - 1 - vdoc.base;
}
}
}
return true;
}
const char *QResultStore::fieldvalue(int docindex, const std::string& fldname)
int QResultStore::getCount()
{
if (docindex < 0 || docindex >= m->ndocs) {
return int(m->docs.size());
}
const char *QResultStore::fieldValue(int docindex, const std::string& fldname)
{
if (docindex < 0 || docindex >= int(m->docs.size())) {
return nullptr;
}
auto& vdoc = m->docs[docindex];
auto it = m->keyidx.find(fldname);
if (it == m->keyidx.end()) {
return nullptr;
}
if (it->second < 0 || it->second >= int(vdoc.offsets.size())) {
//??
if (it == m->keyidx.end() ||
it->second < 0 || it->second >= int(vdoc.offsets.size())) {
return nullptr;
}
return vdoc.base + vdoc.offsets[it->second];
}
} // namespace Rcl

View File

@ -17,24 +17,49 @@
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef _QRESULTSTORE_H_INCLUDED_
#define _QRESULTSTORE_H_INCLUDED_
/**
* Implement an efficient way to store the whole or part of a query result set.
* This would naturally be done as a vector<Rcl::Doc>, but the natural
* way leads to a huge space waste (8-10x), which may be a problem in
* some cases. This is mostly used by the uprcl Media Server.
*/
#include <string>
#include <set>
namespace Rcl {
class Query;
}
class QResultStore {
public:
QResultStore();
~QResultStore();
bool storeQuery(Rcl::Query& q, std::set<std::string> excluded = {});
const char *fieldvalue(int docindex, const std::string& fldname);
/**
* Fetch and store the results of the input query.
*
* @param q the executed query object to use for fetching results.
* @param fldspec list of fields to be excluded or included.
* @param isinc if true, the field list defines the fields to be stored,
* else, those to be excluded.
*/
bool storeQuery(Rcl::Query& q, std::set<std::string> fldspec = {},
bool isinc = false);
/** Retrieve count of stored results */
int getCount();
/**
* Retrieve field value.
*
* @param docindex index in query results.
* @param fldname field name.
*/
const char *fieldValue(int docindex, const std::string& fldname);
QResultStore(const QResultStore&) = delete;
QResultStore& operator=(const QResultStore&) = delete;
@ -43,4 +68,5 @@ private:
Internal *m{nullptr};
};
}
#endif /* _QRESULTSTORE_H_INCLUDED_ */

View File

@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
$(DEFS)
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata unac mbox \
circache wipedir mimetype pathut fileudi x11mon rclqdocmem
circache wipedir mimetype pathut fileudi x11mon trqrstore
circache_SOURCES = trcircache.cpp
circache_LDADD = ../librecoll.la
@ -80,5 +80,5 @@ wipedir_LDADD = ../librecoll.la
x11mon_SOURCES = trx11mon.cpp
x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11
rclqdocmem_SOURCES = rclqdocmem.cpp qresultstore.cpp
rclqdocmem_LDADD = ../librecoll.la
trqrstore_SOURCES = trqrstore.cpp
trqrstore_LDADD = ../librecoll.la

View File

@ -306,8 +306,7 @@ int main(int argc, char *argv[])
// This uses 19 MB of storage for the audio index, and 72 MB for
// the main one (less keys->less gain)
{
#if 1
QResultStore store;
Rcl::QResultStore store;
bool result = store.storeQuery(
query, {"author", "ipath", "rcludi", "relevancyrating",
"sig","abstract", "caption", "filename", "origcharset", "sig"});
@ -316,121 +315,7 @@ int main(int argc, char *argv[])
return 1;
}
meminfo("After storing");
std::cerr << "url 20 " << store.fieldvalue(20, "url") << "\n";
#else
/////////////
// Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it.
std::map<std::string, int> keyidx {
{"url",0},
{"mimetype", 1},
{"fmtime", 2},
{"dmtime", 3},
{"fbytes", 4},
{"dbytes", 5},
};
int ndocs = 0;
for (;;ndocs++) {
Rcl::Doc doc;
if (!query.getDoc(ndocs, doc, false)) {
break;
}
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
if (it == keyidx.end()) {
int idx = keyidx.size();
keyidx.insert({entry.first, idx});
};
}
}
}
// The audio db has 49 keys !
std::cerr << "Found " << keyidx.size() << " different keys\n";
///////
// Populate the main array with doc-equivalent structures.
// Notes: offsets[0] is always 0, not really useful, simpler this way. Also
// could use simple C array instead of c++ vector...
struct docoffs {
~docoffs() {
free(base);
}
char *base{nullptr};
std::vector<int> offsets;
};
std::vector<struct docoffs> docs;
docs.resize(ndocs);
meminfo("After resize");
for (int i = 0; i < ndocs; i++) {
Rcl::Doc doc;
if (!query.getDoc(i, doc, false)) {
break;
}
auto& vdoc = docs[i];
vdoc.offsets.resize(keyidx.size());
int nbytes =
doc.url.size() + 1 +
doc.mimetype.size() + 1 +
doc.fmtime.size() + 1 +
doc.dmtime.size() + 1 +
doc.fbytes.size() + 1 +
doc.dbytes.size() + 1;
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
if (keyidx.find(entry.first) == keyidx.end()) {
std::cerr << "Unknown key: " << entry.first << "\n";
abort();
}
nbytes += entry.second.size() + 1;
}
}
char *cp = (char*)malloc(nbytes);
if (nullptr == cp) {
abort();
}
#define STRINGCPCOPY(CHARP, S) do { \
memcpy(CHARP, S.c_str(), S.size()+1); \
CHARP += S.size()+1; \
} while (false);
vdoc.base = cp;
vdoc.offsets[0] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.url);
vdoc.offsets[1] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.mimetype);
vdoc.offsets[2] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fmtime);
vdoc.offsets[3] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dmtime);
vdoc.offsets[4] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fbytes);
vdoc.offsets[5] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dbytes);
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
if (it == keyidx.end()) {
std::cerr << "Unknown key: " << entry.first << "\n";
abort();
}
if (it->second <= 5) {
// Already done ! Storing another address would be
// wasteful and crash when freeing...
continue;
}
vdoc.offsets[it->second] = cp - vdoc.base;
STRINGCPCOPY(cp, entry.second);
}
}
}
meminfo("After storing");
#endif
std::cerr << "url 20 " << store.fieldValue(20, "url") << "\n";
}
#elif defined(STORE_ALLOBSTACK)