Merge branch 'resultstore'

This commit is contained in:
Jean-Francois Dockes 2020-12-26 12:23:08 +01:00
commit 1fc5e0db1d
10 changed files with 628 additions and 271 deletions

View File

@ -156,6 +156,8 @@ query/filtseq.cpp \
query/filtseq.h \ query/filtseq.h \
query/plaintorich.cpp \ query/plaintorich.cpp \
query/plaintorich.h \ query/plaintorich.h \
query/qresultstore.cpp \
query/qresultstore.h \
query/recollq.cpp \ query/recollq.cpp \
query/recollq.h \ query/recollq.h \
query/reslistpager.cpp \ query/reslistpager.cpp \

View File

@ -327,11 +327,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
return 0; return 0;
} }
PyDoc_STRVAR(doc_Doc_getbinurl, PyDoc_STRVAR(
"getbinurl(none) -> binary url\n" doc_Doc_getbinurl,
"\n" "getbinurl(none) -> binary url\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n" "\n"
"file system path, without encoding\n" "Returns an URL with a path part which is a as bit for bit copy of the \n"
"file system path, without encoding\n"
); );
static PyObject * static PyObject *
@ -339,17 +340,18 @@ Doc_getbinurl(recoll_DocObject *self)
{ {
LOGDEB0("Doc_getbinurl\n"); LOGDEB0("Doc_getbinurl\n");
if (self->doc == 0) { if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc"); PyErr_SetString(PyExc_AttributeError, "doc is NULL");
return 0; Py_RETURN_NONE;
} }
return PyBytes_FromStringAndSize(self->doc->url.c_str(), return PyBytes_FromStringAndSize(self->doc->url.c_str(),
self->doc->url.size()); self->doc->url.size());
} }
PyDoc_STRVAR(doc_Doc_setbinurl, PyDoc_STRVAR(
"setbinurl(url) -> binary url\n" doc_Doc_setbinurl,
"\n" "setbinurl(url) -> binary url\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n" "\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
); );
static PyObject * static PyObject *
@ -367,6 +369,8 @@ Doc_setbinurl(recoll_DocObject *self, PyObject *value)
self->doc->url = string(PyByteArray_AsString(value), self->doc->url = string(PyByteArray_AsString(value),
PyByteArray_Size(value)); PyByteArray_Size(value));
printableUrl(self->rclconfig->getDefCharset(), self->doc->url,
self->doc->meta[Rcl::Doc::keyurl]);
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -387,7 +391,7 @@ Doc_keys(recoll_DocObject *self)
return 0; return 0;
for (const auto& entry : self->doc->meta) { for (const auto& entry : self->doc->meta) {
PyList_Append(pkeys, PyList_Append(pkeys,
PyUnicode_Decode(entry.first.c_str(),entry.first.size(), PyUnicode_Decode(entry.first.c_str(), entry.first.size(),
"UTF-8", "replace")); "UTF-8", "replace"));
} }
return pkeys; return pkeys;
@ -537,6 +541,23 @@ static PyMethodDef Doc_methods[] = {
{NULL} /* Sentinel */ {NULL} /* Sentinel */
}; };
// Convert a Python str (unicode) or bytes object to a C++ string.
//
// @param pyval the Python value. May be NULL: the attribute/item setters
//        receive a NULL value when Python code deletes an attribute or item.
// @param out receives the UTF-8 encoding of a str, or the raw content of a
//        bytes object. Only modified on success.
// @return 0 on success, -1 if pyval is NULL, is neither str nor bytes, or
//         can't be encoded. A Python exception may or may not be pending on
//         failure: callers set their own.
int pys2cpps(PyObject *pyval, std::string& out)
{
    // The type-check macros below dereference their argument.
    if (pyval == nullptr) {
        return -1;
    }
    char *data = nullptr;
    Py_ssize_t len = 0;
    if (PyUnicode_Check(pyval)) {
        PyObject* utf8o = PyUnicode_AsUTF8String(pyval);
        if (utf8o == nullptr) {
            return -1;
        }
        // Copy with an explicit length so that embedded null bytes do not
        // truncate the value.
        int ret = PyBytes_AsStringAndSize(utf8o, &data, &len);
        if (ret == 0) {
            out.assign(data, len);
        }
        Py_DECREF(utf8o);
        return ret == 0 ? 0 : -1;
    } else if (PyBytes_Check(pyval)) {
        if (PyBytes_AsStringAndSize(pyval, &data, &len) < 0) {
            return -1;
        }
        out.assign(data, len);
    } else {
        return -1;
    }
    return 0;
}
// Note that this returns None if the attribute is not found instead of raising // Note that this returns None if the attribute is not found instead of raising
// an exception as would be standard. We don't change it to keep existing code // an exception as would be standard. We don't change it to keep existing code
// working. // working.
@ -560,18 +581,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
PyErr_Clear(); PyErr_Clear();
string name; string name;
if (PyUnicode_Check(nameobj)) { if (pys2cpps(nameobj, name) < 0) {
PyObject* utf8o = PyUnicode_AsUTF8String(nameobj);
if (utf8o == 0) {
LOGERR("Doc_getattro: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(nameobj)) {
name = PyBytes_AsString(nameobj);
} else {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??"); PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -588,7 +598,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
} }
static int static int
Doc_setattr(recoll_DocObject *self, char *name, PyObject *value) Doc_setattro(recoll_DocObject *self, PyObject *nameobj, PyObject *value)
{ {
if (self->doc == 0) { if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??"); PyErr_SetString(PyExc_AttributeError, "doc??");
@ -599,84 +609,78 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
"Configuration not initialized"); "Configuration not initialized");
return -1; return -1;
} }
if (name == 0) { string name;
PyErr_SetString(PyExc_AttributeError, "name??"); if (pys2cpps(nameobj, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
return -1; return -1;
} }
if (PyBytes_Check(value)) { string uvalue;
value = PyUnicode_FromEncodedObject(value, "UTF-8", "strict"); if (pys2cpps(value, uvalue) < 0) {
if (value == 0) PyErr_SetString(PyExc_AttributeError, "value neither bytes nor str");
return -1;
}
if (!PyUnicode_Check(value)) {
PyErr_SetString(PyExc_AttributeError, "value not unicode??");
return -1; return -1;
} }
PyObject* putf8 = PyUnicode_AsUTF8String(value);
if (putf8 == 0) {
LOGERR("Doc_setmeta: encoding to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "value??");
return -1;
}
string uvalue = PyBytes_AsString(putf8);
Py_DECREF(putf8);
string key = self->rclconfig->fieldQCanon(name); string key = self->rclconfig->fieldQCanon(name);
LOGDEB0("Doc_setattr: doc " << self->doc << " [" << key << "] (" << name << LOGDEB0("Doc_setattr: doc " << self->doc << " [" << key << "] (" << name <<
") -> [" << uvalue << "]\n"); ") -> [" << uvalue << "]\n");
// We set the value in the meta array in all cases. Good idea ? or do it // Note that some attributes are set both as struct fields and
// only for fields without a dedicated Doc:: entry? // meta members, keep compat with movedocfields() used when
self->doc->meta[key] = uvalue; // fetching from query.
switch (key.at(0)) { switch (key.at(0)) {
case 't': case 't':
if (!key.compare("text")) { if (key == "text") {
self->doc->text.swap(uvalue); self->doc->text.swap(uvalue);
} }
break; break;
case 'u': case 'u':
if (!key.compare(Rcl::Doc::keyurl)) { if (key == Rcl::Doc::keyurl) {
self->doc->url.swap(uvalue); self->doc->url.swap(uvalue);
printableUrl(self->rclconfig->getDefCharset(), self->doc->url,
self->doc->meta[Rcl::Doc::keyurl]);
} }
break; break;
case 'f': case 'f':
if (!key.compare(Rcl::Doc::keyfs)) { if (key == Rcl::Doc::keyfs) {
self->doc->fbytes.swap(uvalue); self->doc->fbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keyfmt)) { self->doc->meta[Rcl::Doc::keyfs] = self->doc->fbytes;
} else if (key == Rcl::Doc::keyfmt) {
self->doc->fmtime.swap(uvalue); self->doc->fmtime.swap(uvalue);
} }
break; break;
case 'd': case 'd':
if (!key.compare(Rcl::Doc::keyds)) { if (key == Rcl::Doc::keyds) {
self->doc->dbytes.swap(uvalue); self->doc->dbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keydmt)) { self->doc->meta[Rcl::Doc::keyds] = self->doc->dbytes;
} else if (key == Rcl::Doc::keydmt) {
self->doc->dmtime.swap(uvalue); self->doc->dmtime.swap(uvalue);
} }
break; break;
case 'i': case 'i':
if (!key.compare(Rcl::Doc::keyipt)) { if (key == Rcl::Doc::keyipt) {
self->doc->ipath.swap(uvalue); self->doc->ipath.swap(uvalue);
self->doc->meta[Rcl::Doc::keyipt] = self->doc->ipath;
} }
break; break;
case 'm': case 'm':
if (!key.compare(Rcl::Doc::keytp)) { if (key == Rcl::Doc::keytp) {
self->doc->mimetype.swap(uvalue); self->doc->mimetype.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keymt)) { self->doc->meta[Rcl::Doc::keytp] = self->doc->mimetype;
} else if (key == Rcl::Doc::keymt) {
self->doc->dmtime.swap(uvalue); self->doc->dmtime.swap(uvalue);
} }
break; break;
case 'o': case 'o':
if (!key.compare(Rcl::Doc::keyoc)) { if (key == Rcl::Doc::keyoc) {
self->doc->origcharset.swap(uvalue); self->doc->origcharset.swap(uvalue);
} }
break; break;
case 's': case 's':
if (!key.compare(Rcl::Doc::keysig)) { if (key == Rcl::Doc::keysig) {
self->doc->sig.swap(uvalue); self->doc->sig.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keysz)) { } else if (key == Rcl::Doc::keysz) {
self->doc->dbytes.swap(uvalue); self->doc->dbytes.swap(uvalue);
} }
break; break;
@ -697,6 +701,7 @@ Doc_length(recoll_DocObject *self)
static PyObject * static PyObject *
Doc_subscript(recoll_DocObject *self, PyObject *key) Doc_subscript(recoll_DocObject *self, PyObject *key)
{ {
// Can't just return getattro because this first checks for a method name
if (self->doc == 0) { if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??"); PyErr_SetString(PyExc_AttributeError, "doc??");
return NULL; return NULL;
@ -707,18 +712,7 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
return NULL; return NULL;
} }
string name; string name;
if (PyUnicode_Check(key)) { if (pys2cpps(key, name) < 0) {
PyObject* utf8o = PyUnicode_AsUTF8String(key);
if (utf8o == 0) {
LOGERR("Doc_getitemo: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(key)) {
name = PyBytes_AsString(key);
} else {
PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??"); PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??");
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -726,54 +720,61 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
string skey = self->rclconfig->fieldQCanon(name); string skey = self->rclconfig->fieldQCanon(name);
string value; string value;
if (idocget(self, skey, value)) { if (idocget(self, skey, value)) {
return PyUnicode_Decode(value.c_str(), value.size(), "UTF-8","replace"); return PyUnicode_Decode(value.c_str(), value.size(),
"UTF-8", "backslashreplace");
} }
Py_RETURN_NONE; Py_RETURN_NONE;
} }
// mp_ass_subscript slot for Doc objects: doc[key] = val is handled exactly
// like attribute assignment.
// Returns 0 on success, -1 with a Python exception set on failure.
static int
Doc_ass_subscript(recoll_DocObject *self, PyObject *key, PyObject *val)
{
    // Python calls this slot with a NULL value for "del doc[key]". The
    // setter expects a str or bytes value, so reject deletion explicitly
    // instead of forwarding a NULL pointer.
    if (val == NULL) {
        PyErr_SetString(PyExc_TypeError, "Doc items can't be deleted");
        return -1;
    }
    return Doc_setattro(self, key, val);
}
static PyMappingMethods doc_as_mapping = { static PyMappingMethods doc_as_mapping = {
(lenfunc)Doc_length, /*mp_length*/ (lenfunc)Doc_length, /*mp_length*/
(binaryfunc)Doc_subscript, /*mp_subscript*/ (binaryfunc)Doc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/ (objobjargproc)Doc_ass_subscript, /*mp_ass_subscript*/
}; };
PyDoc_STRVAR(doc_DocObject, PyDoc_STRVAR(
"Doc()\n" doc_DocObject,
"\n" "Doc()\n"
"A Doc object contains index data for a given document.\n" "\n"
"The data is extracted from the index when searching, or set by the\n" "A Doc object contains index data for a given document.\n"
"indexer program when updating. The Doc object has no useful methods but\n" "The data is extracted from the index when searching, or set by the\n"
"many attributes to be read or set by its user. It matches exactly the\n" "indexer program when updating. The Doc object has no useful methods but\n"
"Rcl::Doc c++ object. Some of the attributes are predefined, but, \n" "many attributes to be read or set by its user. It matches exactly the\n"
"especially when indexing, others can be set, the name of which will be\n" "Rcl::Doc c++ object. Some of the attributes are predefined, but, \n"
"processed as field names by the indexing configuration.\n" "especially when indexing, others can be set, the name of which will be\n"
"Inputs can be specified as unicode or strings.\n" "processed as field names by the indexing configuration.\n"
"Outputs are unicode objects.\n" "Inputs can be specified as unicode or strings.\n"
"All dates are specified as unix timestamps, printed as strings\n" "Outputs are unicode objects.\n"
"Predefined attributes (index/query/both):\n" "All dates are specified as unix timestamps, printed as strings\n"
" text (index): document plain text\n" "Predefined attributes (index/query/both):\n"
" url (both)\n" " text (index): document plain text\n"
" fbytes (both) optional) file size in bytes\n" " url (both)\n"
" filename (both)\n" " fbytes (both) optional) file size in bytes\n"
" fmtime (both) optional file modification date. Unix time printed \n" " filename (both)\n"
" as string\n" " fmtime (both) optional file modification date. Unix time printed \n"
" dbytes (both) document text bytes\n" " as string\n"
" dmtime (both) document creation/modification date\n" " dbytes (both) document text bytes\n"
" ipath (both) value private to the app.: internal access path\n" " dmtime (both) document creation/modification date\n"
" inside file\n" " ipath (both) value private to the app.: internal access path\n"
" mtype (both) mime type for original document\n" " inside file\n"
" mtime (query) dmtime if set else fmtime\n" " mtype (both) mime type for original document\n"
" origcharset (both) charset the text was converted from\n" " mtime (query) dmtime if set else fmtime\n"
" size (query) dbytes if set, else fbytes\n" " origcharset (both) charset the text was converted from\n"
" sig (both) app-defined file modification signature. \n" " size (query) dbytes if set, else fbytes\n"
" For up to date checks\n" " sig (both) app-defined file modification signature. \n"
" relevancyrating (query)\n" " For up to date checks\n"
" abstract (both)\n" " relevancyrating (query)\n"
" author (both)\n" " abstract (both)\n"
" title (both)\n" " author (both)\n"
" keywords (both)\n" " title (both)\n"
" keywords (both)\n"
); );
PyTypeObject recoll_DocType = { PyTypeObject recoll_DocType = {
@ -784,7 +785,7 @@ PyTypeObject recoll_DocType = {
(destructor)Doc_dealloc, /*tp_dealloc*/ (destructor)Doc_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
0, /*tp_getattr*/ 0, /*tp_getattr*/
(setattrfunc)Doc_setattr, /*tp_setattr*/ 0, /*tp_setattr*/
0, /*tp_compare*/ 0, /*tp_compare*/
0, /*tp_repr*/ 0, /*tp_repr*/
0, /*tp_as_number*/ 0, /*tp_as_number*/
@ -794,7 +795,7 @@ PyTypeObject recoll_DocType = {
0, /*tp_call*/ 0, /*tp_call*/
0, /*tp_str*/ 0, /*tp_str*/
(getattrofunc)Doc_getattro,/*tp_getattro*/ (getattrofunc)Doc_getattro,/*tp_getattro*/
0, /*tp_setattro*/ (setattrofunc)Doc_setattro,/*tp_setattro*/
0, /*tp_as_buffer*/ 0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/ Py_TPFLAGS_DEFAULT, /*tp_flags*/
doc_DocObject, /* tp_doc */ doc_DocObject, /* tp_doc */
@ -828,19 +829,6 @@ typedef struct recoll_DbObject {
std::shared_ptr<RclConfig> rclconfig; std::shared_ptr<RclConfig> rclconfig;
} recoll_DbObject; } recoll_DbObject;
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
int rowcount; // Number of records returned by last execute
string *sortfield; // Need to allocate in here, main program is C.
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
PyDoc_STRVAR(doc_Query_close, PyDoc_STRVAR(doc_Query_close,
"close(). Deallocate query. Object is unusable after the call." "close(). Deallocate query. Object is unusable after the call."
); );
@ -1521,7 +1509,7 @@ PyDoc_STRVAR(doc_QueryObject,
"Recoll Query objects are used to execute index searches. \n" "Recoll Query objects are used to execute index searches. \n"
"They must be created by the Db.query() method.\n" "They must be created by the Db.query() method.\n"
); );
static PyTypeObject recoll_QueryType = { PyTypeObject recoll_QueryType = {
PyVarObject_HEAD_INIT(NULL, 0) PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.Query", /*tp_name*/ "_recoll.Query", /*tp_name*/
sizeof(recoll_QueryObject), /*tp_basicsize*/ sizeof(recoll_QueryObject), /*tp_basicsize*/
@ -2195,6 +2183,17 @@ PyInit__recoll(void)
Py_INCREF(&rclx_ExtractorType); Py_INCREF(&rclx_ExtractorType);
PyModule_AddObject(module, "Extractor", (PyObject *)&rclx_ExtractorType); PyModule_AddObject(module, "Extractor", (PyObject *)&rclx_ExtractorType);
if (PyType_Ready(&recoll_QResultStoreType) < 0)
INITERROR;
Py_INCREF(&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore", (PyObject *)&recoll_QResultStoreType);
if (PyType_Ready(&recoll_QRSDocType) < 0)
INITERROR;
Py_INCREF((PyObject*)&recoll_QRSDocType);
PyModule_AddObject(module, "QRSDoc",
(PyObject *)&recoll_QRSDocType);
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
return module; return module;
#endif #endif

View File

@ -22,8 +22,13 @@
#include <Python.h> #include <Python.h>
#include <memory> #include <memory>
#include <string>
class RclConfig; class RclConfig;
namespace Rcl {
class Doc;
class Query;
};
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
@ -33,7 +38,27 @@ typedef struct {
std::shared_ptr<RclConfig> rclconfig; std::shared_ptr<RclConfig> rclconfig;
} recoll_DocObject; } recoll_DocObject;
extern PyTypeObject rclx_ExtractorType; struct recoll_DbObject;
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
int rowcount; // Number of records returned by last execute
std::string *sortfield; // Need to allocate in here, main program is C.
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
extern PyTypeObject recoll_DocType; extern PyTypeObject recoll_DocType;
extern PyTypeObject recoll_QueryType;
extern PyTypeObject rclx_ExtractorType;
extern PyTypeObject recoll_QResultStoreType;
extern PyTypeObject recoll_QRSDocType;
extern int pys2cpps(PyObject *pyval, std::string& out);
#endif // _PYRECOLL_H_INCLUDED_ #endif // _PYRECOLL_H_INCLUDED_

View File

@ -0,0 +1,378 @@
/* Copyright (C) 2007-2020 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>
#include <string>
#include <iostream>
#include <set>
#include "qresultstore.h"
#include "pyrecoll.h"
#include "log.h"
#include "rclutil.h"
using namespace std;
// Python 2/3 compatibility definitions:
//  - Python 3 and later: Py_TPFLAGS_HAVE_ITER is defined as 0, so that
//    or-ing it into a flags value is a no-op.
//  - Earlier versions: PyLong_FromLong is mapped to PyInt_FromLong.
// NOTE(review): neither name is referenced in the visible part of this
// file - confirm whether they are still needed.
#if PY_MAJOR_VERSION >=3
# define Py_TPFLAGS_HAVE_ITER 0
#else
#define PyLong_FromLong PyInt_FromLong
#endif
struct recoll_QRSDocObject;
// Instance layout for the Python QResultStore type: the standard object
// header followed by a pointer to the C++ store, which is allocated in
// QResultStore_new() and deleted in QResultStore_dealloc().
struct recoll_QResultStoreObject {
    PyObject_HEAD
    Rcl::QResultStore *store;
};
// tp_dealloc slot: destroy the C++ store, then hand the Python object memory
// back through the type's tp_free.
static void
QResultStore_dealloc(recoll_QResultStoreObject *self)
{
    LOGDEB1("QResultStore_dealloc.\n");
    Rcl::QResultStore *owned = self->store;
    delete owned;
    PyObject *pyself = (PyObject*)self;
    Py_TYPE(pyself)->tp_free(pyself);
}
// tp_new slot: allocate the Python object and attach a fresh, empty C++
// result store to it. Returns NULL if the allocation fails.
static PyObject *
QResultStore_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    LOGDEB1("QResultStore_new\n");
    PyObject *obj = type->tp_alloc(type, 0);
    if (obj != 0) {
        ((recoll_QResultStoreObject *)obj)->store = new Rcl::QResultStore();
    }
    return obj;
}
// Python docstring for the QResultStore type (used as tp_doc in
// recoll_QResultStoreType).
PyDoc_STRVAR(qrs_doc_QResultStoreObject,
"QResultStore()\n"
"\n"
"A QResultStore can efficiently store query result documents.\n"
);
// tp_init slot: nothing to set up here, the store itself is created in
// QResultStore_new(). Arguments are ignored. Always returns 0.
static int
QResultStore_init(recoll_QResultStoreObject *self, PyObject *args,
                  PyObject *kwargs)
{
    (void)self;
    (void)args;
    (void)kwargs;
    LOGDEB("QResultStore_init\n");
    return 0;
}
// Python docstring for QResultStore.storeQuery() (referenced from
// QResultStore_methods).
PyDoc_STRVAR(
qrs_doc_storeQuery,
"storeQuery(query, fieldspec=[], isinc=False)\n"
"\n"
"Stores the results from the input query object, possibly "
"excluding/including the specified fields.\n"
);
// QResultStore.storeQuery(query, fieldspec=[], isinc=False)
//
// Copy the results of an executed Query object into the store.
//  - query: must be a _recoll.Query instance holding a non-null query.
//  - fieldspec: optional, a single str or a sequence of str field names.
//  - isinc: optional, evaluated for truth and passed, with the field set,
//    to Rcl::QResultStore::storeQuery().
// Returns None, or NULL with a Python exception set on error.
static PyObject *
QResultStore_storeQuery(recoll_QResultStoreObject* self, PyObject *args,
                        PyObject *kwargs)
{
    LOGDEB0("QResultStore_storeQuery\n");
    static const char* kwlist[] = {"query", "fieldspec", "isinc", NULL};
    PyObject *q{nullptr};
    PyObject *fieldspec{nullptr};
    PyObject *isinco{nullptr};
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OO", (char**)kwlist,
                                     &recoll_QueryType, &q, &fieldspec,
                                     &isinco))
        return nullptr;

    recoll_QueryObject *query = (recoll_QueryObject*)q;
    if (nullptr == query->query) {
        PyErr_SetString(PyExc_ValueError,
                        "query not initialised (null query ?)");
        return nullptr;
    }

    bool isinc{false};
    if (nullptr != isinco) {
        // PyObject_IsTrue() returns -1 with an exception set on failure:
        // propagate it instead of treating it as "true".
        int truth = PyObject_IsTrue(isinco);
        if (truth < 0) {
            return nullptr;
        }
        isinc = (truth != 0);
    }

    std::set<std::string> fldspec;
    if (nullptr != fieldspec) {
        // fieldspec must be either single string or list of strings
        if (PyUnicode_Check(fieldspec)) {
            PyObject *utf8o = PyUnicode_AsUTF8String(fieldspec);
            if (nullptr == utf8o) {
                PyErr_SetString(PyExc_AttributeError,
                                "storeQuery: can't encode field name??");
                return nullptr;
            }
            fldspec.insert(PyBytes_AsString(utf8o));
            Py_DECREF(utf8o);
        } else if (PySequence_Check(fieldspec)) {
            Py_ssize_t cnt = PySequence_Size(fieldspec);
            if (cnt < 0) {
                // Exception already set by PySequence_Size()
                return nullptr;
            }
            for (Py_ssize_t i = 0; i < cnt; i++) {
                // PySequence_GetItem() returns a new reference (or NULL
                // with an exception set): it must be checked and released.
                PyObject *item = PySequence_GetItem(fieldspec, i);
                if (nullptr == item) {
                    return nullptr;
                }
                PyObject *utf8o = PyUnicode_AsUTF8String(item);
                Py_DECREF(item);
                if (nullptr == utf8o) {
                    PyErr_SetString(PyExc_AttributeError,
                                    "storeQuery: can't encode field name??");
                    return nullptr;
                }
                fldspec.insert(PyBytes_AsString(utf8o));
                Py_DECREF(utf8o);
            }
        } else {
            PyErr_SetString(PyExc_TypeError,
                            "fieldspec arg must be str or sequence of str");
            return nullptr;
        }
    }
    self->store->storeQuery(*(query->query), fldspec, isinc);
    Py_RETURN_NONE;
}
// Python docstring for QResultStore.getField() (referenced from
// QResultStore_methods).
PyDoc_STRVAR(
qrs_doc_getField,
"getField(index, fieldname)\n"
"\n"
"Retrieve the value of field <fieldname> from result at index <index>.\n"
);
// QResultStore.getField(index, fieldname)
//
// Look up one field of one stored result. Returns the value as a bytes
// object, or None when the store returns a null pointer for this
// index/field pair. Returns NULL (exception set) if the arguments can't be
// parsed as (int, str).
static PyObject *
QResultStore_getField(recoll_QResultStoreObject* self, PyObject *args)
{
    int docidx;
    const char *fld;
    if (!PyArg_ParseTuple(args, "is", &docidx, &fld))
        return nullptr;

    const char *val = self->store->fieldValue(docidx, fld);
    if (val != nullptr)
        return PyBytes_FromString(val);
    Py_RETURN_NONE;
}
// Method table for the QResultStore type: storeQuery() accepts positional
// and keyword arguments, getField() positional arguments only. The table is
// terminated by a null sentinel entry.
static PyMethodDef QResultStore_methods[] = {
{"storeQuery", (PyCFunction)QResultStore_storeQuery,
METH_VARARGS|METH_KEYWORDS, qrs_doc_storeQuery},
{"getField", (PyCFunction)QResultStore_getField,
METH_VARARGS, qrs_doc_getField},
{NULL} /* Sentinel */
};
// sq_length slot: number of results currently held by the store.
static Py_ssize_t QResultStore_Size(PyObject *o)
{
    recoll_QResultStoreObject *rs = (recoll_QResultStoreObject*)o;
    return rs->store->getCount();
}
// sq_item slot: return a new QRSDoc object giving access to result number i.
//
// An out of range index raises IndexError: this is what terminates
// iteration through the sequence protocol (returning NULL without setting
// an exception would result in a SystemError).
// Returns a new reference, or NULL with an exception set.
static PyObject* QResultStore_GetItem(PyObject *o, Py_ssize_t i)
{
    if (i < 0 || i >= ((recoll_QResultStoreObject*)o)->store->getCount()) {
        PyErr_SetString(PyExc_IndexError, "QResultStore index out of range");
        return nullptr;
    }
    // "n" is the format unit for Py_ssize_t. Passing a Py_ssize_t for "i"
    // (C int) is a varargs type mismatch.
    PyObject *args = Py_BuildValue("On", o, i);
    if (nullptr == args) {
        return nullptr;
    }
    PyObject *res = PyObject_CallObject((PyObject *)&recoll_QRSDocType, args);
    Py_DECREF(args);
    return res;
}
// Sequence protocol for QResultStore: only the length (sq_length) and
// integer indexing (sq_item) slots are implemented, all other slots are
// left null.
static PySequenceMethods resultstore_as_sequence = {
(lenfunc)QResultStore_Size, // sq_length
(binaryfunc)0, // sq_concat
(ssizeargfunc)0, // sq_repeat
(ssizeargfunc)QResultStore_GetItem, // sq_item
0, // was sq_slice
(ssizeobjargproc)0, // sq_ass_item
0, // was sq_ass_slice
(objobjproc)0, // sq_contains
(binaryfunc)0, // sq_inplace_concat
(ssizeargfunc)0, // sq_inplace_repeat
};
// Python type object for _recoll.QResultStore. Not static: it is referenced
// from other parts of the extension. The type
// supports the sequence protocol (resultstore_as_sequence), exposes the
// methods from QResultStore_methods, and can be subclassed
// (Py_TPFLAGS_BASETYPE). Slots are positional: the trailing comments name
// each one.
PyTypeObject recoll_QResultStoreType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QResultStore", /*tp_name*/
sizeof(recoll_QResultStoreObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QResultStore_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
&resultstore_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QResultStoreObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QResultStore_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QResultStore_init, /* tp_init */
0, /* tp_alloc */
QResultStore_new, /* tp_new */
};
////////////////////////////////////////////////////////////////////////
// QRSDoc: accessor for a single result held by a QResultStore.
//
// Instance layout: the standard object header, a reference to the owning
// Python store object (taken in QRSDoc_init(), released in
// QRSDoc_dealloc()) and the index of the result inside that store.
struct recoll_QRSDocObject {
    PyObject_HEAD
    recoll_QResultStoreObject *pystore;
    int index;
};
// tp_dealloc slot: release the reference on the store object, then free the
// Python object memory.
static void
QRSDoc_dealloc(recoll_QRSDocObject *self)
{
    LOGDEB1("QRSDoc_dealloc\n");
    // pystore is only set by a successful QRSDoc_init(): it is still null if
    // __init__ failed (bad arguments) or was never run, so the null-safe
    // form must be used here.
    Py_XDECREF(self->pystore);
    Py_TYPE(self)->tp_free((PyObject*)self);
}
// tp_new slot: allocate a bare QRSDoc object. The store reference and index
// are set later by QRSDoc_init(). Returns NULL if the allocation fails.
static PyObject *
QRSDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *obj = type->tp_alloc(type, 0);
    return obj;
}
// Python docstring for the QRSDoc type (used as tp_doc in
// recoll_QRSDocType).
PyDoc_STRVAR(qrs_doc_QRSDocObject,
"QRSDoc(resultstore, index)\n"
"\n"
"A QRSDoc gives access to one result from a qresultstore.\n"
);
// tp_init slot: QRSDoc(resultstore, index).
//
// Takes a new reference on the QResultStore object and records the result
// index. The index is not range-checked here.
// Returns 0 on success, -1 (exception set) if the arguments are not a
// QResultStore instance followed by an int.
static int
QRSDoc_init(
    recoll_QRSDocObject *self, PyObject *args, PyObject *kwargs)
{
    recoll_QResultStoreObject *pystore;
    int index;
    if (!PyArg_ParseTuple(args, "O!i",
                          &recoll_QResultStoreType, &pystore, &index)) {
        return -1;
    }

    Py_INCREF(pystore);
    // __init__ can be called again on an initialized object: release the
    // store reference taken by the previous call instead of leaking it.
    // NOTE(review): this relies on tp_alloc zero-filling the object so that
    // pystore is null before the first call - true for the default
    // allocator, confirm if a custom tp_alloc is ever installed.
    recoll_QResultStoreObject *previous = self->pystore;
    self->pystore = pystore;
    self->index = index;
    Py_XDECREF(previous);
    return 0;
}
// mp_subscript slot: qrsdoc[fieldname].
//
// Returns the value of the named field for this result as a str, decoded
// from UTF-8 with the "backslashreplace" error handler, or None if the
// store returns no value for the field. The "url" field is passed through
// printableUrl() before decoding.
// Returns NULL with an exception set if the object has no store, if the key
// is neither str nor bytes, or if building the result fails.
static PyObject *
QRSDoc_subscript(recoll_QRSDocObject *self, PyObject *key)
{
    if (self->pystore == 0) {
        PyErr_SetString(PyExc_AttributeError, "store??");
        return NULL;
    }
    string name;
    if (pys2cpps(key, name) < 0) {
        // An exception must be reported by returning NULL: returning a
        // value while an exception is pending is a C API contract violation.
        PyErr_SetString(PyExc_AttributeError, "name??");
        return NULL;
    }

    const char *value = self->pystore->store->fieldValue(self->index, name);
    if (nullptr == value) {
        Py_RETURN_NONE;
    }
    // urlstring must outlive the use of value below, as value may point
    // into it.
    string urlstring;
    if (name == "url") {
        printableUrl("UTF-8", value, urlstring);
        value = urlstring.c_str();
    }
    PyObject *bytes = PyBytes_FromString(value);
    if (nullptr == bytes) {
        // Allocation failure, exception already set
        return nullptr;
    }
    PyObject *u =
        PyUnicode_FromEncodedObject(bytes, "UTF-8", "backslashreplace");
    Py_DECREF(bytes);
    return u;
}
// Mapping protocol for QRSDoc: read-only item access by field name. No
// length and no item assignment are provided.
static PyMappingMethods qrsdoc_as_mapping = {
(lenfunc)0, /*mp_length*/
(binaryfunc)QRSDoc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/
};
// Method table for QRSDoc: no methods, only the terminating sentinel.
// Access to the data is through the mapping protocol (qrsdoc_as_mapping).
static PyMethodDef QRSDoc_methods[] = {
{NULL} /* Sentinel */
};
// Python type object for _recoll.QRSDoc. Not static: it is referenced from
// other parts of the extension. The type supports the
// mapping protocol (qrsdoc_as_mapping), has an empty method table, and can
// be subclassed (Py_TPFLAGS_BASETYPE). Slots are positional: the trailing
// comments name each one.
PyTypeObject recoll_QRSDocType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QRSDoc", /*tp_name*/
sizeof(recoll_QRSDocObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QRSDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
&qrsdoc_as_mapping, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QRSDocObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QRSDoc_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QRSDoc_init, /* tp_init */
0, /* tp_alloc */
QRSDoc_new, /* tp_new */
};

View File

@ -0,0 +1,23 @@
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# We used to have two C extensions: recoll and rclextract, which was a really
# bad idea. They are now merged into the _recoll C extension module. The two
# python modules recoll.py and rclextract.py only exist for compatibility (for
# now: maybe we'll do something with them in the future).
from ._recoll import QResultStore, QRSDoc

View File

@ -38,13 +38,15 @@ module1 = Extension('_recoll',
os.path.join(top, 'internfile'), os.path.join(top, 'internfile'),
os.path.join(top, 'rcldb'), os.path.join(top, 'rcldb'),
os.path.join(top, 'query'), os.path.join(top, 'query'),
os.path.join(top, 'unac') os.path.join(top, 'unac'),
os.path.join(top, 'testmains')
], ],
extra_compile_args = extra_compile_args, extra_compile_args = extra_compile_args,
libraries = libraries, libraries = libraries,
library_dirs = library_dirs, library_dirs = library_dirs,
runtime_library_dirs = runtime_library_dirs, runtime_library_dirs = runtime_library_dirs,
sources = [os.path.join(pytop, 'pyrecoll.cpp'), sources = [os.path.join(pytop, 'pyrecoll.cpp'),
os.path.join(pytop, 'pyresultstore.cpp'),
os.path.join(pytop, 'pyrclextract.cpp') os.path.join(pytop, 'pyrclextract.cpp')
]) ])

View File

@ -32,15 +32,17 @@
#include "rcldoc.h" #include "rcldoc.h"
#include "rclquery.h" #include "rclquery.h"
namespace Rcl {
class QResultStore::Internal { class QResultStore::Internal {
public: public:
bool testentry(const std::pair<std::string,std::string>& entry) { bool testentry(const std::pair<std::string,std::string>& entry) {
return !entry.second.empty() && return !entry.second.empty() &&
excludedfields.find(entry.first) == excludedfields.end(); (isinc ? fieldspec.find(entry.first) != fieldspec.end() :
fieldspec.find(entry.first) == fieldspec.end());
} }
std::map<std::string, int> keyidx; std::map<std::string, int> keyidx;
int ndocs{0};
// Notes: offsets[0] is always 0, not really useful, simpler this // Notes: offsets[0] is always 0, not really useful, simpler this
// way. Also could use simple C array instead of c++ vector... // way. Also could use simple C array instead of c++ vector...
struct docoffs { struct docoffs {
@ -51,7 +53,8 @@ public:
std::vector<int> offsets; std::vector<int> offsets;
}; };
std::vector<struct docoffs> docs; std::vector<struct docoffs> docs;
std::set<std::string> excludedfields; std::set<std::string> fieldspec;
bool isinc{false};
}; };
QResultStore::QResultStore() QResultStore::QResultStore()
@ -63,14 +66,17 @@ QResultStore::~QResultStore()
delete m; delete m;
} }
// For reference : Fields normally excluded by uprcl:
//{"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption", // {"author", "ipath", "rcludi", "relevancyrating", "sig", "abstract", "caption",
// "filename", "origcharset", "sig"}; // "filename", "origcharset", "sig"};
bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl) bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> fldspec,
bool isinc)
{ {
m->excludedfields = excl; m->fieldspec = fldspec;
m->isinc = isinc;
///////////// /////////////
// Enumerate all existing keys and assign array indexes for // Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it. // them. Count documents while we are at it.
@ -81,10 +87,11 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
{"fbytes", 4}, {"fbytes", 4},
{"dbytes", 5} {"dbytes", 5}
}; };
m->ndocs = 0;
for (;;m->ndocs++) { int count = 0;
for (;;count++) {
Rcl::Doc doc; Rcl::Doc doc;
if (!query.getDoc(m->ndocs, doc, false)) { if (!query.getDoc(count, doc, false)) {
break; break;
} }
for (const auto& entry : doc.meta) { for (const auto& entry : doc.meta) {
@ -101,9 +108,9 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
/////// ///////
// Populate the main array with doc-equivalent structures. // Populate the main array with doc-equivalent structures.
m->docs.resize(m->ndocs); m->docs.resize(count);
for (int i = 0; i < m->ndocs; i++) { for (int i = 0; i < count; i++) {
Rcl::Doc doc; Rcl::Doc doc;
if (!query.getDoc(i, doc, false)) { if (!query.getDoc(i, doc, false)) {
break; break;
@ -164,24 +171,34 @@ bool QResultStore::storeQuery(Rcl::Query& query, std::set<std::string> excl)
STRINGCPCOPY(cp, entry.second); STRINGCPCOPY(cp, entry.second);
} }
} }
// Point all empty entries to the final null byte
for (unsigned int i = 1; i < vdoc.offsets.size(); i++) {
if (vdoc.offsets[i] == 0) {
vdoc.offsets[i] = cp - 1 - vdoc.base;
}
}
} }
return true; return true;
} }
const char *QResultStore::fieldvalue(int docindex, const std::string& fldname) int QResultStore::getCount()
{ {
if (docindex < 0 || docindex >= m->ndocs) { return int(m->docs.size());
}
const char *QResultStore::fieldValue(int docindex, const std::string& fldname)
{
if (docindex < 0 || docindex >= int(m->docs.size())) {
return nullptr; return nullptr;
} }
auto& vdoc = m->docs[docindex]; auto& vdoc = m->docs[docindex];
auto it = m->keyidx.find(fldname); auto it = m->keyidx.find(fldname);
if (it == m->keyidx.end()) { if (it == m->keyidx.end() ||
return nullptr; it->second < 0 || it->second >= int(vdoc.offsets.size())) {
}
if (it->second < 0 || it->second >= int(vdoc.offsets.size())) {
//??
return nullptr; return nullptr;
} }
return vdoc.base + vdoc.offsets[it->second]; return vdoc.base + vdoc.offsets[it->second];
} }
} // namespace Rcl

View File

@ -17,24 +17,49 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/ */
#ifndef _QRESULTSTORE_H_INCLUDED_ #ifndef _QRESULTSTORE_H_INCLUDED_
#define _QRESULTSTORE_H_INCLUDED_ #define _QRESULTSTORE_H_INCLUDED_
/**
* Implement an efficient way to store the whole or part of a query result set.
* This would naturally be done as a vector<Rcl::Doc>, but the natural
* way leads to a huge space waste (8-10x), which may be a problem in
* some cases. This is mostly used by the uprcl Media Server.
*/
#include <string> #include <string>
#include <set> #include <set>
namespace Rcl { namespace Rcl {
class Query; class Query;
}
class QResultStore { class QResultStore {
public: public:
QResultStore(); QResultStore();
~QResultStore(); ~QResultStore();
bool storeQuery(Rcl::Query& q, std::set<std::string> excluded = {}); /**
const char *fieldvalue(int docindex, const std::string& fldname); * Fetch and store the results of the input query.
*
* @param q the executed query object to use for fetching results.
* @param fldspec list of fields to be excluded or included.
* @param isinc if true, the field list defines the fields to be stored,
* else, those to be excluded.
*/
bool storeQuery(Rcl::Query& q, std::set<std::string> fldspec = {},
bool isinc = false);
/** Retrieve count of stored results */
int getCount();
/**
* Retrieve field value.
*
* @param docindex index in query results.
* @param fldname field name.
*/
const char *fieldValue(int docindex, const std::string& fldname);
QResultStore(const QResultStore&) = delete; QResultStore(const QResultStore&) = delete;
QResultStore& operator=(const QResultStore&) = delete; QResultStore& operator=(const QResultStore&) = delete;
@ -43,4 +68,5 @@ private:
Internal *m{nullptr}; Internal *m{nullptr};
}; };
}
#endif /* _QRESULTSTORE_H_INCLUDED_ */ #endif /* _QRESULTSTORE_H_INCLUDED_ */

View File

@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
$(DEFS) $(DEFS)
noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata unac mbox \ noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata unac mbox \
circache wipedir mimetype pathut fileudi x11mon rclqdocmem circache wipedir mimetype pathut fileudi x11mon trqrstore
circache_SOURCES = trcircache.cpp circache_SOURCES = trcircache.cpp
circache_LDADD = ../librecoll.la circache_LDADD = ../librecoll.la
@ -80,5 +80,5 @@ wipedir_LDADD = ../librecoll.la
x11mon_SOURCES = trx11mon.cpp x11mon_SOURCES = trx11mon.cpp
x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11 x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11
rclqdocmem_SOURCES = rclqdocmem.cpp qresultstore.cpp trqrstore_SOURCES = trqrstore.cpp
rclqdocmem_LDADD = ../librecoll.la trqrstore_LDADD = ../librecoll.la

View File

@ -306,8 +306,7 @@ int main(int argc, char *argv[])
// This uses 19 MB of storage for the audio index, and 72 MB for // This uses 19 MB of storage for the audio index, and 72 MB for
// the main one (less keys->less gain) // the main one (less keys->less gain)
{ {
#if 1 Rcl::QResultStore store;
QResultStore store;
bool result = store.storeQuery( bool result = store.storeQuery(
query, {"author", "ipath", "rcludi", "relevancyrating", query, {"author", "ipath", "rcludi", "relevancyrating",
"sig","abstract", "caption", "filename", "origcharset", "sig"}); "sig","abstract", "caption", "filename", "origcharset", "sig"});
@ -316,121 +315,7 @@ int main(int argc, char *argv[])
return 1; return 1;
} }
meminfo("After storing"); meminfo("After storing");
std::cerr << "url 20 " << store.fieldvalue(20, "url") << "\n"; std::cerr << "url 20 " << store.fieldValue(20, "url") << "\n";
#else
/////////////
// Enumerate all existing keys and assign array indexes for
// them. Count documents while we are at it.
std::map<std::string, int> keyidx {
{"url",0},
{"mimetype", 1},
{"fmtime", 2},
{"dmtime", 3},
{"fbytes", 4},
{"dbytes", 5},
};
int ndocs = 0;
for (;;ndocs++) {
Rcl::Doc doc;
if (!query.getDoc(ndocs, doc, false)) {
break;
}
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
if (it == keyidx.end()) {
int idx = keyidx.size();
keyidx.insert({entry.first, idx});
};
}
}
}
// The audio db has 49 keys !
std::cerr << "Found " << keyidx.size() << " different keys\n";
///////
// Populate the main array with doc-equivalent structures.
// Notes: offsets[0] is always 0, not really useful, simpler this way. Also
// could use simple C array instead of c++ vector...
struct docoffs {
~docoffs() {
free(base);
}
char *base{nullptr};
std::vector<int> offsets;
};
std::vector<struct docoffs> docs;
docs.resize(ndocs);
meminfo("After resize");
for (int i = 0; i < ndocs; i++) {
Rcl::Doc doc;
if (!query.getDoc(i, doc, false)) {
break;
}
auto& vdoc = docs[i];
vdoc.offsets.resize(keyidx.size());
int nbytes =
doc.url.size() + 1 +
doc.mimetype.size() + 1 +
doc.fmtime.size() + 1 +
doc.dmtime.size() + 1 +
doc.fbytes.size() + 1 +
doc.dbytes.size() + 1;
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
if (keyidx.find(entry.first) == keyidx.end()) {
std::cerr << "Unknown key: " << entry.first << "\n";
abort();
}
nbytes += entry.second.size() + 1;
}
}
char *cp = (char*)malloc(nbytes);
if (nullptr == cp) {
abort();
}
#define STRINGCPCOPY(CHARP, S) do { \
memcpy(CHARP, S.c_str(), S.size()+1); \
CHARP += S.size()+1; \
} while (false);
vdoc.base = cp;
vdoc.offsets[0] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.url);
vdoc.offsets[1] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.mimetype);
vdoc.offsets[2] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fmtime);
vdoc.offsets[3] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dmtime);
vdoc.offsets[4] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.fbytes);
vdoc.offsets[5] = cp - vdoc.base;
STRINGCPCOPY(cp, doc.dbytes);
for (const auto& entry : doc.meta) {
if (testentry(entry)) {
auto it = keyidx.find(entry.first);
if (it == keyidx.end()) {
std::cerr << "Unknown key: " << entry.first << "\n";
abort();
}
if (it->second <= 5) {
// Already done ! Storing another address would be
// wasteful and crash when freeing...
continue;
}
vdoc.offsets[it->second] = cp - vdoc.base;
STRINGCPCOPY(cp, entry.second);
}
}
}
meminfo("After storing");
#endif
} }
#elif defined(STORE_ALLOBSTACK) #elif defined(STORE_ALLOBSTACK)