Implement doc_ass_subscript, cleanup the Doc interface. Connect the qresultstore to the python module

This commit is contained in:
Jean-Francois Dockes 2020-12-18 11:20:39 +01:00
parent ea9b5ab9eb
commit 22f62216d2
4 changed files with 242 additions and 197 deletions

View File

@ -327,11 +327,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
return 0;
}
PyDoc_STRVAR(doc_Doc_getbinurl,
"getbinurl(none) -> binary url\n"
"\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n"
"file system path, without encoding\n"
PyDoc_STRVAR(
doc_Doc_getbinurl,
"getbinurl(none) -> binary url\n"
"\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n"
"file system path, without encoding\n"
);
static PyObject *
@ -339,17 +340,18 @@ Doc_getbinurl(recoll_DocObject *self)
{
LOGDEB0("Doc_getbinurl\n");
if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc");
return 0;
PyErr_SetString(PyExc_AttributeError, "doc is NULL");
Py_RETURN_NONE;
}
return PyBytes_FromStringAndSize(self->doc->url.c_str(),
self->doc->url.size());
self->doc->url.size());
}
PyDoc_STRVAR(doc_Doc_setbinurl,
"setbinurl(url) -> binary url\n"
"\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
PyDoc_STRVAR(
doc_Doc_setbinurl,
"setbinurl(url) -> binary url\n"
"\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
);
static PyObject *
@ -387,7 +389,7 @@ Doc_keys(recoll_DocObject *self)
return 0;
for (const auto& entry : self->doc->meta) {
PyList_Append(pkeys,
PyUnicode_Decode(entry.first.c_str(),entry.first.size(),
PyUnicode_Decode(entry.first.c_str(), entry.first.size(),
"UTF-8", "replace"));
}
return pkeys;
@ -537,6 +539,23 @@ static PyMethodDef Doc_methods[] = {
{NULL} /* Sentinel */
};
static int pys2cpps(PyObject *pyval, std::string& out)
{
if (PyUnicode_Check(pyval)) {
PyObject* utf8o = PyUnicode_AsUTF8String(pyval);
if (utf8o == 0) {
return -1;
}
out = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(pyval)) {
out = PyBytes_AsString(pyval);
} else {
return -1;
}
return 0;
}
// Note that this returns None if the attribute is not found instead of raising
// an exception as would be standard. We don't change it to keep existing code
// working.
@ -560,18 +579,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
PyErr_Clear();
string name;
if (PyUnicode_Check(nameobj)) {
PyObject* utf8o = PyUnicode_AsUTF8String(nameobj);
if (utf8o == 0) {
LOGERR("Doc_getattro: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(nameobj)) {
name = PyBytes_AsString(nameobj);
} else {
if (pys2cpps(nameobj, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
Py_RETURN_NONE;
}
@ -588,7 +596,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
}
static int
Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
Doc_setattro(recoll_DocObject *self, PyObject *nameobj, PyObject *value)
{
if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??");
@ -599,84 +607,78 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
"Configuration not initialized");
return -1;
}
if (name == 0) {
PyErr_SetString(PyExc_AttributeError, "name??");
string name;
if (pys2cpps(nameobj, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
return -1;
}
if (PyBytes_Check(value)) {
value = PyUnicode_FromEncodedObject(value, "UTF-8", "strict");
if (value == 0)
return -1;
}
if (!PyUnicode_Check(value)) {
PyErr_SetString(PyExc_AttributeError, "value not unicode??");
string uvalue;
if (pys2cpps(value, uvalue) < 0) {
PyErr_SetString(PyExc_AttributeError, "value neither bytes nor str");
return -1;
}
PyObject* putf8 = PyUnicode_AsUTF8String(value);
if (putf8 == 0) {
LOGERR("Doc_setmeta: encoding to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "value??");
return -1;
}
string uvalue = PyBytes_AsString(putf8);
Py_DECREF(putf8);
string key = self->rclconfig->fieldQCanon(name);
LOGDEB0("Doc_setattr: doc " << self->doc << " [" << key << "] (" << name <<
") -> [" << uvalue << "]\n");
// We set the value in the meta array in all cases. Good idea ? or do it
// only for fields without a dedicated Doc:: entry?
self->doc->meta[key] = uvalue;
// Note that some attributes are set both as struct fields and
// meta members, keep compat with movedocfields() used when
// fetching from query.
switch (key.at(0)) {
case 't':
if (!key.compare("text")) {
if (key == "text") {
self->doc->text.swap(uvalue);
}
break;
case 'u':
if (!key.compare(Rcl::Doc::keyurl)) {
if (key == Rcl::Doc::keyurl) {
self->doc->url.swap(uvalue);
printableUrl(self->rclconfig->getDefCharset(), self->doc->url,
self->doc->meta[Rcl::Doc::keyurl]);
}
break;
case 'f':
if (!key.compare(Rcl::Doc::keyfs)) {
if (key == Rcl::Doc::keyfs) {
self->doc->fbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keyfmt)) {
self->doc->meta[Rcl::Doc::keyfs] = self->doc->fbytes;
} else if (key == Rcl::Doc::keyfmt) {
self->doc->fmtime.swap(uvalue);
}
break;
case 'd':
if (!key.compare(Rcl::Doc::keyds)) {
if (key == Rcl::Doc::keyds) {
self->doc->dbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keydmt)) {
self->doc->meta[Rcl::Doc::keyds] = self->doc->dbytes;
} else if (key == Rcl::Doc::keydmt) {
self->doc->dmtime.swap(uvalue);
}
break;
case 'i':
if (!key.compare(Rcl::Doc::keyipt)) {
if (key == Rcl::Doc::keyipt) {
self->doc->ipath.swap(uvalue);
self->doc->meta[Rcl::Doc::keyipt] = self->doc->ipath;
}
break;
case 'm':
if (!key.compare(Rcl::Doc::keytp)) {
if (key == Rcl::Doc::keytp) {
self->doc->mimetype.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keymt)) {
self->doc->meta[Rcl::Doc::keytp] = self->doc->mimetype;
} else if (key == Rcl::Doc::keymt) {
self->doc->dmtime.swap(uvalue);
}
break;
case 'o':
if (!key.compare(Rcl::Doc::keyoc)) {
if (key == Rcl::Doc::keyoc) {
self->doc->origcharset.swap(uvalue);
}
break;
case 's':
if (!key.compare(Rcl::Doc::keysig)) {
if (key == Rcl::Doc::keysig) {
self->doc->sig.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keysz)) {
} else if (key == Rcl::Doc::keysz) {
self->doc->dbytes.swap(uvalue);
}
break;
@ -697,6 +699,7 @@ Doc_length(recoll_DocObject *self)
static PyObject *
Doc_subscript(recoll_DocObject *self, PyObject *key)
{
// Can't just return getattro because this first checks for a method name
if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??");
return NULL;
@ -707,18 +710,7 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
return NULL;
}
string name;
if (PyUnicode_Check(key)) {
PyObject* utf8o = PyUnicode_AsUTF8String(key);
if (utf8o == 0) {
LOGERR("Doc_getitemo: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(key)) {
name = PyBytes_AsString(key);
} else {
if (pys2cpps(key, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??");
Py_RETURN_NONE;
}
@ -726,54 +718,61 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
string skey = self->rclconfig->fieldQCanon(name);
string value;
if (idocget(self, skey, value)) {
return PyUnicode_Decode(value.c_str(), value.size(), "UTF-8","replace");
return PyUnicode_Decode(value.c_str(), value.size(),
"UTF-8", "backslashreplace");
}
Py_RETURN_NONE;
}
static int
Doc_ass_subscript(recoll_DocObject *self, PyObject *key, PyObject *val)
{
return Doc_setattro(self, key, val);
}
static PyMappingMethods doc_as_mapping = {
(lenfunc)Doc_length, /*mp_length*/
(binaryfunc)Doc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/
(objobjargproc)Doc_ass_subscript, /*mp_ass_subscript*/
};
PyDoc_STRVAR(doc_DocObject,
"Doc()\n"
"\n"
"A Doc object contains index data for a given document.\n"
"The data is extracted from the index when searching, or set by the\n"
"indexer program when updating. The Doc object has no useful methods but\n"
"many attributes to be read or set by its user. It matches exactly the\n"
"Rcl::Doc c++ object. Some of the attributes are predefined, but, \n"
"especially when indexing, others can be set, the name of which will be\n"
"processed as field names by the indexing configuration.\n"
"Inputs can be specified as unicode or strings.\n"
"Outputs are unicode objects.\n"
"All dates are specified as unix timestamps, printed as strings\n"
"Predefined attributes (index/query/both):\n"
" text (index): document plain text\n"
" url (both)\n"
" fbytes (both) optional) file size in bytes\n"
" filename (both)\n"
" fmtime (both) optional file modification date. Unix time printed \n"
" as string\n"
" dbytes (both) document text bytes\n"
" dmtime (both) document creation/modification date\n"
" ipath (both) value private to the app.: internal access path\n"
" inside file\n"
" mtype (both) mime type for original document\n"
" mtime (query) dmtime if set else fmtime\n"
" origcharset (both) charset the text was converted from\n"
" size (query) dbytes if set, else fbytes\n"
" sig (both) app-defined file modification signature. \n"
" For up to date checks\n"
" relevancyrating (query)\n"
" abstract (both)\n"
" author (both)\n"
" title (both)\n"
" keywords (both)\n"
PyDoc_STRVAR(
doc_DocObject,
"Doc()\n"
"\n"
"A Doc object contains index data for a given document.\n"
"The data is extracted from the index when searching, or set by the\n"
"indexer program when updating. The Doc object has no useful methods but\n"
"many attributes to be read or set by its user. It matches exactly the\n"
"Rcl::Doc c++ object. Some of the attributes are predefined, but, \n"
"especially when indexing, others can be set, the name of which will be\n"
"processed as field names by the indexing configuration.\n"
"Inputs can be specified as unicode or strings.\n"
"Outputs are unicode objects.\n"
"All dates are specified as unix timestamps, printed as strings\n"
"Predefined attributes (index/query/both):\n"
" text (index): document plain text\n"
" url (both)\n"
" fbytes (both) optional) file size in bytes\n"
" filename (both)\n"
" fmtime (both) optional file modification date. Unix time printed \n"
" as string\n"
" dbytes (both) document text bytes\n"
" dmtime (both) document creation/modification date\n"
" ipath (both) value private to the app.: internal access path\n"
" inside file\n"
" mtype (both) mime type for original document\n"
" mtime (query) dmtime if set else fmtime\n"
" origcharset (both) charset the text was converted from\n"
" size (query) dbytes if set, else fbytes\n"
" sig (both) app-defined file modification signature. \n"
" For up to date checks\n"
" relevancyrating (query)\n"
" abstract (both)\n"
" author (both)\n"
" title (both)\n"
" keywords (both)\n"
);
PyTypeObject recoll_DocType = {
@ -784,7 +783,7 @@ PyTypeObject recoll_DocType = {
(destructor)Doc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
(setattrfunc)Doc_setattr, /*tp_setattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
@ -794,7 +793,7 @@ PyTypeObject recoll_DocType = {
0, /*tp_call*/
0, /*tp_str*/
(getattrofunc)Doc_getattro,/*tp_getattro*/
0, /*tp_setattro*/
(setattrofunc)Doc_setattro,/*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
doc_DocObject, /* tp_doc */
@ -2187,6 +2186,11 @@ PyInit__recoll(void)
Py_INCREF(&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore", (PyObject *)&recoll_QResultStoreType);
if (PyType_Ready(&recoll_QRSDocType) < 0)
INITERROR;
Py_INCREF((PyObject*)&recoll_QRSDocType);
PyModule_AddObject(module, "QRSDoc",
(PyObject *)&recoll_QRSDocType);
#if PY_MAJOR_VERSION >= 3
return module;

View File

@ -57,5 +57,6 @@ extern PyTypeObject recoll_DocType;
extern PyTypeObject recoll_QueryType;
extern PyTypeObject rclx_ExtractorType;
extern PyTypeObject recoll_QResultStoreType;
extern PyTypeObject recoll_QRSDocType;
#endif // _PYRECOLL_H_INCLUDED_

View File

@ -231,91 +231,131 @@ PyTypeObject recoll_QResultStoreType = {
QResultStore_new, /* tp_new */
};
////////////////////////////////////////////////////////////////////////
// QRSDoc iterator
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
recoll_QResultStoreObject *pystore;
int index;
} recoll_QRSDocObject;
//////////////////////////////////////////////////////////////////////////
// Module methods
static PyMethodDef rclrstore_methods[] = {
{NULL, NULL, 0, NULL} /* Sentinel */
};
PyDoc_STRVAR(pyrclrstore_doc_string,
"Utility module for efficiently storing many query results.\n");
struct module_state {
PyObject *error;
};
#if PY_MAJOR_VERSION >= 3
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
#if PY_MAJOR_VERSION >= 3
static int rclrstore_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int rclrstore_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
return 0;
}
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_rclrstore",
NULL,
sizeof(struct module_state),
rclrstore_methods,
NULL,
rclrstore_traverse,
rclrstore_clear,
NULL
};
#define INITERROR return NULL
extern "C" PyObject *
PyInit__rclrstore(void)
#else
#define INITERROR return
PyMODINIT_FUNC
init__rclrstore(void)
#endif
static void
QRSDoc_dealloc(recoll_QRSDocObject *self)
{
// Note: we can't call recollinit here, because the confdir is only really
// known when the first db object is created (it is an optional parameter).
// Using a default here may end up with variables such as stripchars being
// wrong
#if PY_MAJOR_VERSION >= 3
PyObject *module = PyModule_Create(&moduledef);
#else
PyObject *module = Py_InitModule("_rclrstore", rclrstore_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
// The first parameter is a char *. Hopefully we don't initialize
// modules too often...
st->error = PyErr_NewException(strdup("_rclrstore.Error"), NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (PyType_Ready(&recoll_QResultStoreType) < 0)
INITERROR;
Py_INCREF((PyObject*)&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore",
(PyObject *)&recoll_QResultStoreType);
PyModule_AddStringConstant(module, "__doc__", pyrclrstore_doc_string);
#if PY_MAJOR_VERSION >= 3
return module;
#endif
Py_DECREF(self->pystore);
Py_TYPE(self)->tp_free((PyObject*)self);
}
static PyObject *
QRSDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
recoll_QRSDocObject *self = (recoll_QRSDocObject *)type->tp_alloc(type, 0);
if (self == 0)
return 0;
return (PyObject *)self;
}
PyDoc_STRVAR(qrs_doc_QRSDocObject,
"QRSDoc(resultstore, index)\n"
"\n"
"A QRSDoc gives access to one result from a qresultstore.\n"
);
static int
QRSDoc_init(
recoll_QRSDocObject *self, PyObject *args, PyObject *kwargs)
{
recoll_QResultStoreObject *pystore;
int index;
if (!PyArg_ParseTuple(args, "O!i",
&recoll_QResultStoreType, &pystore, &index)) {
return -1;
}
Py_INCREF(pystore);
self->pystore = pystore;
self->index = index;
return 0;
}
static PyObject *
QRSDoc_subscript(recoll_QRSDocObject *self, PyObject *key)
{
if (self->pystore == 0) {
PyErr_SetString(PyExc_AttributeError, "store??");
return NULL;
}
string name;
if (PyUnicode_Check(key)) {
PyObject* utf8o = PyUnicode_AsUTF8String(key);
if (utf8o == 0) {
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(key)) {
name = PyBytes_AsString(key);
} else {
PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??");
Py_RETURN_NONE;
}
const char *value = self->pystore->store->fieldValue(self->index, name);
if (nullptr == value) {
Py_RETURN_NONE;
}
return PyBytes_FromString(value);
}
static PyMappingMethods qrsdoc_as_mapping = {
(lenfunc)0, /*mp_length*/
(binaryfunc)QRSDoc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/
};
static PyMethodDef QRSDoc_methods[] = {
{NULL} /* Sentinel */
};
PyTypeObject recoll_QRSDocType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QRSDoc", /*tp_name*/
sizeof(recoll_QRSDocObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QRSDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
&qrsdoc_as_mapping, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QRSDocObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QRSDoc_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QRSDoc_init, /* tp_init */
0, /* tp_alloc */
QRSDoc_new, /* tp_new */
};

View File

@ -20,4 +20,4 @@
# now: maybe we'll do something with them in the future).
from ._recoll import QResultStore
from ._recoll import QResultStore, QRSDoc