Implement Doc_ass_subscript, clean up the Doc interface. Connect the qresultstore to the Python module

This commit is contained in:
Jean-Francois Dockes 2020-12-18 11:20:39 +01:00
parent ea9b5ab9eb
commit 22f62216d2
4 changed files with 242 additions and 197 deletions

View File

@ -327,11 +327,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
return 0; return 0;
} }
PyDoc_STRVAR(doc_Doc_getbinurl, PyDoc_STRVAR(
"getbinurl(none) -> binary url\n" doc_Doc_getbinurl,
"\n" "getbinurl(none) -> binary url\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n" "\n"
"file system path, without encoding\n" "Returns an URL with a path part which is a as bit for bit copy of the \n"
"file system path, without encoding\n"
); );
static PyObject * static PyObject *
@ -339,17 +340,18 @@ Doc_getbinurl(recoll_DocObject *self)
{ {
LOGDEB0("Doc_getbinurl\n"); LOGDEB0("Doc_getbinurl\n");
if (self->doc == 0) { if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc"); PyErr_SetString(PyExc_AttributeError, "doc is NULL");
return 0; Py_RETURN_NONE;
} }
return PyBytes_FromStringAndSize(self->doc->url.c_str(), return PyBytes_FromStringAndSize(self->doc->url.c_str(),
self->doc->url.size()); self->doc->url.size());
} }
PyDoc_STRVAR(doc_Doc_setbinurl, PyDoc_STRVAR(
"setbinurl(url) -> binary url\n" doc_Doc_setbinurl,
"\n" "setbinurl(url) -> binary url\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n" "\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
); );
static PyObject * static PyObject *
@ -387,7 +389,7 @@ Doc_keys(recoll_DocObject *self)
return 0; return 0;
for (const auto& entry : self->doc->meta) { for (const auto& entry : self->doc->meta) {
PyList_Append(pkeys, PyList_Append(pkeys,
PyUnicode_Decode(entry.first.c_str(),entry.first.size(), PyUnicode_Decode(entry.first.c_str(), entry.first.size(),
"UTF-8", "replace")); "UTF-8", "replace"));
} }
return pkeys; return pkeys;
@ -537,6 +539,23 @@ static PyMethodDef Doc_methods[] = {
{NULL} /* Sentinel */ {NULL} /* Sentinel */
}; };
// Convert a Python str or bytes object to a C++ std::string.
//
// str values are encoded as UTF-8, bytes values are copied verbatim. The
// copy uses the explicit byte length, so embedded NUL bytes are preserved
// instead of truncating the value.
//
// Returns 0 on success. Returns -1 if pyval is NULL (this is what the
// interpreter passes as the value for attribute deletion), if it is neither
// str nor bytes, or if the conversion fails. `out` is only modified on
// success. A Python exception is pending only when the failure came from
// the Python API itself; callers are expected to set their own error.
static int pys2cpps(PyObject *pyval, std::string& out)
{
    if (pyval == nullptr) {
        return -1;
    }

    PyObject *bytesobj = nullptr;
    if (PyUnicode_Check(pyval)) {
        // New reference, released below.
        bytesobj = PyUnicode_AsUTF8String(pyval);
        if (bytesobj == nullptr) {
            return -1;
        }
    } else if (PyBytes_Check(pyval)) {
        // Hold our own reference so both branches share the same cleanup.
        bytesobj = pyval;
        Py_INCREF(bytesobj);
    } else {
        return -1;
    }

    char *data = nullptr;
    Py_ssize_t len = 0;
    int status = PyBytes_AsStringAndSize(bytesobj, &data, &len);
    if (status == 0) {
        out.assign(data, static_cast<std::string::size_type>(len));
    }
    Py_DECREF(bytesobj);
    return status == 0 ? 0 : -1;
}
// Note that this returns None if the attribute is not found instead of raising // Note that this returns None if the attribute is not found instead of raising
// an exception as would be standard. We don't change it to keep existing code // an exception as would be standard. We don't change it to keep existing code
// working. // working.
@ -560,18 +579,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
PyErr_Clear(); PyErr_Clear();
string name; string name;
if (PyUnicode_Check(nameobj)) { if (pys2cpps(nameobj, name) < 0) {
PyObject* utf8o = PyUnicode_AsUTF8String(nameobj);
if (utf8o == 0) {
LOGERR("Doc_getattro: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(nameobj)) {
name = PyBytes_AsString(nameobj);
} else {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??"); PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -588,7 +596,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
} }
static int static int
Doc_setattr(recoll_DocObject *self, char *name, PyObject *value) Doc_setattro(recoll_DocObject *self, PyObject *nameobj, PyObject *value)
{ {
if (self->doc == 0) { if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??"); PyErr_SetString(PyExc_AttributeError, "doc??");
@ -599,84 +607,78 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
"Configuration not initialized"); "Configuration not initialized");
return -1; return -1;
} }
if (name == 0) { string name;
PyErr_SetString(PyExc_AttributeError, "name??"); if (pys2cpps(nameobj, name) < 0) {
PyErr_SetString(PyExc_AttributeError, "name not unicode nor string??");
return -1; return -1;
} }
if (PyBytes_Check(value)) { string uvalue;
value = PyUnicode_FromEncodedObject(value, "UTF-8", "strict"); if (pys2cpps(value, uvalue) < 0) {
if (value == 0) PyErr_SetString(PyExc_AttributeError, "value neither bytes nor str");
return -1;
}
if (!PyUnicode_Check(value)) {
PyErr_SetString(PyExc_AttributeError, "value not unicode??");
return -1; return -1;
} }
PyObject* putf8 = PyUnicode_AsUTF8String(value);
if (putf8 == 0) {
LOGERR("Doc_setmeta: encoding to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "value??");
return -1;
}
string uvalue = PyBytes_AsString(putf8);
Py_DECREF(putf8);
string key = self->rclconfig->fieldQCanon(name); string key = self->rclconfig->fieldQCanon(name);
LOGDEB0("Doc_setattr: doc " << self->doc << " [" << key << "] (" << name << LOGDEB0("Doc_setattr: doc " << self->doc << " [" << key << "] (" << name <<
") -> [" << uvalue << "]\n"); ") -> [" << uvalue << "]\n");
// We set the value in the meta array in all cases. Good idea ? or do it // Note that some attributes are set both as struct fields and
// only for fields without a dedicated Doc:: entry? // meta members, keep compat with movedocfields() used when
self->doc->meta[key] = uvalue; // fetching from query.
switch (key.at(0)) { switch (key.at(0)) {
case 't': case 't':
if (!key.compare("text")) { if (key == "text") {
self->doc->text.swap(uvalue); self->doc->text.swap(uvalue);
} }
break; break;
case 'u': case 'u':
if (!key.compare(Rcl::Doc::keyurl)) { if (key == Rcl::Doc::keyurl) {
self->doc->url.swap(uvalue); self->doc->url.swap(uvalue);
printableUrl(self->rclconfig->getDefCharset(), self->doc->url,
self->doc->meta[Rcl::Doc::keyurl]);
} }
break; break;
case 'f': case 'f':
if (!key.compare(Rcl::Doc::keyfs)) { if (key == Rcl::Doc::keyfs) {
self->doc->fbytes.swap(uvalue); self->doc->fbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keyfmt)) { self->doc->meta[Rcl::Doc::keyfs] = self->doc->fbytes;
} else if (key == Rcl::Doc::keyfmt) {
self->doc->fmtime.swap(uvalue); self->doc->fmtime.swap(uvalue);
} }
break; break;
case 'd': case 'd':
if (!key.compare(Rcl::Doc::keyds)) { if (key == Rcl::Doc::keyds) {
self->doc->dbytes.swap(uvalue); self->doc->dbytes.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keydmt)) { self->doc->meta[Rcl::Doc::keyds] = self->doc->dbytes;
} else if (key == Rcl::Doc::keydmt) {
self->doc->dmtime.swap(uvalue); self->doc->dmtime.swap(uvalue);
} }
break; break;
case 'i': case 'i':
if (!key.compare(Rcl::Doc::keyipt)) { if (key == Rcl::Doc::keyipt) {
self->doc->ipath.swap(uvalue); self->doc->ipath.swap(uvalue);
self->doc->meta[Rcl::Doc::keyipt] = self->doc->ipath;
} }
break; break;
case 'm': case 'm':
if (!key.compare(Rcl::Doc::keytp)) { if (key == Rcl::Doc::keytp) {
self->doc->mimetype.swap(uvalue); self->doc->mimetype.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keymt)) { self->doc->meta[Rcl::Doc::keytp] = self->doc->mimetype;
} else if (key == Rcl::Doc::keymt) {
self->doc->dmtime.swap(uvalue); self->doc->dmtime.swap(uvalue);
} }
break; break;
case 'o': case 'o':
if (!key.compare(Rcl::Doc::keyoc)) { if (key == Rcl::Doc::keyoc) {
self->doc->origcharset.swap(uvalue); self->doc->origcharset.swap(uvalue);
} }
break; break;
case 's': case 's':
if (!key.compare(Rcl::Doc::keysig)) { if (key == Rcl::Doc::keysig) {
self->doc->sig.swap(uvalue); self->doc->sig.swap(uvalue);
} else if (!key.compare(Rcl::Doc::keysz)) { } else if (key == Rcl::Doc::keysz) {
self->doc->dbytes.swap(uvalue); self->doc->dbytes.swap(uvalue);
} }
break; break;
@ -697,6 +699,7 @@ Doc_length(recoll_DocObject *self)
static PyObject * static PyObject *
Doc_subscript(recoll_DocObject *self, PyObject *key) Doc_subscript(recoll_DocObject *self, PyObject *key)
{ {
// Can't just return getattro because this first checks for a method name
if (self->doc == 0) { if (self->doc == 0) {
PyErr_SetString(PyExc_AttributeError, "doc??"); PyErr_SetString(PyExc_AttributeError, "doc??");
return NULL; return NULL;
@ -707,18 +710,7 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
return NULL; return NULL;
} }
string name; string name;
if (PyUnicode_Check(key)) { if (pys2cpps(key, name) < 0) {
PyObject* utf8o = PyUnicode_AsUTF8String(key);
if (utf8o == 0) {
LOGERR("Doc_getitemo: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
name = PyBytes_AsString(utf8o);
Py_DECREF(utf8o);
} else if (PyBytes_Check(key)) {
name = PyBytes_AsString(key);
} else {
PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??"); PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??");
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -726,54 +718,61 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
string skey = self->rclconfig->fieldQCanon(name); string skey = self->rclconfig->fieldQCanon(name);
string value; string value;
if (idocget(self, skey, value)) { if (idocget(self, skey, value)) {
return PyUnicode_Decode(value.c_str(), value.size(), "UTF-8","replace"); return PyUnicode_Decode(value.c_str(), value.size(),
"UTF-8", "backslashreplace");
} }
Py_RETURN_NONE; Py_RETURN_NONE;
} }
// mp_ass_subscript slot: implements "doc[key] = val" by delegating to
// Doc_setattro(), so item assignment and attribute assignment share one
// code path.
//
// The interpreter also calls this slot with val == NULL for "del doc[key]".
// Deletion is not supported: raise TypeError here rather than forwarding a
// NULL value to the setter.
//
// Returns -1 with TypeError set for a deletion request, else whatever
// Doc_setattro() returns.
static int
Doc_ass_subscript(recoll_DocObject *self, PyObject *key, PyObject *val)
{
    if (val == NULL) {
        PyErr_SetString(PyExc_TypeError, "Doc does not support item deletion");
        return -1;
    }
    return Doc_setattro(self, key, val);
}
static PyMappingMethods doc_as_mapping = { static PyMappingMethods doc_as_mapping = {
(lenfunc)Doc_length, /*mp_length*/ (lenfunc)Doc_length, /*mp_length*/
(binaryfunc)Doc_subscript, /*mp_subscript*/ (binaryfunc)Doc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/ (objobjargproc)Doc_ass_subscript, /*mp_ass_subscript*/
}; };
PyDoc_STRVAR(doc_DocObject, PyDoc_STRVAR(
"Doc()\n" doc_DocObject,
"\n" "Doc()\n"
"A Doc object contains index data for a given document.\n" "\n"
"The data is extracted from the index when searching, or set by the\n" "A Doc object contains index data for a given document.\n"
"indexer program when updating. The Doc object has no useful methods but\n" "The data is extracted from the index when searching, or set by the\n"
"many attributes to be read or set by its user. It matches exactly the\n" "indexer program when updating. The Doc object has no useful methods but\n"
"Rcl::Doc c++ object. Some of the attributes are predefined, but, \n" "many attributes to be read or set by its user. It matches exactly the\n"
"especially when indexing, others can be set, the name of which will be\n" "Rcl::Doc c++ object. Some of the attributes are predefined, but, \n"
"processed as field names by the indexing configuration.\n" "especially when indexing, others can be set, the name of which will be\n"
"Inputs can be specified as unicode or strings.\n" "processed as field names by the indexing configuration.\n"
"Outputs are unicode objects.\n" "Inputs can be specified as unicode or strings.\n"
"All dates are specified as unix timestamps, printed as strings\n" "Outputs are unicode objects.\n"
"Predefined attributes (index/query/both):\n" "All dates are specified as unix timestamps, printed as strings\n"
" text (index): document plain text\n" "Predefined attributes (index/query/both):\n"
" url (both)\n" " text (index): document plain text\n"
" fbytes (both) optional) file size in bytes\n" " url (both)\n"
" filename (both)\n" " fbytes (both) optional) file size in bytes\n"
" fmtime (both) optional file modification date. Unix time printed \n" " filename (both)\n"
" as string\n" " fmtime (both) optional file modification date. Unix time printed \n"
" dbytes (both) document text bytes\n" " as string\n"
" dmtime (both) document creation/modification date\n" " dbytes (both) document text bytes\n"
" ipath (both) value private to the app.: internal access path\n" " dmtime (both) document creation/modification date\n"
" inside file\n" " ipath (both) value private to the app.: internal access path\n"
" mtype (both) mime type for original document\n" " inside file\n"
" mtime (query) dmtime if set else fmtime\n" " mtype (both) mime type for original document\n"
" origcharset (both) charset the text was converted from\n" " mtime (query) dmtime if set else fmtime\n"
" size (query) dbytes if set, else fbytes\n" " origcharset (both) charset the text was converted from\n"
" sig (both) app-defined file modification signature. \n" " size (query) dbytes if set, else fbytes\n"
" For up to date checks\n" " sig (both) app-defined file modification signature. \n"
" relevancyrating (query)\n" " For up to date checks\n"
" abstract (both)\n" " relevancyrating (query)\n"
" author (both)\n" " abstract (both)\n"
" title (both)\n" " author (both)\n"
" keywords (both)\n" " title (both)\n"
" keywords (both)\n"
); );
PyTypeObject recoll_DocType = { PyTypeObject recoll_DocType = {
@ -784,7 +783,7 @@ PyTypeObject recoll_DocType = {
(destructor)Doc_dealloc, /*tp_dealloc*/ (destructor)Doc_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
0, /*tp_getattr*/ 0, /*tp_getattr*/
(setattrfunc)Doc_setattr, /*tp_setattr*/ 0, /*tp_setattr*/
0, /*tp_compare*/ 0, /*tp_compare*/
0, /*tp_repr*/ 0, /*tp_repr*/
0, /*tp_as_number*/ 0, /*tp_as_number*/
@ -794,7 +793,7 @@ PyTypeObject recoll_DocType = {
0, /*tp_call*/ 0, /*tp_call*/
0, /*tp_str*/ 0, /*tp_str*/
(getattrofunc)Doc_getattro,/*tp_getattro*/ (getattrofunc)Doc_getattro,/*tp_getattro*/
0, /*tp_setattro*/ (setattrofunc)Doc_setattro,/*tp_setattro*/
0, /*tp_as_buffer*/ 0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/ Py_TPFLAGS_DEFAULT, /*tp_flags*/
doc_DocObject, /* tp_doc */ doc_DocObject, /* tp_doc */
@ -2187,6 +2186,11 @@ PyInit__recoll(void)
Py_INCREF(&recoll_QResultStoreType); Py_INCREF(&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore", (PyObject *)&recoll_QResultStoreType); PyModule_AddObject(module, "QResultStore", (PyObject *)&recoll_QResultStoreType);
if (PyType_Ready(&recoll_QRSDocType) < 0)
INITERROR;
Py_INCREF((PyObject*)&recoll_QRSDocType);
PyModule_AddObject(module, "QRSDoc",
(PyObject *)&recoll_QRSDocType);
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
return module; return module;

View File

@ -57,5 +57,6 @@ extern PyTypeObject recoll_DocType;
extern PyTypeObject recoll_QueryType; extern PyTypeObject recoll_QueryType;
extern PyTypeObject rclx_ExtractorType; extern PyTypeObject rclx_ExtractorType;
extern PyTypeObject recoll_QResultStoreType; extern PyTypeObject recoll_QResultStoreType;
extern PyTypeObject recoll_QRSDocType;
#endif // _PYRECOLL_H_INCLUDED_ #endif // _PYRECOLL_H_INCLUDED_

View File

@ -231,91 +231,131 @@ PyTypeObject recoll_QResultStoreType = {
QResultStore_new, /* tp_new */ QResultStore_new, /* tp_new */
}; };
////////////////////////////////////////////////////////////////////////
// QRSDoc iterator
// Instance layout for the Python QRSDoc type: a (store, index) pair
// designating one result inside a QResultStore.
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
// Result store this document belongs to. QRSDoc_init() takes a reference
// on it and QRSDoc_dealloc() releases it.
recoll_QResultStoreObject *pystore;
// Index of the result inside the store, as given to the constructor and
// passed to fieldValue() on lookup.
int index;
} recoll_QRSDocObject;
////////////////////////////////////////////////////////////////////////// static void
// Module methods QRSDoc_dealloc(recoll_QRSDocObject *self)
static PyMethodDef rclrstore_methods[] = {
{NULL, NULL, 0, NULL} /* Sentinel */
};
PyDoc_STRVAR(pyrclrstore_doc_string,
"Utility module for efficiently storing many query results.\n");
struct module_state {
PyObject *error;
};
#if PY_MAJOR_VERSION >= 3
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
#if PY_MAJOR_VERSION >= 3
static int rclrstore_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int rclrstore_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
return 0;
}
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_rclrstore",
NULL,
sizeof(struct module_state),
rclrstore_methods,
NULL,
rclrstore_traverse,
rclrstore_clear,
NULL
};
#define INITERROR return NULL
extern "C" PyObject *
PyInit__rclrstore(void)
#else
#define INITERROR return
PyMODINIT_FUNC
init__rclrstore(void)
#endif
{ {
// Note: we can't call recollinit here, because the confdir is only really Py_DECREF(self->pystore);
// known when the first db object is created (it is an optional parameter). Py_TYPE(self)->tp_free((PyObject*)self);
// Using a default here may end up with variables such as stripchars being
// wrong
#if PY_MAJOR_VERSION >= 3
PyObject *module = PyModule_Create(&moduledef);
#else
PyObject *module = Py_InitModule("_rclrstore", rclrstore_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
// The first parameter is a char *. Hopefully we don't initialize
// modules too often...
st->error = PyErr_NewException(strdup("_rclrstore.Error"), NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (PyType_Ready(&recoll_QResultStoreType) < 0)
INITERROR;
Py_INCREF((PyObject*)&recoll_QResultStoreType);
PyModule_AddObject(module, "QResultStore",
(PyObject *)&recoll_QResultStoreType);
PyModule_AddStringConstant(module, "__doc__", pyrclrstore_doc_string);
#if PY_MAJOR_VERSION >= 3
return module;
#endif
} }
// tp_new slot for QRSDoc: obtain the object from the type's allocator and
// hand it back as is. The store and index are filled in by QRSDoc_init().
// The args and kwds parameters are not used. Returns NULL when the
// allocation fails.
static PyObject *
QRSDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *obj = type->tp_alloc(type, 0);
    return obj;
}
// Docstring for the QRSDoc type, referenced as tp_doc by recoll_QRSDocType.
PyDoc_STRVAR(qrs_doc_QRSDocObject,
"QRSDoc(resultstore, index)\n"
"\n"
"A QRSDoc gives access to one result from a qresultstore.\n"
);
// tp_init slot: QRSDoc(resultstore, index).
//
// Parses a QResultStore object and an int index from the positional
// arguments and records both in self, taking a reference on the store.
// kwargs is not used.
//
// Returns 0 on success, -1 (exception set by PyArg_ParseTuple) when the
// arguments do not match.
static int
QRSDoc_init(
    recoll_QRSDocObject *self, PyObject *args, PyObject *kwargs)
{
    recoll_QResultStoreObject *pystore = nullptr;
    int index = 0;
    if (!PyArg_ParseTuple(args, "O!i",
                          &recoll_QResultStoreType, &pystore, &index)) {
        return -1;
    }

    // __init__ may be called more than once on the same object: release the
    // store kept from a previous call instead of leaking it. The new
    // reference is taken first so re-initializing with the same store is
    // safe. Assumes the allocator zero-fills the object, so pystore is NULL
    // on the first call -- TODO confirm.
    recoll_QResultStoreObject *previous = self->pystore;
    Py_INCREF(pystore);
    self->pystore = pystore;
    self->index = index;
    Py_XDECREF(previous);
    return 0;
}
// mp_subscript slot: implements qrsdoc[key].
//
// key must be a str (encoded to UTF-8) or a bytes object naming a field.
// The field is looked up in the store for this object's index.
//
// Returns a new bytes object holding the field value, None when the store
// has no value for that field, or NULL with AttributeError set when the
// object has no store or the key cannot be used. Error paths return NULL:
// a slot must not return a value (even None) while an exception is set.
static PyObject *
QRSDoc_subscript(recoll_QRSDocObject *self, PyObject *key)
{
    if (self->pystore == 0) {
        PyErr_SetString(PyExc_AttributeError, "store??");
        return NULL;
    }

    string name;
    if (PyUnicode_Check(key)) {
        PyObject* utf8o = PyUnicode_AsUTF8String(key);
        if (utf8o == 0) {
            PyErr_SetString(PyExc_AttributeError, "name??");
            return NULL;
        }
        name = PyBytes_AsString(utf8o);
        Py_DECREF(utf8o);
    } else if (PyBytes_Check(key)) {
        name = PyBytes_AsString(key);
    } else {
        PyErr_SetString(PyExc_AttributeError, "key not unicode nor string??");
        return NULL;
    }

    const char *value = self->pystore->store->fieldValue(self->index, name);
    if (nullptr == value) {
        // Unknown field: not an error, the caller gets None.
        Py_RETURN_NONE;
    }
    return PyBytes_FromString(value);
}
// Mapping protocol table for QRSDoc. Only item read access (mp_subscript)
// is provided: there is no length function and no item assignment.
static PyMappingMethods qrsdoc_as_mapping = {
(lenfunc)0, /*mp_length*/
(binaryfunc)QRSDoc_subscript, /*mp_subscript*/
(objobjargproc)0, /*mp_ass_subscript*/
};
// Method table for QRSDoc: it holds no methods, only the terminating
// sentinel entry.
static PyMethodDef QRSDoc_methods[] = {
{NULL} /* Sentinel */
};
// Type object for _recoll.QRSDoc.
// The initializer is positional, so the order of the entries must follow
// the PyTypeObject slot order exactly. The slots in use are: dealloc, the
// mapping protocol table (item read access), the docstring, the (empty)
// method table, init and new. Py_TPFLAGS_BASETYPE allows subclassing.
// NOTE(review): the slot name comments follow the historical layout; on
// recent Python 3 versions the tp_print and tp_compare positions hold
// differently named slots -- confirm against the targeted headers.
PyTypeObject recoll_QRSDocType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_recoll.QRSDoc", /*tp_name*/
sizeof(recoll_QRSDocObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)QRSDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
&qrsdoc_as_mapping, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
qrs_doc_QRSDocObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
QRSDoc_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)QRSDoc_init, /* tp_init */
0, /* tp_alloc */
QRSDoc_new, /* tp_new */
};

View File

@ -20,4 +20,4 @@
# now: maybe we'll do something with them in the future). # now: maybe we'll do something with them in the future).
from ._recoll import QResultStore from ._recoll import QResultStore, QRSDoc