python module: termMatch: add option to return term frequencies

This commit is contained in:
Jean-Francois Dockes 2018-04-13 14:30:43 +02:00
parent 4ffdbc43f2
commit 92e172cad0

View File

@ -50,6 +50,8 @@ static RclConfig *rclconfig;
// Python major-version compatibility shims.
// When PY_MAJOR_VERSION >= 3, Py_TPFLAGS_HAVE_ITER is defined as 0.
// Otherwise PyLong_FromLong is aliased to PyInt_FromLong, so calls written
// as PyLong_FromLong resolve to PyInt_FromLong in that build.
#if PY_MAJOR_VERSION >=3 #if PY_MAJOR_VERSION >=3
# define Py_TPFLAGS_HAVE_ITER 0 # define Py_TPFLAGS_HAVE_ITER 0
#else
#define PyLong_FromLong PyInt_FromLong
#endif #endif
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
@ -1738,35 +1740,38 @@ Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
// Python-visible docstring for termMatch().
// NOTE(review): the updated signature text lists lang before freqs, whereas
// the keyword list in Db_termMatch orders "freqs" before "lang". Positional
// callers therefore see a different argument order than the one documented
// here, and callers that passed lang positionally before this change would
// now have that value bound to freqs -- confirm which order is intended.
PyDoc_STRVAR(doc_Db_termMatch, PyDoc_STRVAR(doc_Db_termMatch,
"termMatch(match_type='wildcard|regexp|stem', expr, field='', " "termMatch(match_type='wildcard|regexp|stem', expr, field='', "
"maxlen=-1, casesens=False, diacsens=False, lang='english')" "maxlen=-1, casesens=False, diacsens=False, lang='english', freqs=False)"
" returns the expanded term list\n" " returns the expanded term list\n"
"\n" "\n"
"Expands the input expression according to the mode and parameters and " "Expands the input expression according to the mode and parameters and "
"returns the expanded term list.\n" "returns the expanded term list, as raw terms if freqs is False, or "
"(term, totcnt, docnt) tuples if freqs is True.\n"
); );
static PyObject * static PyObject *
Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs) Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs)
{ {
LOGDEB0("Db_termMatch\n"); LOGDEB0("Db_termMatch\n");
static const char *kwlist[] = {"type", "expr", "field", "maxlen", static const char *kwlist[] = {"type", "expr", "field", "maxlen",
"casesens", "diacsens", "lang", NULL}; "casesens", "diacsens", "freqs", "lang", NULL};
char *tp = 0; char *tp = 0;
char *expr = 0; // needs freeing char *expr = 0; // needs freeing
char *field = 0; // needs freeing char *field = 0; // needs freeing
int maxlen = -1; int maxlen = -1;
PyObject *casesens = 0; PyObject *casesens = 0;
PyObject *diacsens = 0; PyObject *diacsens = 0;
PyObject *freqs = 0;
char *lang = 0; // needs freeing char *lang = 0; // needs freeing
PyObject *ret = 0; PyObject *ret = 0;
int typ_sens = 0; int typ_sens = 0;
Rcl::TermMatchResult result; Rcl::TermMatchResult result;
bool showfreqs = false;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ses|esiOOes", if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ses|esiOOOes",
(char**)kwlist, (char**)kwlist,
&tp, "utf-8", &expr, "utf-8", &field, &tp, "utf-8", &expr, "utf-8", &field,
&maxlen, &casesens, &maxlen,
&diacsens, "utf-8", &lang)) &casesens, &diacsens, &freqs,
"utf-8", &lang))
return 0; return 0;
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
@ -1792,18 +1797,31 @@ Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs)
if (diacsens != 0 && PyObject_IsTrue(diacsens)) { if (diacsens != 0 && PyObject_IsTrue(diacsens)) {
typ_sens |= Rcl::Db::ET_DIACSENS; typ_sens |= Rcl::Db::ET_DIACSENS;
} }
if (freqs != 0 && PyObject_IsTrue(freqs)) {
showfreqs = true;
}
if (!self->db->termMatch(typ_sens, lang ? lang : "english", if (!self->db->termMatch(typ_sens, lang ? lang : "english",
expr, result, maxlen, field ? field : "")) { expr, result, maxlen, field ? field : "")) {
LOGERR("Db_termMatch: db termMatch error\n"); LOGERR("Db_termMatch: db termMatch error\n");
PyErr_SetString(PyExc_AttributeError, "rcldb termMatch error"); PyErr_SetString(PyExc_AttributeError, "rcldb termMatch error");
goto out; goto out;
} }
ret = PyList_New(result.entries.size()); ret = PyList_New(result.entries.size());
for (unsigned int i = 0; i < result.entries.size(); i++) { for (unsigned int i = 0; i < result.entries.size(); i++) {
PyList_SetItem(ret, i, PyObject *term = PyUnicode_FromString(
PyUnicode_FromString( Rcl::strip_prefix(result.entries[i].term).c_str());
Rcl::strip_prefix(result.entries[i].term).c_str())); if (showfreqs) {
PyObject *totcnt = PyLong_FromLong(result.entries[i].wcf);
PyObject *doccnt = PyLong_FromLong(result.entries[i].docs);
PyObject *tup = PyTuple_New(3);
PyTuple_SetItem(tup, 0, term);
PyTuple_SetItem(tup, 1, totcnt);
PyTuple_SetItem(tup, 2, doccnt);
PyList_SetItem(ret, i, tup);
} else {
PyList_SetItem(ret, i, term);
}
} }
out: out: