diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index f8ec8392..16885cc8 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -7307,6 +7307,26 @@ for doc in results: also perform highlighting. See the highlight method.

+
Query.getsnippets(doc, + maxoccs = -1, ctxwords = -1, sortbypage=False, + methods = object)
+
+

Will return a list of extracts from the + result document by selecting text around the + match terms. Each entry in the result list is a + triple: page number, term, text. By default, + the most relevant snippets appear first in the + list. Set sortbypage to sort by page + number instead. If methods is set, the fragments + will be highlighted (see the highlight method). + If maxoccs is set, + it defines the maximum result list length. + ctxwords allows + adjusting the individual snippet context + size.

+
Query.__iter__() and Query.next()
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index 76019ab0..385ead62 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -5601,6 +5601,22 @@ for doc in results: + + Query.getsnippets(doc, maxoccs = -1, ctxwords = -1, + sortbypage=False, methods = object) + Will return a list of extracts from the result + document by selecting text around the match terms. Each + entry in the result list is a triple: page number, term, + text. By default, the most relevants snippets appear first + in the list. Set sortbypage to sort by + page number instead. If methods is set, + the fragments will be highlighted (see the highlight + method). If maxoccs is set, it defines + the maximum result list + length. ctxwords allows adjusting the + individual snippet context size. + + Query.__iter__() and Query.next() So that things like diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index 8b48663e..9bde5fdf 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -1210,42 +1210,38 @@ PyDoc_STRVAR(doc_Query_highlight, class PyPlainToRich: public PlainToRich { public: + PyPlainToRich() {} PyPlainToRich(PyObject *methods, bool eolbr = false) - : m_methods(methods) - { - m_eolbr = eolbr; - } - virtual ~PyPlainToRich() - { - } - virtual string startMatch(unsigned int idx) - { - PyObject *res = 0; - if (m_methods) - res = PyObject_CallMethod(m_methods, (char *)"startMatch", - (char *)"(i)", idx); - if (res == 0) - return ""; - PyObject *res1 = res; - if (PyUnicode_Check(res)) - res1 = PyUnicode_AsUTF8String(res); - return PyBytes_AsString(res1); - } + : m_methods(methods) { + m_eolbr = eolbr; + } + virtual ~PyPlainToRich() {} + virtual string startMatch(unsigned int idx) { + PyObject *res = 0; + if (m_methods) + res = PyObject_CallMethod(m_methods, (char *)"startMatch", + (char *)"(i)", idx); + if (res == 0) + return ""; + PyObject *res1 = res; + if (PyUnicode_Check(res)) + res1 = 
PyUnicode_AsUTF8String(res); + return PyBytes_AsString(res1); + } - virtual string endMatch() - { - PyObject *res = 0; - if (m_methods) - res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0); - if (res == 0) - return ""; - PyObject *res1 = res; - if (PyUnicode_Check(res)) - res1 = PyUnicode_AsUTF8String(res); - return PyBytes_AsString(res1); - } + virtual string endMatch() { + PyObject *res = 0; + if (m_methods) + res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0); + if (res == 0) + return ""; + PyObject *res1 = res; + if (PyUnicode_Check(res)) + res1 = PyUnicode_AsUTF8String(res); + return PyBytes_AsString(res1); + } - PyObject *m_methods; + PyObject *m_methods{nullptr}; }; static PyObject * @@ -1373,6 +1369,78 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs) "UTF-8", "replace"); } +PyDoc_STRVAR(doc_Query_getsnippets, + "getsnippets(doc, maxoccs = -1, ctxwords = -1, sortbypage=False, methods = object))\n" + "Will return a list of snippets for doc by selecting text around the match terms\n" + "If methods is set, will also perform highlighting. 
See the highlight method\n" + ); + +static PyObject * +Query_getsnippets(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) +{ + LOGDEB0("Query_getSnippets\n"); + static const char *kwlist[] = {"doc", "methods", "maxoccs", "ctxwords", "sortbypage", NULL}; + recoll_DocObject *pydoc = 0; + PyObject *hlmethods = 0; + int maxoccs = -1; + int ctxwords = -1; + PyObject *osortbp = 0; + bool sortbypage = false; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OiiO:Query_getSnippets", + (char **)kwlist, + &recoll_DocType, &pydoc, + &hlmethods, + &maxoccs, + &ctxwords, + &osortbp)) { + return 0; + } + if (osortbp && PyObject_IsTrue(osortbp)) + sortbypage = true; + + if (pydoc->doc == 0) { + LOGERR("Query_makeDocAbstract: doc not found " << pydoc->doc << "\n"); + PyErr_SetString(PyExc_AttributeError, "doc"); + return 0; + } + if (self->query == 0) { + LOGERR("Query_makeDocAbstract: query not found " << self->query<< "\n"); + PyErr_SetString(PyExc_AttributeError, "query"); + return 0; + } + std::shared_ptr sd = self->query->getSD(); + if (!sd) { + PyErr_SetString(PyExc_ValueError, "Query not initialized"); + return 0; + } + std::vector snippets; + self->query->makeDocAbstract(*(pydoc->doc), snippets, maxoccs, ctxwords, sortbypage); + PyObject *sniplist = PyList_New(snippets.size()); + int i = 0; + HighlightData hldata; + PyPlainToRich hler; + if (hlmethods) { + sd->getTerms(hldata); + hler = PyPlainToRich(hlmethods); + hler.set_inputhtml(0); + } + for (const auto& snip : snippets) { + const std::string *textp = &snip.snippet; + list lr; + if (hlmethods) { + hler.plaintorich(snip.snippet, lr, hldata); + textp = &lr.front(); + } + PyList_SetItem( + sniplist, i++, + Py_BuildValue( + "(iOO)", snip.page, + PyUnicode_Decode(snip.term.c_str(), snip.term.size(), "UTF-8", "replace"), + PyUnicode_Decode(textp->c_str(), textp->size(), "UTF-8", "replace"))); + } + return sniplist; +} + PyDoc_STRVAR(doc_Query_getxquery, "getxquery(None) -> Unicode string\n" "\n" @@ -1483,6 
+1551,8 @@ static PyMethodDef Query_methods[] = { doc_Query_getgroups}, {"makedocabstract", (PyCFunction)Query_makedocabstract, METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract}, + {"getsnippets", (PyCFunction)Query_getsnippets, + METH_VARARGS|METH_KEYWORDS, doc_Query_getsnippets}, {"scroll", (PyCFunction)Query_scroll, METH_VARARGS|METH_KEYWORDS, doc_Query_scroll}, {NULL} /* Sentinel */ diff --git a/src/python/samples/snippets.py b/src/python/samples/snippets.py new file mode 100644 index 00000000..068304f9 --- /dev/null +++ b/src/python/samples/snippets.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 + +from recoll import recoll + +db = recoll.connect() +q = db.query() +q.execute("orographic") + +class HL: + def startMatch(self, i): + return "" + def endMatch(self): + return ""; + +hlmeths = HL() + +for doc in q: + print("DOC %s" % doc.title) + snippets = q.getsnippets(doc, maxoccs=-1, ctxwords=10, methods=hlmeths, sortbypage=False) + print("Got %d snippets" % len(snippets)) + for snip in snippets: + try: + print("Page %d term [%s] snippet [%s]" % (snip[0], snip[1], snip[2])) + except Exception as ex: + print("Print failed: %s" % ex) diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index c3d14b78..795d14a9 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -297,15 +297,14 @@ bool Query::makeDocAbstract(const Doc &doc, vector& abstract) vector vpabs; if (!makeDocAbstract(doc, vpabs)) return false; - for (vector::const_iterator it = vpabs.begin(); - it != vpabs.end(); it++) { + for (const auto& snippet : vpabs) { string chunk; - if (it->page > 0) { + if (snippet.page > 0) { ostringstream ss; - ss << it->page; + ss << snippet.page; chunk += string(" [p ") + ss.str() + "] "; } - chunk += it->snippet; + chunk += snippet.snippet; abstract.push_back(chunk); } return true; @@ -316,9 +315,8 @@ bool Query::makeDocAbstract(const Doc &doc, string& abstract) vector vpabs; if (!makeDocAbstract(doc, vpabs)) return false; - for (vector::const_iterator it 
= vpabs.begin(); - it != vpabs.end(); it++) { - abstract.append(it->snippet); + for (const auto& snippet : vpabs) { + abstract.append(snippet.snippet); abstract.append(cstr_ellipsis); } return m_reason.empty() ? true : false;