diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html
index f8ec8392..16885cc8 100644
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@@ -7307,6 +7307,26 @@ for doc in results:
also perform highlighting. See the highlight
method.
+ Query.getsnippets(doc,
+ methods = object, maxoccs = -1, ctxwords = -1,
+ sortbypage=False)
+
+ Will return a list of extracts from the
+ result document by selecting text around the
+ match terms. Each entry in the result list is a
+ triple: page number, term, text. By default,
+ the most relevant snippets appear first in the
+ list. Set sortbypage to sort by page
+ number instead. If methods is set, the fragments
+ will be highlighted (see the highlight method).
+ If maxoccs is set,
+ it defines the maximum result list length.
+ ctxwords allows
+ adjusting the individual snippet context
+ size.
+
Query.__iter__() and
Query.next()
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index 76019ab0..385ead62 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -5601,6 +5601,22 @@ for doc in results:
+
+ Query.getsnippets(doc, methods = object, maxoccs = -1,
+ ctxwords = -1, sortbypage=False)
+ Will return a list of extracts from the result
+ document by selecting text around the match terms. Each
+ entry in the result list is a triple: page number, term,
+ text. By default, the most relevant snippets appear first
+ in the list. Set sortbypage to sort by
+ page number instead. If methods is set,
+ the fragments will be highlighted (see the highlight
+ method). If maxoccs is set, it defines
+ the maximum result list
+ length. ctxwords allows adjusting the
+ individual snippet context size.
+
+
Query.__iter__() and Query.next()
So that things like
diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp
index 8b48663e..9bde5fdf 100644
--- a/src/python/recoll/pyrecoll.cpp
+++ b/src/python/recoll/pyrecoll.cpp
@@ -1210,42 +1210,38 @@ PyDoc_STRVAR(doc_Query_highlight,
class PyPlainToRich: public PlainToRich {
public:
+ PyPlainToRich() {} // no callbacks object: startMatch()/endMatch() return ""
PyPlainToRich(PyObject *methods, bool eolbr = false)
- : m_methods(methods)
- {
- m_eolbr = eolbr;
- }
- virtual ~PyPlainToRich()
- {
- }
- virtual string startMatch(unsigned int idx)
- {
- PyObject *res = 0;
- if (m_methods)
- res = PyObject_CallMethod(m_methods, (char *)"startMatch",
- (char *)"(i)", idx);
- if (res == 0)
- return "";
- PyObject *res1 = res;
- if (PyUnicode_Check(res))
- res1 = PyUnicode_AsUTF8String(res);
- return PyBytes_AsString(res1);
- }
+ : m_methods(methods) {
+ m_eolbr = eolbr;
+ }
+ virtual ~PyPlainToRich() {}
+ virtual string startMatch(unsigned int idx) {
+ // Call the Python callback, copy its result, then drop our references.
+ PyObject *res = m_methods ? PyObject_CallMethod(
+ m_methods, (char *)"startMatch", (char *)"(i)", idx) : 0;
+ PyObject *res1 = (res && PyUnicode_Check(res)) ? PyUnicode_AsUTF8String(res) : res;
+ const char *cp = res1 ? PyBytes_AsString(res1) : 0; // 0 if not str/bytes
+ string out(cp ? cp : "");
+ if (res1 != res)
+ Py_XDECREF(res1);
+ Py_XDECREF(res);
+ return out;
+ }
- virtual string endMatch()
- {
- PyObject *res = 0;
- if (m_methods)
- res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
- if (res == 0)
- return "";
- PyObject *res1 = res;
- if (PyUnicode_Check(res))
- res1 = PyUnicode_AsUTF8String(res);
- return PyBytes_AsString(res1);
- }
+ virtual string endMatch() {
+ // Same reference handling as startMatch(): copy the text, release the objects.
+ PyObject *res = m_methods ? PyObject_CallMethod(m_methods, (char *)"endMatch", 0) : 0;
+ PyObject *res1 = (res && PyUnicode_Check(res)) ? PyUnicode_AsUTF8String(res) : res;
+ const char *cp = res1 ? PyBytes_AsString(res1) : 0; // 0 if not str/bytes
+ string out(cp ? cp : "");
+ if (res1 != res)
+ Py_XDECREF(res1);
+ Py_XDECREF(res);
+ return out;
+ }
- PyObject *m_methods;
+ PyObject *m_methods{nullptr}; // callbacks object; stored as passed, no reference is taken
};
static PyObject *
@@ -1373,6 +1369,78 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
"UTF-8", "replace");
}
+PyDoc_STRVAR(doc_Query_getsnippets,
+ "getsnippets(doc, methods = object, maxoccs = -1, ctxwords = -1, sortbypage=False)\n"
+ "Will return a list of snippets for doc by selecting text around the match terms\n"
+ "If methods is set, will also perform highlighting. See the highlight method\n"
+ );
+
+// Python method Query.getsnippets(): returns a list of (page, term, text)
+// tuples built from the snippets makeDocAbstract() computes for doc. If a
+// methods object is given, each text is first run through PyPlainToRich.
+// Positional order after doc is: methods, maxoccs, ctxwords, sortbypage.
+static PyObject *
+Query_getsnippets(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
+{
+ LOGDEB0("Query_getSnippets\n");
+ static const char *kwlist[] = {"doc", "methods", "maxoccs", "ctxwords", "sortbypage", NULL};
+ recoll_DocObject *pydoc = 0;
+ PyObject *hlmethods = 0;
+ int maxoccs = -1;
+ int ctxwords = -1;
+ PyObject *osortbp = 0;
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OiiO:Query_getSnippets",
+ (char **)kwlist, &recoll_DocType, &pydoc,
+ &hlmethods, &maxoccs, &ctxwords, &osortbp)) {
+ return 0;
+ }
+ const bool sortbypage = osortbp && PyObject_IsTrue(osortbp);
+
+ if (pydoc->doc == 0) {
+ LOGERR("Query_getSnippets: doc not found " << pydoc->doc << "\n");
+ PyErr_SetString(PyExc_AttributeError, "doc");
+ return 0;
+ }
+ if (self->query == 0) {
+ LOGERR("Query_getSnippets: query not found " << self->query<< "\n");
+ PyErr_SetString(PyExc_AttributeError, "query");
+ return 0;
+ }
+ std::shared_ptr sd = self->query->getSD();
+ if (!sd) {
+ PyErr_SetString(PyExc_ValueError, "Query not initialized");
+ return 0;
+ }
+ std::vector snippets;
+ self->query->makeDocAbstract(*(pydoc->doc), snippets, maxoccs, ctxwords, sortbypage);
+ PyObject *sniplist = PyList_New(snippets.size());
+ if (sniplist == 0)
+ return 0;
+ int i = 0;
+ HighlightData hldata;
+ PyPlainToRich hler;
+ if (hlmethods) {
+ sd->getTerms(hldata);
+ hler = PyPlainToRich(hlmethods);
+ hler.set_inputhtml(0);
+ }
+ for (const auto& snip : snippets) {
+ const std::string *textp = &snip.snippet;
+ list lr;
+ if (hlmethods) {
+ hler.plaintorich(snip.snippet, lr, hldata);
+ textp = &lr.front();
+ }
+ PyList_SetItem(
+ sniplist, i++,
+ Py_BuildValue(
+ "(iNN)", snip.page, // N: the tuple takes over the new references below
+ PyUnicode_Decode(snip.term.c_str(), snip.term.size(), "UTF-8", "replace"),
+ PyUnicode_Decode(textp->c_str(), textp->size(), "UTF-8", "replace")));
+ }
+ return sniplist;
+}
+
PyDoc_STRVAR(doc_Query_getxquery,
"getxquery(None) -> Unicode string\n"
"\n"
@@ -1483,6 +1551,8 @@ static PyMethodDef Query_methods[] = {
doc_Query_getgroups},
{"makedocabstract", (PyCFunction)Query_makedocabstract,
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
+ {"getsnippets", (PyCFunction)Query_getsnippets,
+ METH_VARARGS|METH_KEYWORDS, doc_Query_getsnippets},
{"scroll", (PyCFunction)Query_scroll,
METH_VARARGS|METH_KEYWORDS, doc_Query_scroll},
{NULL} /* Sentinel */
diff --git a/src/python/samples/snippets.py b/src/python/samples/snippets.py
new file mode 100644
index 00000000..068304f9
--- /dev/null
+++ b/src/python/samples/snippets.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+# Sample: run a query and print the match snippets for each result document.
+from recoll import recoll
+
+db = recoll.connect()
+q = db.query()
+q.execute("orographic")
+# Callbacks object passed as 'methods': supplies the start/end match markup.
+class HL:
+    def startMatch(self, i):
+        return ""
+    def endMatch(self):
+        return ""
+
+hlmeths = HL()
+
+for doc in q:
+    print("DOC %s" % doc.title)
+    snippets = q.getsnippets(doc, maxoccs=-1, ctxwords=10, methods=hlmeths, sortbypage=False)
+    print("Got %d snippets" % len(snippets))
+    for page, term, text in snippets:
+        try:
+            print("Page %d term [%s] snippet [%s]" % (page, term, text))
+        except Exception as ex:
+            print("Print failed: %s" % ex)
diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp
index c3d14b78..795d14a9 100644
--- a/src/rcldb/rclquery.cpp
+++ b/src/rcldb/rclquery.cpp
@@ -297,15 +297,14 @@ bool Query::makeDocAbstract(const Doc &doc, vector& abstract)
vector vpabs;
if (!makeDocAbstract(doc, vpabs))
return false;
- for (vector::const_iterator it = vpabs.begin();
- it != vpabs.end(); it++) {
+ for (const auto& snippet : vpabs) {
string chunk;
- if (it->page > 0) {
+ if (snippet.page > 0) {
ostringstream ss;
- ss << it->page;
+ ss << snippet.page;
chunk += string(" [p ") + ss.str() + "] ";
}
- chunk += it->snippet;
+ chunk += snippet.snippet;
abstract.push_back(chunk);
}
return true;
@@ -316,9 +315,8 @@ bool Query::makeDocAbstract(const Doc &doc, string& abstract)
vector vpabs;
if (!makeDocAbstract(doc, vpabs))
return false;
- for (vector::const_iterator it = vpabs.begin();
- it != vpabs.end(); it++) {
- abstract.append(it->snippet);
+ for (const auto& snippet : vpabs) {
+ abstract.append(snippet.snippet);
abstract.append(cstr_ellipsis);
}
return m_reason.empty() ? true : false;