Python module: add method to retrieve the full snippets list

This commit is contained in:
Jean-Francois Dockes 2021-03-10 13:29:22 +01:00
parent 9b24501f57
commit f57530e2a6
5 changed files with 170 additions and 41 deletions

View File

@ -7307,6 +7307,26 @@ for doc in results:
also perform highlighting. See the highlight
method.</p>
</dd>
<dt><span class="term">Query.getsnippets(doc,
maxoccs = -1, ctxwords = -1, sortbypage=False,
methods = object)</span></dt>
<dd>
<p>Will return a list of extracts from the
result document by selecting text around the
match terms. Each entry in the result list is a
triple: page number, term, text. By default,
the most relevant snippets appear first in the
list. Set <code class=
"literal">sortbypage</code> to sort by page
number instead. If <code class=
"literal">methods</code> is set, the fragments
will be highlighted (see the highlight method).
If <code class="literal">maxoccs</code> is set,
it defines the maximum result list length.
<code class="literal">ctxwords</code> allows
adjusting the individual snippet context
size.</p>
</dd>
<dt><span class="term">Query.__iter__() and
Query.next()</span></dt>
<dd>

View File

@ -5601,6 +5601,22 @@ for doc in results:
</para></listitem>
</varlistentry>
<varlistentry>
<term>Query.getsnippets(doc, maxoccs = -1, ctxwords = -1,
sortbypage=False, methods = object)</term>
<listitem><para>Will return a list of extracts from the result
document by selecting text around the match terms. Each
entry in the result list is a triple: page number, term,
text. By default, the most relevant snippets appear first
in the list. Set <literal>sortbypage</literal> to sort by
page number instead. If <literal>methods</literal> is set,
the fragments will be highlighted (see the highlight
method). If <literal>maxoccs</literal> is set, it defines
the maximum result list
length. <literal>ctxwords</literal> allows adjusting the
individual snippet context size. </para></listitem>
</varlistentry>
<varlistentry>
<term>Query.__iter__() and Query.next()</term>
<listitem><para>So that things like

View File

@ -1210,16 +1210,13 @@ PyDoc_STRVAR(doc_Query_highlight,
class PyPlainToRich: public PlainToRich {
public:
PyPlainToRich() {}
PyPlainToRich(PyObject *methods, bool eolbr = false)
: m_methods(methods)
{
: m_methods(methods) {
m_eolbr = eolbr;
}
virtual ~PyPlainToRich()
{
}
virtual string startMatch(unsigned int idx)
{
virtual ~PyPlainToRich() {}
virtual string startMatch(unsigned int idx) {
PyObject *res = 0;
if (m_methods)
res = PyObject_CallMethod(m_methods, (char *)"startMatch",
@ -1232,8 +1229,7 @@ public:
return PyBytes_AsString(res1);
}
virtual string endMatch()
{
virtual string endMatch() {
PyObject *res = 0;
if (m_methods)
res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
@ -1245,7 +1241,7 @@ public:
return PyBytes_AsString(res1);
}
PyObject *m_methods;
PyObject *m_methods{nullptr};
};
static PyObject *
@ -1373,6 +1369,78 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
"UTF-8", "replace");
}
// Docstring for Query.getsnippets().
// NOTE(review): Query_getsnippets() parses its optional positional arguments in kwlist order
// (methods, maxoccs, ctxwords, sortbypage), which differs from the order shown here; callers
// should pass the optional arguments by keyword.
PyDoc_STRVAR(doc_Query_getsnippets,
"getsnippets(doc, maxoccs = -1, ctxwords = -1, sortbypage=False, methods = object)\n"
"Will return a list of snippets for doc by selecting text around the match terms.\n"
"Each entry in the list is a (page number, term, text) triple.\n"
"If methods is set, will also perform highlighting. See the highlight method.\n"
);
// Python method Query.getsnippets(): return the snippets list for a result document.
//
// Arguments, parsed from args/kwargs: doc (required, must be a recoll Doc object), then the
// optional methods, maxoccs, ctxwords and sortbypage (this is also the positional order).
// maxoccs, ctxwords and sortbypage are passed through to Rcl::Query::makeDocAbstract().
// If methods is given, each snippet text goes through PyPlainToRich, which calls the object's
// startMatch()/endMatch() methods to mark up the matches.
//
// Returns a new list of (page, term, text) tuples, or NULL with a Python exception set.
static PyObject *
Query_getsnippets(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB0("Query_getSnippets\n");
    static const char *kwlist[] = {"doc", "methods", "maxoccs", "ctxwords", "sortbypage", NULL};
    recoll_DocObject *pydoc = 0;
    PyObject *hlmethods = 0;
    int maxoccs = -1;
    int ctxwords = -1;
    PyObject *osortbp = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OiiO:Query_getSnippets",
                                     (char **)kwlist,
                                     &recoll_DocType, &pydoc,
                                     &hlmethods,
                                     &maxoccs,
                                     &ctxwords,
                                     &osortbp)) {
        return 0;
    }

    bool sortbypage = false;
    if (osortbp) {
        // PyObject_IsTrue() returns -1 on failure: propagate the exception instead of
        // silently treating the error as "true".
        int truth = PyObject_IsTrue(osortbp);
        if (truth < 0)
            return 0;
        sortbypage = (truth != 0);
    }

    if (pydoc->doc == 0) {
        LOGERR("Query_getsnippets: doc not found " << pydoc->doc << "\n");
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }
    if (self->query == 0) {
        LOGERR("Query_getsnippets: query not found " << self->query<< "\n");
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }
    std::shared_ptr<Rcl::SearchData> sd = self->query->getSD();
    if (!sd) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    std::vector<Rcl::Snippet> snippets;
    self->query->makeDocAbstract(*(pydoc->doc), snippets, maxoccs, ctxwords, sortbypage);

    PyObject *sniplist = PyList_New(static_cast<Py_ssize_t>(snippets.size()));
    if (sniplist == 0)
        return 0;

    // The highlighter and the match terms data are only set up if the caller supplied methods.
    HighlightData hldata;
    PyPlainToRich hler;
    if (hlmethods) {
        sd->getTerms(hldata);
        hler = PyPlainToRich(hlmethods);
        hler.set_inputhtml(0);
    }

    Py_ssize_t idx = 0;
    for (const auto& snip : snippets) {
        const std::string *textp = &snip.snippet;
        list<string> lr;
        if (hlmethods) {
            hler.plaintorich(snip.snippet, lr, hldata);
            // Fall back to the raw snippet rather than dereferencing an empty list.
            if (!lr.empty())
                textp = &lr.front();
        }

        PyObject *term =
            PyUnicode_Decode(snip.term.c_str(), snip.term.size(), "UTF-8", "replace");
        PyObject *text =
            PyUnicode_Decode(textp->c_str(), textp->size(), "UTF-8", "replace");
        PyObject *entry = 0;
        if (term && text)
            entry = Py_BuildValue("(iOO)", snip.page, term, text);
        // The "O" format takes its own references: drop ours so that the strings are not leaked.
        Py_XDECREF(term);
        Py_XDECREF(text);
        if (entry == 0) {
            // A Python exception is set. Releasing a partially filled list is safe.
            Py_DECREF(sniplist);
            return 0;
        }
        // PyList_SetItem() steals the reference to entry.
        PyList_SetItem(sniplist, idx++, entry);
    }
    return sniplist;
}
PyDoc_STRVAR(doc_Query_getxquery,
"getxquery(None) -> Unicode string\n"
"\n"
@ -1483,6 +1551,8 @@ static PyMethodDef Query_methods[] = {
doc_Query_getgroups},
{"makedocabstract", (PyCFunction)Query_makedocabstract,
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
{"getsnippets", (PyCFunction)Query_getsnippets,
METH_VARARGS|METH_KEYWORDS, doc_Query_getsnippets},
{"scroll", (PyCFunction)Query_scroll,
METH_VARARGS|METH_KEYWORDS, doc_Query_scroll},
{NULL} /* Sentinel */

View File

@ -0,0 +1,25 @@
#!/usr/bin/python3
from recoll import recoll
# Open the index with recoll.connect() (no arguments) and run a fixed one-word test query.
db = recoll.connect()
q = db.query()
q.execute("orographic")
class HL:
    """Highlighting callbacks, passed to Query.getsnippets() as its methods argument."""

    def startMatch(self, i):
        """Return the markup to insert before a matched term (the index i is not used)."""
        return "<span class='hit'>"

    def endMatch(self):
        """Return the markup to insert after a matched term."""
        return "</span>"
hlmeths = HL()

# For each result document, fetch its snippets with highlighting and print them.
for doc in q:
    print("DOC %s" % doc.title)
    snippets = q.getsnippets(doc, maxoccs=-1, ctxwords=10, methods=hlmeths, sortbypage=False)
    print("Got %d snippets" % len(snippets))
    for snip in snippets:
        # Each snippet is a (page number, term, text) triple.
        try:
            print("Page %d term [%s] snippet [%s]" % (snip[0], snip[1], snip[2]))
        except Exception as ex:
            # Keep going if one snippet cannot be printed.
            print("Print failed: %s" % ex)

View File

@ -297,15 +297,14 @@ bool Query::makeDocAbstract(const Doc &doc, vector<string>& abstract)
vector<Snippet> vpabs;
if (!makeDocAbstract(doc, vpabs))
return false;
for (vector<Snippet>::const_iterator it = vpabs.begin();
it != vpabs.end(); it++) {
for (const auto& snippet : vpabs) {
string chunk;
if (it->page > 0) {
if (snippet.page > 0) {
ostringstream ss;
ss << it->page;
ss << snippet.page;
chunk += string(" [p ") + ss.str() + "] ";
}
chunk += it->snippet;
chunk += snippet.snippet;
abstract.push_back(chunk);
}
return true;
@ -316,9 +315,8 @@ bool Query::makeDocAbstract(const Doc &doc, string& abstract)
vector<Snippet> vpabs;
if (!makeDocAbstract(doc, vpabs))
return false;
for (vector<Snippet>::const_iterator it = vpabs.begin();
it != vpabs.end(); it++) {
abstract.append(it->snippet);
for (const auto& snippet : vpabs) {
abstract.append(snippet.snippet);
abstract.append(cstr_ellipsis);
}
return m_reason.empty() ? true : false;