Python module: add method to retrieve the full snippets list
This commit is contained in:
parent
9b24501f57
commit
f57530e2a6
@ -7307,6 +7307,26 @@ for doc in results:
|
||||
also perform highlighting. See the highlight
|
||||
method.</p>
|
||||
</dd>
|
||||
<dt><span class="term">Query.getsnippets(doc,
|
||||
maxoccs = -1, ctxwords = -1, sortbypage=False,
|
||||
methods = object)</span></dt>
|
||||
<dd>
|
||||
<p>Will return a list of extracts from the
|
||||
result document by selecting text around the
|
||||
match terms. Each entry in the result list is a
|
||||
triple: page number, term, text. By default,
|
||||
the most relevant snippets appear first in the
|
||||
list. Set <code class=
|
||||
"literal">sortbypage</code> to sort by page
|
||||
number instead. If <code class=
|
||||
"literal">methods</code> is set, the fragments
|
||||
will be highlighted (see the highlight method).
|
||||
If <code class="literal">maxoccs</code> is set,
|
||||
it defines the maximum result list length.
|
||||
<code class="literal">ctxwords</code> allows
|
||||
adjusting the individual snippet context
|
||||
size.</p>
|
||||
</dd>
|
||||
<dt><span class="term">Query.__iter__() and
|
||||
Query.next()</span></dt>
|
||||
<dd>
|
||||
|
||||
@ -5601,6 +5601,22 @@ for doc in results:
|
||||
</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>Query.getsnippets(doc, maxoccs = -1, ctxwords = -1,
|
||||
sortbypage=False, methods = object)</term>
|
||||
<listitem><para>Will return a list of extracts from the result
|
||||
document by selecting text around the match terms. Each
|
||||
entry in the result list is a triple: page number, term,
|
||||
text. By default, the most relevant snippets appear first
|
||||
in the list. Set <literal>sortbypage</literal> to sort by
|
||||
page number instead. If <literal>methods</literal> is set,
|
||||
the fragments will be highlighted (see the highlight
|
||||
method). If <literal>maxoccs</literal> is set, it defines
|
||||
the maximum result list
|
||||
length. <literal>ctxwords</literal> allows adjusting the
|
||||
individual snippet context size. </para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>Query.__iter__() and Query.next()</term>
|
||||
<listitem><para>So that things like
|
||||
|
||||
@ -1210,16 +1210,13 @@ PyDoc_STRVAR(doc_Query_highlight,
|
||||
|
||||
class PyPlainToRich: public PlainToRich {
|
||||
public:
|
||||
PyPlainToRich() {}
|
||||
PyPlainToRich(PyObject *methods, bool eolbr = false)
|
||||
: m_methods(methods)
|
||||
{
|
||||
: m_methods(methods) {
|
||||
m_eolbr = eolbr;
|
||||
}
|
||||
virtual ~PyPlainToRich()
|
||||
{
|
||||
}
|
||||
virtual string startMatch(unsigned int idx)
|
||||
{
|
||||
virtual ~PyPlainToRich() {}
|
||||
virtual string startMatch(unsigned int idx) {
|
||||
PyObject *res = 0;
|
||||
if (m_methods)
|
||||
res = PyObject_CallMethod(m_methods, (char *)"startMatch",
|
||||
@ -1232,8 +1229,7 @@ public:
|
||||
return PyBytes_AsString(res1);
|
||||
}
|
||||
|
||||
virtual string endMatch()
|
||||
{
|
||||
virtual string endMatch() {
|
||||
PyObject *res = 0;
|
||||
if (m_methods)
|
||||
res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
|
||||
@ -1245,7 +1241,7 @@ public:
|
||||
return PyBytes_AsString(res1);
|
||||
}
|
||||
|
||||
PyObject *m_methods;
|
||||
PyObject *m_methods{nullptr};
|
||||
};
|
||||
|
||||
static PyObject *
|
||||
@ -1373,6 +1369,78 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
|
||||
"UTF-8", "replace");
|
||||
}
|
||||
|
||||
// Python-visible docstring for Query.getsnippets().
PyDoc_STRVAR(doc_Query_getsnippets,
"getsnippets(doc, maxoccs = -1, ctxwords = -1, sortbypage=False, methods = object)\n"
"Will return a list of snippets for doc by selecting text around the match terms.\n"
"Each entry in the list is a (page number, term, text) triple.\n"
"If methods is set, will also perform highlighting. See the highlight method\n"
);
|
||||
|
||||
/*
 * Python binding for Query.getsnippets().
 *
 * Builds the snippet list for `doc` through Rcl::Query::makeDocAbstract() and
 * returns it as a Python list of (page, term, text) tuples. When a `methods`
 * object is supplied, each snippet text is first run through PyPlainToRich so
 * that the match terms get wrapped by the object's startMatch()/endMatch()
 * output.
 *
 * Returns a new reference to the list, or NULL with a Python exception set on
 * argument, state or allocation errors.
 */
static PyObject *
Query_getsnippets(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB0("Query_getSnippets\n");
    // NOTE(review): the positional order accepted here (doc, methods, maxoccs,
    // ctxwords, sortbypage) differs from the documented signature
    // (doc, maxoccs, ctxwords, sortbypage, methods). It is left unchanged so
    // that existing positional callers keep working; keyword calls are
    // unaffected.
    static const char *kwlist[] = {"doc", "methods", "maxoccs", "ctxwords", "sortbypage", NULL};
    recoll_DocObject *pydoc = 0;
    PyObject *hlmethods = 0;
    int maxoccs = -1;
    int ctxwords = -1;
    PyObject *osortbp = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OiiO:Query_getSnippets",
                                     (char **)kwlist,
                                     &recoll_DocType, &pydoc,
                                     &hlmethods,
                                     &maxoccs,
                                     &ctxwords,
                                     &osortbp)) {
        return 0;
    }

    // PyObject_IsTrue() returns -1 with an exception set when the truth test
    // itself fails: propagate that instead of treating it as "true".
    bool sortbypage = false;
    if (osortbp) {
        const int truth = PyObject_IsTrue(osortbp);
        if (truth < 0) {
            return 0;
        }
        sortbypage = (truth != 0);
    }

    // An explicit methods=None means "no highlighting", same as omitting it.
    if (hlmethods == Py_None) {
        hlmethods = 0;
    }

    if (pydoc->doc == 0) {
        LOGERR("Query_getsnippets: doc not found " << pydoc->doc << "\n");
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }
    if (self->query == 0) {
        LOGERR("Query_getsnippets: query not found " << self->query << "\n");
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }
    std::shared_ptr<Rcl::SearchData> sd = self->query->getSD();
    if (!sd) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    std::vector<Rcl::Snippet> snippets;
    self->query->makeDocAbstract(*(pydoc->doc), snippets, maxoccs, ctxwords, sortbypage);

    PyObject *sniplist = PyList_New(static_cast<Py_ssize_t>(snippets.size()));
    if (!sniplist) {
        return 0;
    }

    // The highlighter is only configured when the caller asked for it.
    HighlightData hldata;
    PyPlainToRich hler;
    if (hlmethods) {
        sd->getTerms(hldata);
        hler = PyPlainToRich(hlmethods);
        hler.set_inputhtml(0);
    }

    Py_ssize_t idx = 0;
    for (const auto& snip : snippets) {
        const std::string *textp = &snip.snippet;
        std::list<std::string> lr;
        if (hlmethods) {
            hler.plaintorich(snip.snippet, lr, hldata);
            // Fall back to the plain snippet if the highlighter produced no output.
            if (!lr.empty()) {
                textp = &lr.front();
            }
        }

        PyObject *term =
            PyUnicode_Decode(snip.term.c_str(), snip.term.size(), "UTF-8", "replace");
        PyObject *text = term ?
            PyUnicode_Decode(textp->c_str(), textp->size(), "UTF-8", "replace") : 0;
        // The "O" format takes its own references, so ours are dropped right
        // after building the tuple (the original code leaked them).
        PyObject *entry = (term && text) ?
            Py_BuildValue("(iOO)", snip.page, term, text) : 0;
        Py_XDECREF(term);
        Py_XDECREF(text);
        if (!entry) {
            Py_DECREF(sniplist);
            return 0;
        }
        // PyList_SetItem() steals the reference to entry.
        PyList_SetItem(sniplist, idx++, entry);
    }
    return sniplist;
}
|
||||
|
||||
PyDoc_STRVAR(doc_Query_getxquery,
|
||||
"getxquery(None) -> Unicode string\n"
|
||||
"\n"
|
||||
@ -1483,6 +1551,8 @@ static PyMethodDef Query_methods[] = {
|
||||
doc_Query_getgroups},
|
||||
{"makedocabstract", (PyCFunction)Query_makedocabstract,
|
||||
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
|
||||
{"getsnippets", (PyCFunction)Query_getsnippets,
|
||||
METH_VARARGS|METH_KEYWORDS, doc_Query_getsnippets},
|
||||
{"scroll", (PyCFunction)Query_scroll,
|
||||
METH_VARARGS|METH_KEYWORDS, doc_Query_scroll},
|
||||
{NULL} /* Sentinel */
|
||||
|
||||
25
src/python/samples/snippets.py
Normal file
25
src/python/samples/snippets.py
Normal file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
from recoll import recoll
|
||||
|
||||
db = recoll.connect()
|
||||
q = db.query()
|
||||
q.execute("orographic")
|
||||
|
||||
class HL:
    """Highlighting callbacks passed to Query.getsnippets() as `methods`."""

    def startMatch(self, i):
        """Return the markup emitted before a matched term (`i` is ignored)."""
        return "<span class='hit'>"

    def endMatch(self):
        """Return the markup emitted after a matched term."""
        return "</span>"
|
||||
|
||||
# Highlighter instance handed to getsnippets() through the `methods` keyword.
hlmeths = HL()

# For every result document, print its title and then each snippet triple.
for doc in q:
    print("DOC %s" % doc.title)
    found = q.getsnippets(doc, maxoccs=-1, ctxwords=10, methods=hlmeths, sortbypage=False)
    print("Got %d snippets" % len(found))
    for entry in found:
        # Printing is best-effort: report a failure and carry on with the next snippet.
        try:
            page, term, text = entry[0], entry[1], entry[2]
            print("Page %d term [%s] snippet [%s]" % (page, term, text))
        except Exception as ex:
            print("Print failed: %s" % ex)
|
||||
@ -297,15 +297,14 @@ bool Query::makeDocAbstract(const Doc &doc, vector<string>& abstract)
|
||||
vector<Snippet> vpabs;
|
||||
if (!makeDocAbstract(doc, vpabs))
|
||||
return false;
|
||||
for (vector<Snippet>::const_iterator it = vpabs.begin();
|
||||
it != vpabs.end(); it++) {
|
||||
for (const auto& snippet : vpabs) {
|
||||
string chunk;
|
||||
if (it->page > 0) {
|
||||
if (snippet.page > 0) {
|
||||
ostringstream ss;
|
||||
ss << it->page;
|
||||
ss << snippet.page;
|
||||
chunk += string(" [p ") + ss.str() + "] ";
|
||||
}
|
||||
chunk += it->snippet;
|
||||
chunk += snippet.snippet;
|
||||
abstract.push_back(chunk);
|
||||
}
|
||||
return true;
|
||||
@ -316,9 +315,8 @@ bool Query::makeDocAbstract(const Doc &doc, string& abstract)
|
||||
vector<Snippet> vpabs;
|
||||
if (!makeDocAbstract(doc, vpabs))
|
||||
return false;
|
||||
for (vector<Snippet>::const_iterator it = vpabs.begin();
|
||||
it != vpabs.end(); it++) {
|
||||
abstract.append(it->snippet);
|
||||
for (const auto& snippet : vpabs) {
|
||||
abstract.append(snippet.snippet);
|
||||
abstract.append(cstr_ellipsis);
|
||||
}
|
||||
return m_reason.empty() ? true : false;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user