Python module: add method to retrieve the full snippets list
This commit is contained in:
parent
9b24501f57
commit
f57530e2a6
@ -7307,6 +7307,26 @@ for doc in results:
|
|||||||
also perform highlighting. See the highlight
|
also perform highlighting. See the highlight
|
||||||
method.</p>
|
method.</p>
|
||||||
</dd>
|
</dd>
|
||||||
|
<dt><span class="term">Query.getsnippets(doc,
|
||||||
|
maxoccs = -1, ctxwords = -1, sortbypage=False,
|
||||||
|
methods = object)</span></dt>
|
||||||
|
<dd>
|
||||||
|
<p>Will return a list of extracts from the
|
||||||
|
result document by selecting text around the
|
||||||
|
match terms. Each entry in the result list is a
|
||||||
|
triple: page number, term, text. By default,
|
||||||
|
the most relevant snippets appear first in the
|
||||||
|
list. Set <code class=
|
||||||
|
"literal">sortbypage</code> to sort by page
|
||||||
|
number instead. If <code class=
|
||||||
|
"literal">methods</code> is set, the fragments
|
||||||
|
will be highlighted (see the highlight method).
|
||||||
|
If <code class="literal">maxoccs</code> is set,
|
||||||
|
it defines the maximum result list length.
|
||||||
|
<code class="literal">ctxwords</code> allows
|
||||||
|
adjusting the individual snippet context
|
||||||
|
size.</p>
|
||||||
|
</dd>
|
||||||
<dt><span class="term">Query.__iter__() and
|
<dt><span class="term">Query.__iter__() and
|
||||||
Query.next()</span></dt>
|
Query.next()</span></dt>
|
||||||
<dd>
|
<dd>
|
||||||
|
|||||||
@ -5601,6 +5601,22 @@ for doc in results:
|
|||||||
</para></listitem>
|
</para></listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term>Query.getsnippets(doc, maxoccs = -1, ctxwords = -1,
|
||||||
|
sortbypage=False, methods = object)</term>
|
||||||
|
<listitem><para>Will return a list of extracts from the result
|
||||||
|
document by selecting text around the match terms. Each
|
||||||
|
entry in the result list is a triple: page number, term,
|
||||||
|
text. By default, the most relevant snippets appear first
|
||||||
|
in the list. Set <literal>sortbypage</literal> to sort by
|
||||||
|
page number instead. If <literal>methods</literal> is set,
|
||||||
|
the fragments will be highlighted (see the highlight
|
||||||
|
method). If <literal>maxoccs</literal> is set, it defines
|
||||||
|
the maximum result list
|
||||||
|
length. <literal>ctxwords</literal> allows adjusting the
|
||||||
|
individual snippet context size. </para></listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<term>Query.__iter__() and Query.next()</term>
|
<term>Query.__iter__() and Query.next()</term>
|
||||||
<listitem><para>So that things like
|
<listitem><para>So that things like
|
||||||
|
|||||||
@ -1210,16 +1210,13 @@ PyDoc_STRVAR(doc_Query_highlight,
|
|||||||
|
|
||||||
class PyPlainToRich: public PlainToRich {
|
class PyPlainToRich: public PlainToRich {
|
||||||
public:
|
public:
|
||||||
|
PyPlainToRich() {}
|
||||||
PyPlainToRich(PyObject *methods, bool eolbr = false)
|
PyPlainToRich(PyObject *methods, bool eolbr = false)
|
||||||
: m_methods(methods)
|
: m_methods(methods) {
|
||||||
{
|
|
||||||
m_eolbr = eolbr;
|
m_eolbr = eolbr;
|
||||||
}
|
}
|
||||||
virtual ~PyPlainToRich()
|
virtual ~PyPlainToRich() {}
|
||||||
{
|
virtual string startMatch(unsigned int idx) {
|
||||||
}
|
|
||||||
virtual string startMatch(unsigned int idx)
|
|
||||||
{
|
|
||||||
PyObject *res = 0;
|
PyObject *res = 0;
|
||||||
if (m_methods)
|
if (m_methods)
|
||||||
res = PyObject_CallMethod(m_methods, (char *)"startMatch",
|
res = PyObject_CallMethod(m_methods, (char *)"startMatch",
|
||||||
@ -1232,8 +1229,7 @@ public:
|
|||||||
return PyBytes_AsString(res1);
|
return PyBytes_AsString(res1);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual string endMatch()
|
virtual string endMatch() {
|
||||||
{
|
|
||||||
PyObject *res = 0;
|
PyObject *res = 0;
|
||||||
if (m_methods)
|
if (m_methods)
|
||||||
res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
|
res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
|
||||||
@ -1245,7 +1241,7 @@ public:
|
|||||||
return PyBytes_AsString(res1);
|
return PyBytes_AsString(res1);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *m_methods;
|
PyObject *m_methods{nullptr};
|
||||||
};
|
};
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
@ -1373,6 +1369,78 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
|
|||||||
"UTF-8", "replace");
|
"UTF-8", "replace");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Python-visible docstring for Query.getsnippets().
// NOTE(review): the keyword list parsed by Query_getsnippets orders the
// optional arguments as methods, maxoccs, ctxwords, sortbypage, which is not
// the order shown in this signature line; callers should pass the optional
// arguments by keyword.
PyDoc_STRVAR(doc_Query_getsnippets,
"getsnippets(doc, maxoccs = -1, ctxwords = -1, sortbypage=False, methods = object)\n"
"Will return a list of snippets for doc by selecting text around the match terms\n"
"If methods is set, will also perform highlighting. See the highlight method\n"
);
|
||||||
|
|
||||||
|
// Python binding for Query.getsnippets(doc, methods, maxoccs, ctxwords,
// sortbypage).
//
// Builds the snippets for `doc` through Rcl::Query::makeDocAbstract() and
// returns them as a new Python list of (page, term, text) tuples. When
// `methods` is supplied, each snippet text is first run through PyPlainToRich
// so that the match terms are wrapped by the caller's startMatch()/endMatch()
// callbacks.
//
// Returns a new reference to the list, or NULL with a Python exception set
// when argument parsing fails, the doc/query objects are not initialized, or
// an allocation/decoding step fails.
static PyObject *
Query_getsnippets(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB0("Query_getSnippets\n");
    static const char *kwlist[] = {"doc", "methods", "maxoccs", "ctxwords", "sortbypage", NULL};
    recoll_DocObject *pydoc = 0;
    PyObject *hlmethods = 0;
    int maxoccs = -1;
    int ctxwords = -1;
    PyObject *osortbp = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|OiiO:Query_getSnippets",
                                     (char **)kwlist,
                                     &recoll_DocType, &pydoc,
                                     &hlmethods,
                                     &maxoccs,
                                     &ctxwords,
                                     &osortbp)) {
        return 0;
    }

    bool sortbypage = false;
    if (osortbp) {
        // PyObject_IsTrue() returns -1 on error: propagate the exception
        // instead of silently treating the failure as "true".
        int truth = PyObject_IsTrue(osortbp);
        if (truth < 0) {
            return 0;
        }
        sortbypage = (truth != 0);
    }

    if (pydoc->doc == 0) {
        LOGERR("Query_getsnippets: doc not found " << pydoc->doc << "\n");
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }
    if (self->query == 0) {
        LOGERR("Query_getsnippets: query not found " << self->query << "\n");
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }
    std::shared_ptr<Rcl::SearchData> sd = self->query->getSD();
    if (!sd) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    std::vector<Rcl::Snippet> snippets;
    self->query->makeDocAbstract(*(pydoc->doc), snippets, maxoccs, ctxwords, sortbypage);

    PyObject *sniplist = PyList_New(static_cast<Py_ssize_t>(snippets.size()));
    if (!sniplist) {
        return 0;
    }

    // The highlighter is only used when the caller supplied callbacks.
    HighlightData hldata;
    PyPlainToRich hler(hlmethods);
    if (hlmethods) {
        sd->getTerms(hldata);
        hler.set_inputhtml(0);
    }

    Py_ssize_t idx = 0;
    for (const auto& snip : snippets) {
        const std::string *textp = &snip.snippet;
        std::list<std::string> lr;
        if (hlmethods) {
            hler.plaintorich(snip.snippet, lr, hldata);
            // Fall back to the raw snippet if the highlighter produced no
            // output: front() on an empty list is undefined behavior.
            if (!lr.empty()) {
                textp = &lr.front();
            }
        }

        PyObject *term = PyUnicode_Decode(snip.term.c_str(), snip.term.size(),
                                          "UTF-8", "replace");
        PyObject *text = PyUnicode_Decode(textp->c_str(), textp->size(),
                                          "UTF-8", "replace");
        PyObject *entry = 0;
        if (term && text) {
            // "O" makes the tuple take its own references, so ours are
            // released just below whatever the outcome.
            entry = Py_BuildValue("(iOO)", snip.page, term, text);
        }
        Py_XDECREF(term);
        Py_XDECREF(text);
        if (!entry) {
            Py_DECREF(sniplist);
            return 0;
        }
        // PyList_SetItem() steals the reference to entry.
        PyList_SetItem(sniplist, idx++, entry);
    }
    return sniplist;
}
|
||||||
|
|
||||||
PyDoc_STRVAR(doc_Query_getxquery,
|
PyDoc_STRVAR(doc_Query_getxquery,
|
||||||
"getxquery(None) -> Unicode string\n"
|
"getxquery(None) -> Unicode string\n"
|
||||||
"\n"
|
"\n"
|
||||||
@ -1483,6 +1551,8 @@ static PyMethodDef Query_methods[] = {
|
|||||||
doc_Query_getgroups},
|
doc_Query_getgroups},
|
||||||
{"makedocabstract", (PyCFunction)Query_makedocabstract,
|
{"makedocabstract", (PyCFunction)Query_makedocabstract,
|
||||||
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
|
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
|
||||||
|
{"getsnippets", (PyCFunction)Query_getsnippets,
|
||||||
|
METH_VARARGS|METH_KEYWORDS, doc_Query_getsnippets},
|
||||||
{"scroll", (PyCFunction)Query_scroll,
|
{"scroll", (PyCFunction)Query_scroll,
|
||||||
METH_VARARGS|METH_KEYWORDS, doc_Query_scroll},
|
METH_VARARGS|METH_KEYWORDS, doc_Query_scroll},
|
||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
|
|||||||
25
src/python/samples/snippets.py
Normal file
25
src/python/samples/snippets.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/python3
"""Sample script: run a query and print the snippets found for each result.

Shows Query.getsnippets() used together with a highlighting helper object.
"""

from recoll import recoll

# Connect to the index and run a fixed sample query.
db = recoll.connect()
q = db.query()
q.execute("orographic")


class HL:
    """Highlighting callbacks handed to getsnippets() through `methods`."""

    def startMatch(self, i):
        # NOTE(review): `i` is presumably the index of the matched term
        # group; it is not used by this sample.
        return "<span class='hit'>"

    def endMatch(self):
        return "</span>"


hlmeths = HL()

for doc in q:
    print("DOC %s" % doc.title)
    snippets = q.getsnippets(doc, maxoccs=-1, ctxwords=10, methods=hlmeths, sortbypage=False)
    print("Got %d snippets" % len(snippets))
    for snip in snippets:
        # Each entry is a (page number, term, text) triple.
        try:
            print("Page %d term [%s] snippet [%s]" % (snip[0], snip[1], snip[2]))
        except Exception as ex:
            print("Print failed: %s" % ex)
|
||||||
@ -297,15 +297,14 @@ bool Query::makeDocAbstract(const Doc &doc, vector<string>& abstract)
|
|||||||
vector<Snippet> vpabs;
|
vector<Snippet> vpabs;
|
||||||
if (!makeDocAbstract(doc, vpabs))
|
if (!makeDocAbstract(doc, vpabs))
|
||||||
return false;
|
return false;
|
||||||
for (vector<Snippet>::const_iterator it = vpabs.begin();
|
for (const auto& snippet : vpabs) {
|
||||||
it != vpabs.end(); it++) {
|
|
||||||
string chunk;
|
string chunk;
|
||||||
if (it->page > 0) {
|
if (snippet.page > 0) {
|
||||||
ostringstream ss;
|
ostringstream ss;
|
||||||
ss << it->page;
|
ss << snippet.page;
|
||||||
chunk += string(" [p ") + ss.str() + "] ";
|
chunk += string(" [p ") + ss.str() + "] ";
|
||||||
}
|
}
|
||||||
chunk += it->snippet;
|
chunk += snippet.snippet;
|
||||||
abstract.push_back(chunk);
|
abstract.push_back(chunk);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -316,9 +315,8 @@ bool Query::makeDocAbstract(const Doc &doc, string& abstract)
|
|||||||
vector<Snippet> vpabs;
|
vector<Snippet> vpabs;
|
||||||
if (!makeDocAbstract(doc, vpabs))
|
if (!makeDocAbstract(doc, vpabs))
|
||||||
return false;
|
return false;
|
||||||
for (vector<Snippet>::const_iterator it = vpabs.begin();
|
for (const auto& snippet : vpabs) {
|
||||||
it != vpabs.end(); it++) {
|
abstract.append(snippet.snippet);
|
||||||
abstract.append(it->snippet);
|
|
||||||
abstract.append(cstr_ellipsis);
|
abstract.append(cstr_ellipsis);
|
||||||
}
|
}
|
||||||
return m_reason.empty() ? true : false;
|
return m_reason.empty() ? true : false;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user