temporary checkpoint for python module modifs
This commit is contained in:
parent
a9599be5f9
commit
3be5e982b7
223
src/python/recoll/pyrclextract.cpp
Normal file
223
src/python/recoll/pyrclextract.cpp
Normal file
@ -0,0 +1,223 @@
|
||||
/* Copyright (C) 2007 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#include <Python.h>
|
||||
#include <structmember.h>
|
||||
#include <bytearrayobject.h>
|
||||
|
||||
#include <strings.h>
|
||||
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "rcldoc.h"
|
||||
#include "internfile.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
#include "pyrecoll.h"
|
||||
|
||||
static PyObject *recoll_DocType;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
/// Extractor object code
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
/* Type-specific fields go here. */
|
||||
FileInterner *xtr;
|
||||
TempDir *tmpdir;
|
||||
RclConfig *rclconfig;
|
||||
} rclx_ExtractorObject;
|
||||
|
||||
static void
|
||||
Extractor_dealloc(rclx_ExtractorObject *self)
|
||||
{
|
||||
LOGDEB(("Extractor_dealloc\n"));
|
||||
delete self->xtr;
|
||||
delete self->tmpdir;
|
||||
self->ob_type->tp_free((PyObject*)self);
|
||||
}
|
||||
|
||||
/* tp_new for Extractor: allocate the object through the type's tp_alloc
 * and clear all its pointers. Returns 0 if the allocation failed.
 * args and kwds are not used here. */
static PyObject *
Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    LOGDEB(("Extractor_new\n"));

    PyObject *obj = type->tp_alloc(type, 0);
    if (obj != 0) {
        rclx_ExtractorObject *xobj = (rclx_ExtractorObject *)obj;
        xobj->xtr = 0;
        xobj->tmpdir = 0;
        xobj->rclconfig = 0;
    }
    return obj;
}
|
||||
|
||||
static int
|
||||
Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
|
||||
{
|
||||
LOGDEB(("Extractor_init\n"));
|
||||
static const char* kwlist[] = {"doc", NULL};
|
||||
PyObject *pdobj;
|
||||
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!", (char**)kwlist,
|
||||
recoll_DocType, &pdobj))
|
||||
return -1;
|
||||
recoll_DocObject *dobj = (recoll_DocObject *)pdobj;
|
||||
self->tmpdir = new TempDir;
|
||||
if (dobj->doc == 0) {
|
||||
PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
|
||||
return -1;
|
||||
}
|
||||
self->rclconfig = dobj->rclconfig;
|
||||
self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
|
||||
FileInterner::FIF_forPreview);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Extractor.extract(ipath)
 *
 * Creates a new Doc object, runs the interner for the given ipath with
 * that Doc as output, and returns it. If the interner produced HTML, it
 * replaces the doc text and the MIME type is set to text/html.
 *
 * Returns a new reference, or 0 with a Python exception set. */
static PyObject *
Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB(("Extractor_extract\n"));
    static const char* kwlist[] = {"ipath", NULL};
    char *sipath = 0;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract",
                                     (char**)kwlist,
                                     "utf-8", &sipath))
        return 0;

    // The "es" buffer is ours to free: copy it and release it at once.
    string ipath(sipath);
    PyMem_Free(sipath);

    if (self->xtr == 0) {
        PyErr_SetString(PyExc_AttributeError, "extract: null object");
        return 0;
    }

    /* Call the doc class object to create a new doc. */
    recoll_DocObject *result =
        (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
    if (!result) {
        LOGERR(("Extractor_extract: couldn't create doc object for result\n"));
        return 0;
    }

    FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
    if (status != FileInterner::FIDone) {
        PyErr_SetString(PyExc_AttributeError, "internfile failure");
        // We own the only reference to the new doc: drop it on failure.
        Py_DECREF((PyObject *)result);
        return 0;
    }

    string html = self->xtr->get_html();
    if (!html.empty()) {
        result->doc->text = html;
        result->doc->mimetype = "text/html";
    }

    // Copy the dedicated Doc fields into the meta array so that they can
    // be looked up there too. Is this actually needed ? Useful for
    // url which is also formatted .
    Rcl::Doc *doc = result->doc;
    printableUrl(self->rclconfig->getDefCharset(), doc->url,
                 doc->meta[Rcl::Doc::keyurl]);
    doc->meta[Rcl::Doc::keytp] = doc->mimetype;
    doc->meta[Rcl::Doc::keyipt] = doc->ipath;
    doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
    doc->meta[Rcl::Doc::keyds] = doc->dbytes;
    return (PyObject *)result;
}
|
||||
|
||||
PyDoc_STRVAR(doc_extract,
|
||||
"extract(ipath)\n"
|
||||
"Extract document defined by ipath and return a doc object.\n"
|
||||
);
|
||||
|
||||
static PyMethodDef Extractor_methods[] = {
|
||||
{"extract", (PyCFunction)Extractor_extract, METH_VARARGS|METH_KEYWORDS,
|
||||
doc_extract},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
/* Docstring for the Extractor type. The constructor takes a Doc object
 * (see Extractor_init) and the type has one method, extract(ipath). */
PyDoc_STRVAR(doc_ExtractorObject,
"Extractor(doc)\n"
"\n"
"An Extractor object gives access to the Recoll text extraction features\n"
"for the document described by doc. Use extract(ipath) to retrieve a\n"
"document as a new doc object.\n"
);
|
||||
/* Type object for rclextract.Extractor. Only the name, size, deallocator,
 * flags, docstring, method table, init and new slots are filled in; every
 * other slot is left at 0. */
static PyTypeObject rclx_ExtractorType = {
    PyObject_HEAD_INIT(NULL)
    0,                                          /* ob_size */
    "rclextract.Extractor",                     /* tp_name */
    sizeof(rclx_ExtractorObject),               /* tp_basicsize */
    0,                                          /* tp_itemsize */
    (destructor)Extractor_dealloc,              /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_compare */
    0,                                          /* tp_repr */
    0,                                          /* tp_as_number */
    0,                                          /* tp_as_sequence */
    0,                                          /* tp_as_mapping */
    0,                                          /* tp_hash */
    0,                                          /* tp_call */
    0,                                          /* tp_str */
    0,                                          /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,   /* tp_flags */
    doc_ExtractorObject,                        /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    0,                                          /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    0,                                          /* tp_iter */
    0,                                          /* tp_iternext */
    Extractor_methods,                          /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    0,                                          /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    (initproc)Extractor_init,                   /* tp_init */
    0,                                          /* tp_alloc */
    Extractor_new,                              /* tp_new */
};
|
||||
|
||||
///////////////////////////////////// Module-level stuff
|
||||
static PyMethodDef rclxMethods[] = {
|
||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||
};
|
||||
PyDoc_STRVAR(rclx_doc_string,
|
||||
"This is an interface to the Recoll text extraction features.");
|
||||
|
||||
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
|
||||
#define PyMODINIT_FUNC void
|
||||
#endif
|
||||
/* Module initialization for rclextract (Python 2 style init function).
 *
 * Creates the module, sets its docstring, registers the Extractor type,
 * then imports the Doc type from the "recoll.doctype" capsule. On any
 * failure we return early and leave the Python exception set for the
 * import machinery to report. */
PyMODINIT_FUNC
initrclextract(void)
{
    PyObject* m = Py_InitModule("rclextract", rclxMethods);
    if (m == 0)
        return;
    PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);

    if (PyType_Ready(&rclx_ExtractorType) < 0)
        return;
    Py_INCREF(&rclx_ExtractorType);
    // PyModule_AddObject only steals the reference when it succeeds
    if (PyModule_AddObject(m, "Extractor",
                           (PyObject *)&rclx_ExtractorType) < 0) {
        Py_DECREF(&rclx_ExtractorType);
        return;
    }

    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctype", 0);
    if (recoll_DocType == 0) {
        // PyCapsule_Import has set an exception. Extractor_init and
        // Extractor_extract both need this pointer.
        LOGERR(("initrclextract: can't import recoll.doctype\n"));
        return;
    }
}
|
||||
@ -15,7 +15,6 @@
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#include <Python.h>
|
||||
#include <structmember.h>
|
||||
#include <bytearrayobject.h>
|
||||
@ -37,6 +36,10 @@ using namespace std;
|
||||
#include "wasatorcl.h"
|
||||
#include "debuglog.h"
|
||||
#include "pathut.h"
|
||||
#include "plaintorich.h"
|
||||
#include "hldata.h"
|
||||
|
||||
#include "pyrecoll.h"
|
||||
|
||||
static set<Rcl::Db *> the_dbs;
|
||||
static set<Rcl::Query *> the_queries;
|
||||
@ -45,7 +48,7 @@ static set<Rcl::Doc *> the_docs;
|
||||
static RclConfig *rclconfig;
|
||||
|
||||
// This has to exist somewhere in the python api ??
|
||||
PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
|
||||
static PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
|
||||
{
|
||||
PyObject *result = tp->tp_new(tp, args, kwargs);
|
||||
if (result && tp->tp_init(result, args, kwargs) < 0)
|
||||
@ -252,11 +255,6 @@ SearchData_addclause(recoll_SearchDataObject* self, PyObject *args,
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
///// DOC Doc code
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
/* Type-specific fields go here. */
|
||||
Rcl::Doc *doc;
|
||||
} recoll_DocObject;
|
||||
|
||||
static void
|
||||
Doc_dealloc(recoll_DocObject *self)
|
||||
@ -292,11 +290,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
|
||||
self->doc = new Rcl::Doc;
|
||||
if (self->doc == 0)
|
||||
return -1;
|
||||
self->rclconfig = rclconfig;
|
||||
the_docs.insert(self->doc);
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(doc_getbinurl,
|
||||
PyDoc_STRVAR(doc_Doc_getbinurl,
|
||||
"getbinurl(none) -> binary url\n"
|
||||
"\n"
|
||||
"Returns an URL with a path part which is a as bit for bit copy of the \n"
|
||||
@ -316,7 +315,7 @@ Doc_getbinurl(recoll_DocObject *self)
|
||||
self->doc->url.size());
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(doc_setbinurl,
|
||||
PyDoc_STRVAR(doc_Doc_setbinurl,
|
||||
"setbinurl(url) -> binary url\n"
|
||||
"\n"
|
||||
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
|
||||
@ -340,12 +339,94 @@ Doc_setbinurl(recoll_DocObject *self, PyObject *value)
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(doc_Doc_keys,
|
||||
"keys() -> list of doc object keys (attribute names)\n"
|
||||
);
|
||||
static PyObject *
|
||||
Doc_keys(recoll_DocObject *self)
|
||||
{
|
||||
LOGDEB(("Doc_keys\n"));
|
||||
if (self->doc == 0 ||
|
||||
the_docs.find(self->doc) == the_docs.end()) {
|
||||
PyErr_SetString(PyExc_AttributeError, "doc");
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyObject *pkeys = PyList_New(0);
|
||||
for (map<string,string>::const_iterator it = self->doc->meta.begin();
|
||||
it != self->doc->meta.end(); it++) {
|
||||
PyList_Append(pkeys, PyUnicode_Decode(it->first.c_str(),
|
||||
it->first.size(),
|
||||
"UTF-8", "replace"));
|
||||
}
|
||||
return pkeys;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(doc_Doc_items,
|
||||
"items() -> dictionary of doc object keys/values\n"
|
||||
);
|
||||
static PyObject *
|
||||
Doc_items(recoll_DocObject *self)
|
||||
{
|
||||
LOGDEB(("Doc_getbinurl\n"));
|
||||
if (self->doc == 0 ||
|
||||
the_docs.find(self->doc) == the_docs.end()) {
|
||||
PyErr_SetString(PyExc_AttributeError, "doc");
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyObject *pdict = PyDict_New();
|
||||
for (map<string,string>::const_iterator it = self->doc->meta.begin();
|
||||
it != self->doc->meta.end(); it++) {
|
||||
PyDict_SetItem(pdict,
|
||||
PyUnicode_Decode(it->first.c_str(),
|
||||
it->first.size(),
|
||||
"UTF-8", "replace"),
|
||||
PyUnicode_Decode(it->second.c_str(),
|
||||
it->second.size(),
|
||||
"UTF-8", "replace"));
|
||||
}
|
||||
return pdict;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(doc_Doc_get,
|
||||
"get(key) -> value\n"
|
||||
"Retrieve the named doc attribute\n"
|
||||
);
|
||||
|
||||
static PyObject *
|
||||
Doc_get(recoll_DocObject *self, PyObject *args)
|
||||
{
|
||||
LOGDEB(("Doc_get\n"));
|
||||
char *sutf8 = 0; // needs freeing
|
||||
if (!PyArg_ParseTuple(args, "es:Doc_get",
|
||||
"utf-8", &sutf8)) {
|
||||
return 0;
|
||||
}
|
||||
string key(sutf8);
|
||||
PyMem_Free(sutf8);
|
||||
|
||||
if (self->doc == 0 ||
|
||||
the_docs.find(self->doc) == the_docs.end()) {
|
||||
PyErr_SetString(PyExc_AttributeError, "doc??");
|
||||
return 0;
|
||||
}
|
||||
string value;
|
||||
if (self->doc->getmeta(key, 0)) {
|
||||
value = self->doc->meta[key];
|
||||
return PyUnicode_Decode(value.c_str(),
|
||||
value.size(),
|
||||
"UTF-8", "replace");
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
static PyMethodDef Doc_methods[] = {
|
||||
{"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS,
|
||||
doc_getbinurl},
|
||||
{"setbinurl", (PyCFunction)Doc_setbinurl, METH_O,
|
||||
doc_setbinurl},
|
||||
{"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS, doc_Doc_getbinurl},
|
||||
{"setbinurl", (PyCFunction)Doc_setbinurl, METH_O, doc_Doc_setbinurl},
|
||||
{"keys", (PyCFunction)Doc_keys, METH_NOARGS, doc_Doc_keys},
|
||||
{"items", (PyCFunction)Doc_items, METH_NOARGS, doc_Doc_items},
|
||||
{"get", (PyCFunction)Doc_get, METH_VARARGS, doc_Doc_get},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
@ -380,8 +461,6 @@ Doc_getattr(recoll_DocObject *self, char *name)
|
||||
case 'f':
|
||||
if (!key.compare(Rcl::Doc::keyfs)) {
|
||||
value = self->doc->fbytes; found = true;
|
||||
} else if (!key.compare(Rcl::Doc::keyfs)) {
|
||||
value = self->doc->fbytes; found = true;
|
||||
} else if (!key.compare(Rcl::Doc::keyfmt)) {
|
||||
value = self->doc->fmtime; found = true;
|
||||
}
|
||||
@ -419,6 +498,11 @@ Doc_getattr(recoll_DocObject *self, char *name)
|
||||
self->doc->dbytes; found = true;
|
||||
}
|
||||
break;
|
||||
case 't':
|
||||
if (!key.compare("text")) {
|
||||
value = self->doc->text; found = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
@ -432,9 +516,16 @@ Doc_getattr(recoll_DocObject *self, char *name)
|
||||
|
||||
if (self->doc->getmeta(key, 0)) {
|
||||
value = self->doc->meta[key];
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
LOGDEB(("Doc_getattr: name [%s] key [%s] Not found\n",
|
||||
name, key.c_str()));
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
LOGDEB(("Doc_getattr: [%s] (%s) -> [%s]\n",
|
||||
name, key.c_str(), value.c_str()));
|
||||
// Return a python unicode object
|
||||
@ -683,10 +774,17 @@ Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
|
||||
{
|
||||
LOGDEB(("Query_sortby\n"));
|
||||
static const char *kwlist[] = {"field", "ascending", NULL};
|
||||
PyObject *ascobj = 0;
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i", (char**)kwlist,
|
||||
&self->sortfield,
|
||||
&self->ascending))
|
||||
&ascobj))
|
||||
return 0;
|
||||
|
||||
if (ascobj != 0 && !PyObject_IsTrue(ascobj))
|
||||
self->ascending = false;
|
||||
else
|
||||
self->ascending = true;
|
||||
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
@ -707,13 +805,15 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
|
||||
char *sutf8 = 0; // needs freeing
|
||||
char *sstemlang = 0;
|
||||
int dostem = 1;
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|ies:Query_execute",
|
||||
PyObject *dostemobj = 0;
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|Oes:Query_execute",
|
||||
(char**)kwlist, "utf-8", &sutf8,
|
||||
&dostem,
|
||||
&dostemobj,
|
||||
"utf-8", &sstemlang)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (dostemobj != 0 && !PyObject_IsTrue(dostemobj))
|
||||
dostem = 0;
|
||||
|
||||
string utf8(sutf8);
|
||||
PyMem_Free(sutf8);
|
||||
@ -828,15 +928,274 @@ Query_fetchone(recoll_QueryObject* self, PyObject *, PyObject *)
|
||||
return (PyObject *)result;
|
||||
}
|
||||
|
||||
|
||||
/* Docstring for Query.highlight(). The parameter list matches the keyword
 * list of Query_highlight: text, ishtml, methods. */
PyDoc_STRVAR(doc_Query_highlight,
"highlight(text, ishtml = 0/1, methods = object)\n"
"Will insert <span class=\"rclmatch\"></span> tags around the match areas\n"
"in the input text and return the modified text\n"
"ishtml can be set to indicate that the input text is html and html special\n"
" characters should not be escaped\n"
"methods if set should be an object with methods startMatch(i) and endMatch()\n"
" which will be called for each match and should return a begin and end tag\n"
);
|
||||
|
||||
class PyPlainToRich: public PlainToRich {
|
||||
public:
|
||||
PyPlainToRich(PyObject *methods)
|
||||
: m_methods(methods)
|
||||
{
|
||||
}
|
||||
virtual ~PyPlainToRich()
|
||||
{
|
||||
}
|
||||
virtual string startMatch(unsigned int idx)
|
||||
{
|
||||
PyObject *res = 0;
|
||||
if (m_methods)
|
||||
res = PyObject_CallMethod(m_methods, (char *)"startMatch",
|
||||
(char *)"(i)", idx);
|
||||
if (res == 0)
|
||||
return "<span class=\"rclmatch\">";
|
||||
PyObject *res1 = res;
|
||||
if (PyUnicode_Check(res))
|
||||
res1 = PyUnicode_AsUTF8String(res);
|
||||
return PyString_AsString(res1);
|
||||
}
|
||||
|
||||
virtual string endMatch()
|
||||
{
|
||||
PyObject *res = 0;
|
||||
if (m_methods)
|
||||
res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
|
||||
if (res == 0)
|
||||
return "</span res is null>";
|
||||
PyObject *res1 = res;
|
||||
if (PyUnicode_Check(res))
|
||||
res1 = PyUnicode_AsUTF8String(res);
|
||||
return PyString_AsString(res1);
|
||||
}
|
||||
|
||||
PyObject *m_methods;
|
||||
};
|
||||
|
||||
/* Query.highlight(text, ishtml=, methods=)
 *
 * Runs plaintorich on the input text with the terms of the current
 * search, using the tags supplied by the optional methods object, and
 * returns the first output chunk as a unicode object.
 *
 * Returns a new reference, or 0 with a Python exception set. */
static PyObject *
Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB1(("Query_highlight\n"));
    static const char *kwlist[] = {"text", "ishtml", "methods", NULL};
    char *sutf8 = 0; // needs freeing
    int ishtml = 0;
    PyObject *ishtmlobj = 0;
    PyObject *methods = 0;
    // Both optional arguments use the "O" unit: each one must be stored
    // into a PyObject pointer, never into an int.
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OO:Query_highlight",
                                     (char**)kwlist,
                                     "utf-8", &sutf8,
                                     &ishtmlobj,
                                     &methods)) {
        return 0;
    }
    string utf8(sutf8);
    PyMem_Free(sutf8);

    if (ishtmlobj != 0) {
        int truth = PyObject_IsTrue(ishtmlobj);
        if (truth < 0)
            return 0;
        ishtml = truth ? 1 : 0;
    }
    LOGDEB(("Query_highlight: [%s] ishtml %d\n", utf8.c_str(), ishtml));

    if (self->query == 0 ||
        the_queries.find(self->query) == the_queries.end()) {
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }

    RefCntr<Rcl::SearchData> sd = self->query->getSD();
    if (sd.isNull()) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }
    HighlightData hldata;
    sd->getTerms(hldata);
    PyPlainToRich hler(methods);
    hler.set_inputhtml(ishtml);
    list<string> out;
    hler.plaintorich(utf8, out, hldata, 5000000);
    if (out.empty()) {
        PyErr_SetString(PyExc_ValueError, "Plaintorich failed");
        return 0;
    }

    // Return a python unicode object, decoded like everywhere else in
    // this file. No intermediate object is created, so nothing can leak.
    return PyUnicode_Decode(out.begin()->c_str(), out.begin()->size(),
                            "UTF-8", "replace");
}
|
||||
|
||||
PyDoc_STRVAR(doc_Query_makedocabstract,
|
||||
"makedocabstract(doc, methods = object))\n"
|
||||
"Will create a snippets abstract for doc by selecting text around the match\n"
|
||||
" terms\n"
|
||||
"If methods is set, will also perform highlighting. See the highlight method\n"
|
||||
);
|
||||
static PyObject *
|
||||
Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
|
||||
{
|
||||
LOGDEB(("Db_makeDocAbstract\n"));
|
||||
static const char *kwlist[] = {"doc", "methods", NULL};
|
||||
recoll_DocObject *pydoc = 0;
|
||||
PyObject *hlmethods = 0;
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_makeDocAbstract",
|
||||
(char **)kwlist,
|
||||
&recoll_DocType, &pydoc,
|
||||
&hlmethods)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
|
||||
LOGERR(("Query_makeDocAbstract: doc not found %p\n", pydoc->doc));
|
||||
PyErr_SetString(PyExc_AttributeError, "doc");
|
||||
return 0;
|
||||
}
|
||||
if (the_queries.find(self->query) == the_queries.end()) {
|
||||
LOGERR(("Query_makeDocAbstract: query not found %p\n", self->query));
|
||||
PyErr_SetString(PyExc_AttributeError, "query");
|
||||
return 0;
|
||||
}
|
||||
RefCntr<Rcl::SearchData> sd = self->query->getSD();
|
||||
if (sd.isNull()) {
|
||||
PyErr_SetString(PyExc_ValueError, "Query not initialized");
|
||||
return 0;
|
||||
}
|
||||
string abstract;
|
||||
if (hlmethods == 0) {
|
||||
if (!self->query->makeDocAbstract(*(pydoc->doc), abstract)) {
|
||||
PyErr_SetString(PyExc_EnvironmentError,
|
||||
"rcl makeDocAbstract failed");
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
HighlightData hldata;
|
||||
sd->getTerms(hldata);
|
||||
PyPlainToRich hler(hlmethods);
|
||||
hler.set_inputhtml(0);
|
||||
vector<string> vabs;
|
||||
self->query->makeDocAbstract(*pydoc->doc, vabs);
|
||||
for (unsigned int i = 0; i < vabs.size(); i++) {
|
||||
if (vabs[i].empty())
|
||||
continue;
|
||||
list<string> lr;
|
||||
// There may be data like page numbers before the snippet text.
|
||||
// will be in brackets.
|
||||
string::size_type bckt = vabs[i].find("]");
|
||||
if (bckt == string::npos) {
|
||||
hler.plaintorich(vabs[i], lr, hldata);
|
||||
} else {
|
||||
hler.plaintorich(vabs[i].substr(bckt), lr, hldata);
|
||||
lr.front() = vabs[i].substr(0, bckt) + lr.front();
|
||||
}
|
||||
abstract += lr.front();
|
||||
abstract += "...";
|
||||
}
|
||||
}
|
||||
|
||||
// Return a python unicode object
|
||||
return PyUnicode_Decode(abstract.c_str(), abstract.size(),
|
||||
"UTF-8", "replace");
|
||||
}
|
||||
|
||||
|
||||
PyDoc_STRVAR(doc_Query_getxquery,
|
||||
"getxquery(None) -> Unicode string\n"
|
||||
"\n"
|
||||
"Retrieves the Xapian query description as a Unicode string.\n"
|
||||
"Meaningful only after executexx\n"
|
||||
);
|
||||
static PyObject *
|
||||
Query_getxquery(recoll_QueryObject* self, PyObject *, PyObject *)
|
||||
{
|
||||
LOGDEB(("Query_getxquery\n"));
|
||||
|
||||
if (self->query == 0 ||
|
||||
the_queries.find(self->query) == the_queries.end()) {
|
||||
PyErr_SetString(PyExc_AttributeError, "query");
|
||||
return 0;
|
||||
}
|
||||
RefCntr<Rcl::SearchData> sd = self->query->getSD();
|
||||
if (sd.isNull()) {
|
||||
PyErr_SetString(PyExc_ValueError, "Query not initialized");
|
||||
return 0;
|
||||
}
|
||||
string desc = sd->getDescription();
|
||||
return PyUnicode_Decode(desc.c_str(), desc.size(), "UTF-8", "replace");
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(doc_Query_getgroups,
|
||||
"getgroups(None) -> a list of pairs\n"
|
||||
"\n"
|
||||
"Retrieves the expanded query terms. Meaningful only after executexx\n"
|
||||
"In each pair, the first entry is a list of user terms, the second a list of\n"
|
||||
"query terms as derived from the user terms and used in the Xapian Query.\n"
|
||||
"The size of each list is one for simple terms, or more for group and phrase\n"
|
||||
"clauses\n"
|
||||
);
|
||||
static PyObject *
|
||||
Query_getgroups(recoll_QueryObject* self, PyObject *, PyObject *)
|
||||
{
|
||||
LOGDEB(("Query_getxquery\n"));
|
||||
|
||||
if (self->query == 0 ||
|
||||
the_queries.find(self->query) == the_queries.end()) {
|
||||
PyErr_SetString(PyExc_AttributeError, "query");
|
||||
return 0;
|
||||
}
|
||||
RefCntr<Rcl::SearchData> sd = self->query->getSD();
|
||||
if (sd.isNull()) {
|
||||
PyErr_SetString(PyExc_ValueError, "Query not initialized");
|
||||
return 0;
|
||||
}
|
||||
HighlightData hld;
|
||||
sd->getTerms(hld);
|
||||
PyObject *mainlist = PyList_New(0);
|
||||
PyObject *ulist;
|
||||
PyObject *xlist;
|
||||
// We walk the groups vector. For each we retrieve the user group,
|
||||
// make a python list of each, then group those in a pair, and
|
||||
// append this to the main list.
|
||||
for (unsigned int i = 0; i < hld.groups.size(); i++) {
|
||||
unsigned int ugidx = hld.grpsugidx[i];
|
||||
ulist = PyList_New(hld.ugroups[ugidx].size());
|
||||
for (unsigned int j = 0; j < hld.ugroups[ugidx].size(); j++) {
|
||||
PyList_SetItem(ulist, j,
|
||||
PyUnicode_Decode(hld.ugroups[ugidx][j].c_str(),
|
||||
hld.ugroups[ugidx][j].size(),
|
||||
"UTF-8", "replace"));
|
||||
}
|
||||
|
||||
xlist = PyList_New(hld.groups[i].size());
|
||||
for (unsigned int j = 0; j < hld.groups[i].size(); j++) {
|
||||
PyList_SetItem(xlist, j,
|
||||
PyUnicode_Decode(hld.groups[i][j].c_str(),
|
||||
hld.groups[i][j].size(),
|
||||
"UTF-8", "replace"));
|
||||
}
|
||||
PyList_Append(mainlist, Py_BuildValue("(OO)", ulist, xlist));
|
||||
}
|
||||
return mainlist;
|
||||
}
|
||||
|
||||
static PyMethodDef Query_methods[] = {
|
||||
{"execute", (PyCFunction)Query_execute, METH_VARARGS|METH_KEYWORDS,
|
||||
doc_Query_execute},
|
||||
{"executesd", (PyCFunction)Query_executesd, METH_VARARGS|METH_KEYWORDS,
|
||||
doc_Query_executesd},
|
||||
{"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,
|
||||
{"fetchone", (PyCFunction)Query_fetchone, METH_NOARGS,
|
||||
doc_Query_fetchone},
|
||||
{"sortby", (PyCFunction)Query_sortby, METH_VARARGS|METH_KEYWORDS,
|
||||
doc_Query_sortby},
|
||||
{"highlight", (PyCFunction)Query_highlight, METH_VARARGS|METH_KEYWORDS,
|
||||
doc_Query_highlight},
|
||||
{"getxquery", (PyCFunction)Query_getxquery, METH_NOARGS,
|
||||
doc_Query_getxquery},
|
||||
{"getgroups", (PyCFunction)Query_getgroups, METH_NOARGS,
|
||||
doc_Query_getgroups},
|
||||
{"makedocabstract", (PyCFunction)Query_makedocabstract,
|
||||
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
@ -1037,12 +1396,13 @@ Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
|
||||
PyErr_SetString(PyExc_AttributeError, "db id not found");
|
||||
return 0;
|
||||
}
|
||||
LOGDEB(("Db_setAbstractParams: mxchrs %d, ctxwrds %d\n", maxchars, ctxwords));
|
||||
self->db->setAbstractParams(-1, maxchars, ctxwords);
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
Db_makeDocAbstract(recoll_DbObject* self, PyObject *args, PyObject *)
|
||||
Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
|
||||
{
|
||||
LOGDEB(("Db_makeDocAbstract\n"));
|
||||
recoll_DocObject *pydoc = 0;
|
||||
@ -1341,4 +1701,8 @@ initrecoll(void)
|
||||
PyModule_AddStringConstant(m, "__doc__",
|
||||
pyrecoll_doc_string);
|
||||
|
||||
|
||||
PyObject* doctypecapsule =
|
||||
PyCapsule_New(&recoll_DocType, "recoll.doctype", 0);
|
||||
PyModule_AddObject(m, "doctype", doctypecapsule);
|
||||
}
|
||||
|
||||
31
src/python/recoll/pyrecoll.h
Normal file
31
src/python/recoll/pyrecoll.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* Copyright (C) 2012 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _PYRECOLL_H_INCLUDED_
#define _PYRECOLL_H_INCLUDED_

#include <Python.h>

/* Only pointers to these classes are used below: forward declarations
 * are enough, and they let this header be included without first
 * including the headers which define the classes. */
class RclConfig;
namespace Rcl {
class Doc;
}

/* Python Doc object: the instance layout is shared by the modules which
 * include this header. */
typedef struct {
    PyObject_HEAD
    /* Type-specific fields go here. */
    Rcl::Doc *doc;
    /* Each doc object has a pointer to the global config, for convenience */
    RclConfig *rclconfig;
} recoll_DocObject;

#endif // _PYRECOLL_H_INCLUDED_
|
||||
@ -53,6 +53,23 @@ module1 = Extension('recoll',
|
||||
sources = ['pyrecoll.cpp',
|
||||
])
|
||||
|
||||
# Extension module for text extraction, built from pyrclextract.cpp with
# the same libs/libdirs variables as the main module.
module2 = Extension(
    'rclextract',
    define_macros = [
        ('MAJOR_VERSION', '1'),
        ('MINOR_VERSION', '0'),
        ('UNAC_VERSION', '"1.0.7"'),
        ('RECOLL_DATADIR', '"@QTRECOLL_DATADIR@"'),
    ],
    # A generator expression is used so that no loop variable is left
    # behind in the module namespace.
    include_dirs = ['/usr/local/include'] + list(
        os.path.join(top, subdir)
        for subdir in ('utils', 'common', 'internfile', 'rcldb')),
    libraries = libs,
    library_dirs = libdirs,
    sources = ['pyrclextract.cpp'],
)
|
||||
|
||||
setup (name = 'Recoll',
|
||||
version = '1.0',
|
||||
description = 'Query/Augment a Recoll full text index',
|
||||
@ -60,4 +77,4 @@ setup (name = 'Recoll',
|
||||
author_email = 'jfd@recoll.org',
|
||||
long_description = '''
|
||||
''',
|
||||
ext_modules = [module1])
|
||||
ext_modules = [module1, module2])
|
||||
|
||||
@ -7,6 +7,7 @@ This could actually be useful for something after some customization
|
||||
import sys
|
||||
from getopt import getopt
|
||||
import recoll
|
||||
import rclextract
|
||||
|
||||
allmeta = ("title", "keywords", "abstract", "url", "mimetype", "mtime",
|
||||
"ipath", "fbytes", "dbytes", "relevancyrating")
|
||||
@ -15,26 +16,59 @@ def Usage():
|
||||
print >> sys.stderr, "Usage: recollq.py [-c conf] [-i extra_index] <recoll query>"
|
||||
sys.exit(1);
|
||||
|
||||
class ptrmeths:
    """Callbacks returning the tags to insert around highlighted matches.

    groups is stored as given; startMatch() reads groups[idx][1].
    """
    def __init__(self, groups):
        self.groups = groups

    def startMatch(self, idx):
        """Return the opening tag for match group number idx."""
        terms = self.groups[idx][1]
        fields = (idx, " ".join(terms))
        return '<span class="pyrclstart" idx="%d" ugroup="%s">' % fields

    def endMatch(self):
        """Return the closing tag for a match."""
        return '</span>'
|
||||
|
||||
def extract(doc):
    """Build an Extractor for doc and return what it extracts for doc.ipath."""
    return rclextract.Extractor(doc).extract(doc.ipath)
|
||||
|
||||
def doquery(db, q):
|
||||
# Get query object
|
||||
query = db.query()
|
||||
# Parse/run input query string
|
||||
nres = query.execute(q, stemming = 1, stemlang="english")
|
||||
nres = query.execute(q, stemming = 0, stemlang="english")
|
||||
qs = u"Xapian query: [%s]" % query.getxquery()
|
||||
print(qs.encode("utf-8"))
|
||||
groups = query.getgroups()
|
||||
print "Groups:", groups
|
||||
m = ptrmeths(groups)
|
||||
|
||||
# Print results:
|
||||
print "Result count: ", nres
|
||||
if nres > 20:
|
||||
nres = 20
|
||||
while query.next >= 0 and query.next < nres:
|
||||
doc = query.fetchone()
|
||||
print query.next, ":",
|
||||
# for k,v in doc.items().items():
|
||||
# print "KEY:", k.encode('utf-8'), "VALUE", v.encode('utf-8')
|
||||
# continue
|
||||
for k in ("title", "mtime", "author"):
|
||||
print k, ":", getattr(doc, k).encode('utf-8')
|
||||
value = getattr(doc, k)
|
||||
# value = doc.get(k)
|
||||
if value is None:
|
||||
print k, ":", "(None)"
|
||||
else:
|
||||
print k, ":", value.encode('utf-8')
|
||||
#doc.setbinurl(bytearray("toto"))
|
||||
#burl = doc.getbinurl(); print "Bin URL :", doc.getbinurl()
|
||||
abs = db.makeDocAbstract(doc, query).encode('utf-8')
|
||||
print abs
|
||||
abs = query.makedocabstract(doc, methods=m)
|
||||
print abs.encode('utf-8')
|
||||
print
|
||||
# fulldoc = extract(doc)
|
||||
# print "FULLDOC MIMETYPE", fulldoc.mimetype, "TEXT:", fulldoc.text.encode("utf-8")
|
||||
|
||||
|
||||
########################################### MAIN
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
Usage()
|
||||
|
||||
@ -68,5 +102,3 @@ db = recoll.connect(confdir=confdir,
|
||||
db.setAbstractParams(maxchars=maxchars, contextwords=contextwords)
|
||||
|
||||
doquery(db, q)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user