temporary checkpoint for python module modifs

This commit is contained in:
Jean-Francois Dockes 2012-12-19 19:59:31 +01:00
parent a9599be5f9
commit 3be5e982b7
5 changed files with 695 additions and 28 deletions

View File

@ -0,0 +1,223 @@
/* Copyright (C) 2007 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <Python.h>
#include <structmember.h>
#include <bytearrayobject.h>
#include <strings.h>
#include <string>
using namespace std;
#include "debuglog.h"
#include "rcldoc.h"
#include "internfile.h"
#include "rclconfig.h"
#include "pyrecoll.h"
static PyObject *recoll_DocType;
//////////////////////////////////////////////////////////////////////
/// Extractor object code
// Instance layout for the Python-visible rclextract.Extractor type.
// xtr and tmpdir are deleted by Extractor_dealloc(); rclconfig is copied
// from the Doc object in Extractor_init() and is not deleted by this module.
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
// Interner performing the extraction; 0 until Extractor_init() succeeds.
FileInterner *xtr;
// Temporary directory handed (by reference) to the interner.
TempDir *tmpdir;
// Configuration pointer taken from the Doc passed to the constructor.
RclConfig *rclconfig;
} rclx_ExtractorObject;
// tp_dealloc slot: release the interner first, then the temporary
// directory it was built on, and finally give the object memory back
// through the type's tp_free slot.
static void
Extractor_dealloc(rclx_ExtractorObject *self)
{
    LOGDEB(("Extractor_dealloc\n"));
    FileInterner *interner = self->xtr;
    TempDir *workdir = self->tmpdir;
    delete interner;
    delete workdir;
    PyObject *me = (PyObject *)self;
    me->ob_type->tp_free(me);
}
// tp_new slot: allocate an Extractor through the type's allocator and
// zero every owned pointer so that dealloc is safe even if __init__
// never runs. Returns 0 when the allocation fails.
static PyObject *
Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    LOGDEB(("Extractor_new\n"));
    PyObject *obj = type->tp_alloc(type, 0);
    if (!obj) {
        return 0;
    }
    rclx_ExtractorObject *extractor = (rclx_ExtractorObject *)obj;
    extractor->xtr = 0;
    extractor->tmpdir = 0;
    extractor->rclconfig = 0;
    return obj;
}
// tp_init slot: Extractor(doc).
// Parses the single "doc" argument (must be an instance of the recoll Doc
// type), then builds the temporary directory and the FileInterner used by
// extract(). Returns 0 on success, -1 with a Python exception set on error.
static int
Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
{
    LOGDEB(("Extractor_init\n"));
    static const char* kwlist[] = {"doc", NULL};
    PyObject *pdobj;

    // The Doc type pointer is imported from the recoll module at module
    // init time. "O!" would dereference a null type pointer.
    if (recoll_DocType == 0) {
        PyErr_SetString(PyExc_RuntimeError, "recoll Doc type not available");
        return -1;
    }
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!", (char**)kwlist,
                                     recoll_DocType, &pdobj))
        return -1;
    recoll_DocObject *dobj = (recoll_DocObject *)pdobj;

    // Validate before allocating anything.
    if (dobj->doc == 0) {
        PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
        return -1;
    }

    // __init__ can be called more than once on the same object: release
    // what a previous call created instead of overwriting the pointers.
    // The interner goes first as it was built on the temporary directory.
    delete self->xtr;
    self->xtr = 0;
    delete self->tmpdir;
    self->tmpdir = 0;

    self->rclconfig = dobj->rclconfig;
    self->tmpdir = new TempDir;
    self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
                                 FileInterner::FIF_forPreview);
    return 0;
}
// Extractor.extract(ipath) -> Doc
// Runs the interner on the given ipath and returns a new Doc object
// holding the result. Returns 0 with a Python exception set on error.
static PyObject *
Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB(("Extractor_extract\n"));
    static const char* kwlist[] = {"ipath", NULL};
    char *sipath = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract",
                                     (char**)kwlist,
                                     "utf-8", &sipath))
        return 0;
    // "es" allocates the buffer: copy it, then free it right away.
    string ipath(sipath);
    PyMem_Free(sipath);

    if (self->xtr == 0) {
        PyErr_SetString(PyExc_AttributeError, "extract: null object");
        return 0;
    }

    /* Call the doc class object to create a new doc. */
    recoll_DocObject *result =
        (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
    if (!result) {
        LOGERR(("Extractor_extract: couldn't create doc object for result\n"));
        return 0;
    }
    if (result->doc == 0) {
        // We own the new reference: drop it on every error path.
        Py_DECREF(result);
        PyErr_SetString(PyExc_AttributeError, "extract: null doc in result");
        return 0;
    }

    FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
    if (status != FileInterner::FIDone) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_AttributeError, "internfile failure");
        return 0;
    }

    // When the interner produced html, return it in place of the text.
    string html = self->xtr->get_html();
    if (!html.empty()) {
        result->doc->text = html;
        result->doc->mimetype = "text/html";
    }

    // Mirror the main document fields into the meta map so that they can
    // be fetched like the other attributes. The url entry is filled by
    // printableUrl() (presumably a displayable form of the url).
    Rcl::Doc *doc = result->doc;
    printableUrl(self->rclconfig->getDefCharset(), doc->url,
                 doc->meta[Rcl::Doc::keyurl]);
    doc->meta[Rcl::Doc::keytp] = doc->mimetype;
    doc->meta[Rcl::Doc::keyipt] = doc->ipath;
    doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
    doc->meta[Rcl::Doc::keyds] = doc->dbytes;
    return (PyObject *)result;
}
// Docstring attached to the Extractor.extract() method below.
PyDoc_STRVAR(doc_extract,
"extract(ipath)\n"
"Extract document defined by ipath and return a doc object.\n"
);
// Method table for the Extractor type. extract() is registered with
// METH_VARARGS|METH_KEYWORDS, so it receives both args and kwargs.
static PyMethodDef Extractor_methods[] = {
{"extract", (PyCFunction)Extractor_extract, METH_VARARGS|METH_KEYWORDS,
doc_extract},
{NULL} /* Sentinel */
};
// Type docstring for rclextract.Extractor. The constructor takes a
// single "doc" argument (see Extractor_init).
PyDoc_STRVAR(doc_ExtractorObject,
"Extractor(doc)\n"
"\n"
"An Extractor object extracts data for the Doc object it was created\n"
"with. Call extract(ipath) to get a new Doc object holding the result.\n"
);
// Type object for rclextract.Extractor. The initializer is positional:
// each value must stay in the slot named by its trailing comment. Only
// the name, size, destructor, flags, docstring, method table, tp_init
// and tp_new slots are set; every other slot is left at 0.
static PyTypeObject rclx_ExtractorType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"rclextract.Extractor", /*tp_name*/
sizeof(rclx_ExtractorObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)Extractor_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
doc_ExtractorObject, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
Extractor_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)Extractor_init, /* tp_init */
0, /* tp_alloc */
Extractor_new, /* tp_new */
};
///////////////////////////////////// Module-level stuff
// Module-level function table: it only holds the sentinel entry, the
// module exposes no free functions.
static PyMethodDef rclxMethods[] = {
{NULL, NULL, 0, NULL} /* Sentinel */
};
// Module docstring, installed as __doc__ by initrclextract().
PyDoc_STRVAR(rclx_doc_string,
"This is an interface to the Recoll text extraction features.");
// Fallback definition used only when Python.h did not define the macro.
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
// Module initialization entry point for "rclextract": creates the
// module, registers the Extractor type and fetches the Doc type pointer
// exported by the recoll module through the "recoll.doctype" capsule.
PyMODINIT_FUNC
initrclextract(void)
{
    PyObject* m = Py_InitModule("rclextract", rclxMethods);
    // Module creation can fail: do not use a null module pointer.
    if (m == 0)
        return;
    PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);

    if (PyType_Ready(&rclx_ExtractorType) < 0)
        return;
    Py_INCREF(&rclx_ExtractorType);
    // PyModule_AddObject only steals the reference on success.
    if (PyModule_AddObject(m, "Extractor",
                           (PyObject *)&rclx_ExtractorType) < 0) {
        Py_DECREF(&rclx_ExtractorType);
        return;
    }

    // On failure this returns 0 and leaves an exception set.
    // Extractor_init() must not be handed a null type pointer.
    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctype", 0);
}

View File

@ -15,7 +15,6 @@
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <Python.h>
#include <structmember.h>
#include <bytearrayobject.h>
@ -37,6 +36,10 @@ using namespace std;
#include "wasatorcl.h"
#include "debuglog.h"
#include "pathut.h"
#include "plaintorich.h"
#include "hldata.h"
#include "pyrecoll.h"
static set<Rcl::Db *> the_dbs;
static set<Rcl::Query *> the_queries;
@ -45,7 +48,7 @@ static set<Rcl::Doc *> the_docs;
static RclConfig *rclconfig;
// This has to exist somewhere in the python api ??
PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
static PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
{
PyObject *result = tp->tp_new(tp, args, kwargs);
if (result && tp->tp_init(result, args, kwargs) < 0)
@ -252,11 +255,6 @@ SearchData_addclause(recoll_SearchDataObject* self, PyObject *args,
///////////////////////////////////////////////////////////////////////
///// DOC Doc code
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Doc *doc;
} recoll_DocObject;
static void
Doc_dealloc(recoll_DocObject *self)
@ -292,11 +290,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
self->doc = new Rcl::Doc;
if (self->doc == 0)
return -1;
self->rclconfig = rclconfig;
the_docs.insert(self->doc);
return 0;
}
PyDoc_STRVAR(doc_getbinurl,
PyDoc_STRVAR(doc_Doc_getbinurl,
"getbinurl(none) -> binary url\n"
"\n"
"Returns an URL with a path part which is a as bit for bit copy of the \n"
@ -316,7 +315,7 @@ Doc_getbinurl(recoll_DocObject *self)
self->doc->url.size());
}
PyDoc_STRVAR(doc_setbinurl,
PyDoc_STRVAR(doc_Doc_setbinurl,
"setbinurl(url) -> binary url\n"
"\n"
"Set the URL from binary path like file://may/contain/unencodable/bytes\n"
@ -340,12 +339,94 @@ Doc_setbinurl(recoll_DocObject *self, PyObject *value)
Py_RETURN_NONE;
}
PyDoc_STRVAR(doc_Doc_keys,
"keys() -> list of doc object keys (attribute names)\n"
);
// Doc.keys(): return a new list holding the keys of the doc's meta map
// as unicode objects. Returns 0 with an exception set on error.
static PyObject *
Doc_keys(recoll_DocObject *self)
{
    LOGDEB(("Doc_keys\n"));
    if (self->doc == 0 ||
        the_docs.find(self->doc) == the_docs.end()) {
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }

    PyObject *pkeys = PyList_New(0);
    if (pkeys == 0)
        return 0;
    for (map<string,string>::const_iterator it = self->doc->meta.begin();
         it != self->doc->meta.end(); it++) {
        PyObject *key = PyUnicode_Decode(it->first.c_str(),
                                         it->first.size(),
                                         "UTF-8", "replace");
        if (key == 0 || PyList_Append(pkeys, key) < 0) {
            Py_XDECREF(key);
            Py_DECREF(pkeys);
            return 0;
        }
        // PyList_Append took its own reference: release ours.
        Py_DECREF(key);
    }
    return pkeys;
}
PyDoc_STRVAR(doc_Doc_items,
"items() -> dictionary of doc object keys/values\n"
);
// Doc.items(): return a new dictionary built from the doc's meta map,
// keys and values being decoded to unicode. Returns 0 with an exception
// set on error.
static PyObject *
Doc_items(recoll_DocObject *self)
{
    LOGDEB(("Doc_items\n"));
    if (self->doc == 0 ||
        the_docs.find(self->doc) == the_docs.end()) {
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }

    PyObject *pdict = PyDict_New();
    if (pdict == 0)
        return 0;
    for (map<string,string>::const_iterator it = self->doc->meta.begin();
         it != self->doc->meta.end(); it++) {
        PyObject *key = PyUnicode_Decode(it->first.c_str(),
                                         it->first.size(),
                                         "UTF-8", "replace");
        PyObject *value = key == 0 ? 0 :
            PyUnicode_Decode(it->second.c_str(),
                             it->second.size(),
                             "UTF-8", "replace");
        int rc = -1;
        if (key != 0 && value != 0)
            rc = PyDict_SetItem(pdict, key, value);
        // PyDict_SetItem does not steal references: always release ours.
        Py_XDECREF(key);
        Py_XDECREF(value);
        if (rc < 0) {
            Py_DECREF(pdict);
            return 0;
        }
    }
    return pdict;
}
PyDoc_STRVAR(doc_Doc_get,
"get(key) -> value\n"
"Retrieve the named doc attribute\n"
);
// Doc.get(key): look the key up in the doc's meta map. Returns the value
// as a unicode object, or None when getmeta() does not find the key.
static PyObject *
Doc_get(recoll_DocObject *self, PyObject *args)
{
    LOGDEB(("Doc_get\n"));
    char *rawkey = 0; // allocated by the "es" conversion
    if (!PyArg_ParseTuple(args, "es:Doc_get", "utf-8", &rawkey)) {
        return 0;
    }
    string key(rawkey);
    PyMem_Free(rawkey);

    // The doc pointer must be set and still be a registered one.
    bool registered = self->doc != 0 &&
        the_docs.find(self->doc) != the_docs.end();
    if (!registered) {
        PyErr_SetString(PyExc_AttributeError, "doc??");
        return 0;
    }

    if (!self->doc->getmeta(key, 0)) {
        Py_RETURN_NONE;
    }
    string value = self->doc->meta[key];
    return PyUnicode_Decode(value.c_str(), value.size(),
                            "UTF-8", "replace");
}
static PyMethodDef Doc_methods[] = {
{"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS,
doc_getbinurl},
{"setbinurl", (PyCFunction)Doc_setbinurl, METH_O,
doc_setbinurl},
{"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS, doc_Doc_getbinurl},
{"setbinurl", (PyCFunction)Doc_setbinurl, METH_O, doc_Doc_setbinurl},
{"keys", (PyCFunction)Doc_keys, METH_NOARGS, doc_Doc_keys},
{"items", (PyCFunction)Doc_items, METH_NOARGS, doc_Doc_items},
{"get", (PyCFunction)Doc_get, METH_VARARGS, doc_Doc_get},
{NULL} /* Sentinel */
};
@ -380,8 +461,6 @@ Doc_getattr(recoll_DocObject *self, char *name)
case 'f':
if (!key.compare(Rcl::Doc::keyfs)) {
value = self->doc->fbytes; found = true;
} else if (!key.compare(Rcl::Doc::keyfs)) {
value = self->doc->fbytes; found = true;
} else if (!key.compare(Rcl::Doc::keyfmt)) {
value = self->doc->fmtime; found = true;
}
@ -419,6 +498,11 @@ Doc_getattr(recoll_DocObject *self, char *name)
self->doc->dbytes; found = true;
}
break;
case 't':
if (!key.compare("text")) {
value = self->doc->text; found = true;
}
break;
}
if (!found) {
@ -432,9 +516,16 @@ Doc_getattr(recoll_DocObject *self, char *name)
if (self->doc->getmeta(key, 0)) {
value = self->doc->meta[key];
found = true;
}
}
if (!found) {
LOGDEB(("Doc_getattr: name [%s] key [%s] Not found\n",
name, key.c_str()));
Py_RETURN_NONE;
}
LOGDEB(("Doc_getattr: [%s] (%s) -> [%s]\n",
name, key.c_str(), value.c_str()));
// Return a python unicode object
@ -683,10 +774,17 @@ Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB(("Query_sortby\n"));
static const char *kwlist[] = {"field", "ascending", NULL};
PyObject *ascobj = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i", (char**)kwlist,
&self->sortfield,
&self->ascending))
&ascobj))
return 0;
if (ascobj != 0 && !PyObject_IsTrue(ascobj))
self->ascending = false;
else
self->ascending = true;
Py_RETURN_NONE;
}
@ -707,13 +805,15 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
char *sutf8 = 0; // needs freeing
char *sstemlang = 0;
int dostem = 1;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|ies:Query_execute",
PyObject *dostemobj = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|Oes:Query_execute",
(char**)kwlist, "utf-8", &sutf8,
&dostem,
&dostemobj,
"utf-8", &sstemlang)) {
return 0;
}
if (dostemobj != 0 && !PyObject_IsTrue(dostemobj))
dostem = 0;
string utf8(sutf8);
PyMem_Free(sutf8);
@ -828,15 +928,274 @@ Query_fetchone(recoll_QueryObject* self, PyObject *, PyObject *)
return (PyObject *)result;
}
// Docstring for Query.highlight(). The accepted keywords are the ones
// listed in the function's kwlist: text, ishtml, methods.
PyDoc_STRVAR(doc_Query_highlight,
"highlight(text, ishtml = 0/1, methods = object)\n"
"Will insert <span class=\"rclmatch\"></span> tags around the match areas\n"
"in the input text and return the modified text\n"
"ishtml can be set to indicate that the input text is html and html special\n"
" characters should not be escaped\n"
"methods if set should be an object with methods startMatch(i) and endMatch()\n"
" which will be called for each match and should return a begin and end tag\n"
);
// PlainToRich specialization which gets its begin/end match tags from an
// optional Python object. When no object is set, or when a callback
// fails or returns something which is not a string, the default tags are
// used.
class PyPlainToRich: public PlainToRich {
public:
    // methods: borrowed pointer to an object with startMatch(i) and
    // endMatch() methods, or 0 to use the default tags.
    PyPlainToRich(PyObject *methods)
        : m_methods(methods)
    {
    }
    virtual ~PyPlainToRich()
    {
    }
    // Opening tag for match group idx.
    virtual string startMatch(unsigned int idx)
    {
        PyObject *res = 0;
        if (m_methods)
            res = PyObject_CallMethod(m_methods, (char *)"startMatch",
                                      (char *)"(i)", idx);
        return tagFromResult(res, "<span class=\"rclmatch\">");
    }
    // Closing tag.
    virtual string endMatch()
    {
        PyObject *res = 0;
        if (m_methods)
            res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
        return tagFromResult(res, "</span>");
    }
    PyObject *m_methods;

private:
    // Convert a callback result (new reference or 0) to a C++ string,
    // releasing every reference on the way. Returns dflt when the result
    // is unusable. Errors are cleared because the callers have no way to
    // report them, and a pending exception must not be left behind.
    static string tagFromResult(PyObject *res, const char *dflt)
    {
        if (res == 0) {
            PyErr_Clear();
            return dflt;
        }
        PyObject *bytes = res;
        if (PyUnicode_Check(res))
            bytes = PyUnicode_AsUTF8String(res); // new reference or 0
        string tag(dflt);
        const char *cp = bytes ? PyString_AsString(bytes) : 0;
        if (cp)
            tag = cp;
        else
            PyErr_Clear();
        if (bytes != res)
            Py_XDECREF(bytes);
        Py_DECREF(res);
        return tag;
    }
};
// Query.highlight(text, ishtml, methods): return the input text with
// tags inserted around the areas matching the query terms, as a unicode
// object. Returns 0 with an exception set on error.
static PyObject *
Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB1(("Query_highlight\n"));
    static const char *kwlist[] = {"text", "ishtml", "methods", NULL};
    char *sutf8 = 0; // needs freeing
    PyObject *ishtmlobj = 0;
    PyObject *methods = 0;
    // Both optional arguments use the "O" unit, so both destinations
    // must be PyObject pointers.
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OO:Query_highlight",
                                     (char**)kwlist,
                                     "utf-8", &sutf8,
                                     &ishtmlobj,
                                     &methods)) {
        return 0;
    }
    string utf8(sutf8);
    PyMem_Free(sutf8);

    int ishtml = 0;
    if (ishtmlobj != 0) {
        int truth = PyObject_IsTrue(ishtmlobj);
        if (truth < 0)
            return 0;
        ishtml = truth ? 1 : 0;
    }
    LOGDEB(("Query_highlight: [%s] ishtml %d\n", utf8.c_str(), ishtml));

    if (self->query == 0 ||
        the_queries.find(self->query) == the_queries.end()) {
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }
    RefCntr<Rcl::SearchData> sd = self->query->getSD();
    if (sd.isNull()) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    HighlightData hldata;
    sd->getTerms(hldata);
    PyPlainToRich hler(methods);
    hler.set_inputhtml(ishtml);
    list<string> out;
    hler.plaintorich(utf8, out, hldata, 5000000);
    if (out.empty()) {
        PyErr_SetString(PyExc_ValueError, "Plaintorich failed");
        return 0;
    }

    // Return a python unicode object, built directly from the first
    // output chunk (no intermediate object to release).
    return PyUnicode_Decode(out.begin()->c_str(), out.begin()->size(),
                            "UTF-8", "replace");
}
PyDoc_STRVAR(doc_Query_makedocabstract,
"makedocabstract(doc, methods = object)\n"
"Will create a snippets abstract for doc by selecting text around the match\n"
" terms\n"
"If methods is set, will also perform highlighting. See the highlight method\n"
);
// Query.makedocabstract(doc, methods): build an abstract for doc from
// the query. Without methods the abstract is returned as produced by the
// query object; with methods each snippet is passed through the
// highlighter and the snippets are joined with "...".
// Returns a unicode object, or 0 with an exception set on error.
static PyObject *
Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
{
    LOGDEB(("Query_makedocabstract\n"));
    static const char *kwlist[] = {"doc", "methods", NULL};
    recoll_DocObject *pydoc = 0;
    PyObject *hlmethods = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_makeDocAbstract",
                                     (char **)kwlist,
                                     &recoll_DocType, &pydoc,
                                     &hlmethods)) {
        return 0;
    }

    if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
        LOGERR(("Query_makeDocAbstract: doc not found %p\n", pydoc->doc));
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }
    if (the_queries.find(self->query) == the_queries.end()) {
        LOGERR(("Query_makeDocAbstract: query not found %p\n", self->query));
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }
    RefCntr<Rcl::SearchData> sd = self->query->getSD();
    if (sd.isNull()) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    string abstract;
    if (hlmethods == 0) {
        if (!self->query->makeDocAbstract(*(pydoc->doc), abstract)) {
            PyErr_SetString(PyExc_EnvironmentError,
                            "rcl makeDocAbstract failed");
            return 0;
        }
    } else {
        HighlightData hldata;
        sd->getTerms(hldata);
        PyPlainToRich hler(hlmethods);
        hler.set_inputhtml(0);
        vector<string> vabs;
        self->query->makeDocAbstract(*pydoc->doc, vabs);
        for (unsigned int i = 0; i < vabs.size(); i++) {
            if (vabs[i].empty())
                continue;
            list<string> lr;
            // There may be data like page numbers before the snippet text.
            // It will be in brackets: keep it out of the highlighting
            // and put it back in front of the result.
            string prefix;
            string::size_type bckt = vabs[i].find("]");
            if (bckt == string::npos) {
                hler.plaintorich(vabs[i], lr, hldata);
            } else {
                hler.plaintorich(vabs[i].substr(bckt), lr, hldata);
                prefix = vabs[i].substr(0, bckt);
            }
            // The highlighter may produce no output: calling front() on
            // an empty list is undefined, skip the snippet instead.
            if (lr.empty())
                continue;
            abstract += prefix;
            abstract += lr.front();
            abstract += "...";
        }
    }

    // Return a python unicode object
    return PyUnicode_Decode(abstract.c_str(), abstract.size(),
                            "UTF-8", "replace");
}
PyDoc_STRVAR(doc_Query_getxquery,
"getxquery(None) -> Unicode string\n"
"\n"
"Retrieves the Xapian query description as a Unicode string.\n"
"Meaningful only after executexx\n"
);
// Query.getxquery(): return the description string of the query's search
// data as a unicode object. Fails with AttributeError when the query
// pointer is not a registered one, and with ValueError when the query
// has no search data yet.
static PyObject *
Query_getxquery(recoll_QueryObject* self, PyObject *, PyObject *)
{
    LOGDEB(("Query_getxquery\n"));

    bool registered = self->query != 0 &&
        the_queries.find(self->query) != the_queries.end();
    if (!registered) {
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }

    RefCntr<Rcl::SearchData> sdata = self->query->getSD();
    if (sdata.isNull()) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    string description = sdata->getDescription();
    return PyUnicode_Decode(description.c_str(), description.size(),
                            "UTF-8", "replace");
}
PyDoc_STRVAR(doc_Query_getgroups,
"getgroups(None) -> a list of pairs\n"
"\n"
"Retrieves the expanded query terms. Meaningful only after executexx\n"
"In each pair, the first entry is a list of user terms, the second a list of\n"
"query terms as derived from the user terms and used in the Xapian Query.\n"
"The size of each list is one for simple terms, or more for group and phrase\n"
"clauses\n"
);
// Build a new python list of unicode objects from an indexable sequence
// of UTF-8 strings. Returns a new reference, or 0 with an exception set.
template <class StrSeq>
static PyObject *strseq_to_pylist(const StrSeq& seq)
{
    PyObject *lst = PyList_New(seq.size());
    if (lst == 0)
        return 0;
    for (unsigned int j = 0; j < seq.size(); j++) {
        PyObject *item = PyUnicode_Decode(seq[j].c_str(), seq[j].size(),
                                          "UTF-8", "replace");
        if (item == 0) {
            Py_DECREF(lst);
            return 0;
        }
        // PyList_SetItem steals the item reference.
        PyList_SetItem(lst, j, item);
    }
    return lst;
}
// Query.getgroups(): return a list of (user terms, query terms) pairs
// built from the query's highlight data. Returns 0 with an exception set
// on error.
static PyObject *
Query_getgroups(recoll_QueryObject* self, PyObject *, PyObject *)
{
    LOGDEB(("Query_getgroups\n"));
    if (self->query == 0 ||
        the_queries.find(self->query) == the_queries.end()) {
        PyErr_SetString(PyExc_AttributeError, "query");
        return 0;
    }
    RefCntr<Rcl::SearchData> sd = self->query->getSD();
    if (sd.isNull()) {
        PyErr_SetString(PyExc_ValueError, "Query not initialized");
        return 0;
    }

    HighlightData hld;
    sd->getTerms(hld);
    PyObject *mainlist = PyList_New(0);
    if (mainlist == 0)
        return 0;

    // We walk the groups vector. For each we retrieve the user group,
    // make a python list of each, then group those in a pair, and
    // append this to the main list.
    for (unsigned int i = 0; i < hld.groups.size(); i++) {
        unsigned int ugidx = hld.grpsugidx[i];
        PyObject *ulist = strseq_to_pylist(hld.ugroups[ugidx]);
        PyObject *xlist = ulist ? strseq_to_pylist(hld.groups[i]) : 0;

        // PyTuple_Pack and PyList_Append both take their own references:
        // ours are released as soon as the container holds the object.
        PyObject *pair = 0;
        if (ulist != 0 && xlist != 0)
            pair = PyTuple_Pack(2, ulist, xlist);
        Py_XDECREF(ulist);
        Py_XDECREF(xlist);
        if (pair == 0 || PyList_Append(mainlist, pair) < 0) {
            Py_XDECREF(pair);
            Py_DECREF(mainlist);
            return 0;
        }
        Py_DECREF(pair);
    }
    return mainlist;
}
static PyMethodDef Query_methods[] = {
{"execute", (PyCFunction)Query_execute, METH_VARARGS|METH_KEYWORDS,
doc_Query_execute},
{"executesd", (PyCFunction)Query_executesd, METH_VARARGS|METH_KEYWORDS,
doc_Query_executesd},
{"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,
{"fetchone", (PyCFunction)Query_fetchone, METH_NOARGS,
doc_Query_fetchone},
{"sortby", (PyCFunction)Query_sortby, METH_VARARGS|METH_KEYWORDS,
doc_Query_sortby},
{"highlight", (PyCFunction)Query_highlight, METH_VARARGS|METH_KEYWORDS,
doc_Query_highlight},
{"getxquery", (PyCFunction)Query_getxquery, METH_NOARGS,
doc_Query_getxquery},
{"getgroups", (PyCFunction)Query_getgroups, METH_NOARGS,
doc_Query_getgroups},
{"makedocabstract", (PyCFunction)Query_makedocabstract,
METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
{NULL} /* Sentinel */
};
@ -1037,12 +1396,13 @@ Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "db id not found");
return 0;
}
LOGDEB(("Db_setAbstractParams: mxchrs %d, ctxwrds %d\n", maxchars, ctxwords));
self->db->setAbstractParams(-1, maxchars, ctxwords);
Py_RETURN_NONE;
}
static PyObject *
Db_makeDocAbstract(recoll_DbObject* self, PyObject *args, PyObject *)
Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
{
LOGDEB(("Db_makeDocAbstract\n"));
recoll_DocObject *pydoc = 0;
@ -1341,4 +1701,8 @@ initrecoll(void)
PyModule_AddStringConstant(m, "__doc__",
pyrecoll_doc_string);
PyObject* doctypecapsule =
PyCapsule_New(&recoll_DocType, "recoll.doctype", 0);
PyModule_AddObject(m, "doctype", doctypecapsule);
}

View File

@ -0,0 +1,31 @@
/* Copyright (C) 2012 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _PYRECOLL_H_INCLUDED_
#define _PYRECOLL_H_INCLUDED_
#include <Python.h>
class RclConfig;
/* Instance layout of the Python Doc object, shared between the recoll
 * and rclextract extension modules.
 * NOTE(review): this header uses Rcl::Doc without declaring or including
 * it, so the including file must make Rcl::Doc visible first. */
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
Rcl::Doc *doc;
/* Each doc object has a pointer to the global config, for convenience */
RclConfig *rclconfig;
} recoll_DocObject;
#endif // _PYRECOLL_H_INCLUDED_

View File

@ -53,6 +53,23 @@ module1 = Extension('recoll',
sources = ['pyrecoll.cpp',
])
# Second extension module: 'rclextract', built from pyrclextract.cpp with
# the libs/libdirs values shared with the 'recoll' module. The include
# path adds the 'internfile' directory of the source tree.
module2 = Extension('rclextract',
define_macros = [('MAJOR_VERSION', '1'),
('MINOR_VERSION', '0'),
('UNAC_VERSION', '"1.0.7"'),
('RECOLL_DATADIR', '"@QTRECOLL_DATADIR@"')
],
include_dirs = ['/usr/local/include',
os.path.join(top, 'utils'),
os.path.join(top, 'common'),
os.path.join(top, 'internfile'),
os.path.join(top, 'rcldb'),
],
libraries = libs,
library_dirs = libdirs,
sources = ['pyrclextract.cpp',
])
setup (name = 'Recoll',
version = '1.0',
description = 'Query/Augment a Recoll full text index',
@ -60,4 +77,4 @@ setup (name = 'Recoll',
author_email = 'jfd@recoll.org',
long_description = '''
''',
ext_modules = [module1])
ext_modules = [module1, module2])

View File

@ -7,6 +7,7 @@ This could actually be useful for something after some customization
import sys
from getopt import getopt
import recoll
import rclextract
allmeta = ("title", "keywords", "abstract", "url", "mimetype", "mtime",
"ipath", "fbytes", "dbytes", "relevancyrating")
@ -15,26 +16,59 @@ def Usage():
print >> sys.stderr, "Usage: recollq.py [-c conf] [-i extra_index] <recoll query>"
sys.exit(1);
class ptrmeths:
    """Highlighting callbacks: supply the tags put around each match."""

    def __init__(self, groups):
        # groups is indexed as groups[idx][1] by startMatch().
        self.groups = groups

    def startMatch(self, idx):
        """Return the opening tag for match group number idx."""
        terms = self.groups[idx][1]
        fields = (idx, " ".join(terms))
        return '<span class="pyrclstart" idx="%d" ugroup="%s">' % fields

    def endMatch(self):
        """Return the closing tag."""
        return '</span>'
def extract(doc):
    """Build an Extractor for doc and return the doc extracted for its ipath."""
    return rclextract.Extractor(doc).extract(doc.ipath)
def doquery(db, q):
# Get query object
query = db.query()
# Parse/run input query string
nres = query.execute(q, stemming = 1, stemlang="english")
nres = query.execute(q, stemming = 0, stemlang="english")
qs = u"Xapian query: [%s]" % query.getxquery()
print(qs.encode("utf-8"))
groups = query.getgroups()
print "Groups:", groups
m = ptrmeths(groups)
# Print results:
print "Result count: ", nres
if nres > 20:
nres = 20
while query.next >= 0 and query.next < nres:
doc = query.fetchone()
print query.next, ":",
# for k,v in doc.items().items():
# print "KEY:", k.encode('utf-8'), "VALUE", v.encode('utf-8')
# continue
for k in ("title", "mtime", "author"):
print k, ":", getattr(doc, k).encode('utf-8')
value = getattr(doc, k)
# value = doc.get(k)
if value is None:
print k, ":", "(None)"
else:
print k, ":", value.encode('utf-8')
#doc.setbinurl(bytearray("toto"))
#burl = doc.getbinurl(); print "Bin URL :", doc.getbinurl()
abs = db.makeDocAbstract(doc, query).encode('utf-8')
print abs
abs = query.makedocabstract(doc, methods=m)
print abs.encode('utf-8')
print
# fulldoc = extract(doc)
# print "FULLDOC MIMETYPE", fulldoc.mimetype, "TEXT:", fulldoc.text.encode("utf-8")
########################################### MAIN
if len(sys.argv) < 2:
Usage()
@ -68,5 +102,3 @@ db = recoll.connect(confdir=confdir,
db.setAbstractParams(maxchars=maxchars, contextwords=contextwords)
doquery(db, q)