temporary checkpoint for python module modifs

2012-12-19 19:59:31 +01:00 · 2012-12-19 19:59:31 +01:00 · 3be5e982b7
commit 3be5e982b7
parent a9599be5f9
5 changed files with 695 additions and 28 deletions
--- a/src/python/recoll/pyrclextract.cpp
+++ b/src/python/recoll/pyrclextract.cpp
@ -0,0 +1,223 @@
+/* Copyright (C) 2007 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+
+#include <Python.h>
+#include <structmember.h>
+#include <bytearrayobject.h>
+
+#include <strings.h>
+
+#include <string>
+using namespace std;
+
+#include "debuglog.h"
+#include "rcldoc.h"
+#include "internfile.h"
+#include "rclconfig.h"
+
+#include "pyrecoll.h"
+
+// Type object for recoll Doc objects. Set by initrclextract() from the
+// "recoll.doctype" capsule, then used to type-check the Extractor
+// constructor argument and to create the Doc returned by extract().
+static PyObject *recoll_DocType;
+
+//////////////////////////////////////////////////////////////////////
+/// Extractor object code
+typedef struct {
+    PyObject_HEAD
+    /* Type-specific fields go here. */
+    /* File interner: created by Extractor_init, deleted by Extractor_dealloc */
+    FileInterner *xtr;
+    /* Temporary directory object passed to the interner constructor; 
+     * created by Extractor_init, deleted by Extractor_dealloc */
+    TempDir *tmpdir;
+    /* Configuration pointer copied from the Doc object in Extractor_init; 
+     * not deleted by Extractor_dealloc */
+    RclConfig *rclconfig;
+} rclx_ExtractorObject;
+
+/* tp_dealloc slot: destroy the C++ helper objects, then give the object
+ * memory back through the type's tp_free. */
+static void
+Extractor_dealloc(rclx_ExtractorObject *self)
+{
+    LOGDEB(("Extractor_dealloc\n"));
+    // The interner goes first, the temporary directory object second
+    // (deleting a null pointer is a no-op, so no guards are needed).
+    FileInterner *interner = self->xtr;
+    TempDir *workdir = self->tmpdir;
+    delete interner;
+    delete workdir;
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+/* tp_new slot: allocate the object and start with all helper pointers
+ * null. The actual setup is done by Extractor_init. */
+static PyObject *
+Extractor_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    LOGDEB(("Extractor_new\n"));
+    PyObject *obj = type->tp_alloc(type, 0);
+    if (obj != 0) {
+        rclx_ExtractorObject *extractor = (rclx_ExtractorObject *)obj;
+        extractor->xtr = 0;
+        extractor->tmpdir = 0;
+        extractor->rclconfig = 0;
+    }
+    return obj;
+}
+
+/* tp_init slot: Extractor(doc).
+ *
+ * doc must be a recoll Doc object (checked against recoll_DocType). A
+ * temporary directory object and a FileInterner in preview mode are
+ * created for the document.
+ * Returns 0 on success, -1 with a Python exception set on error.
+ */
+static int
+Extractor_init(rclx_ExtractorObject *self, PyObject *args, PyObject *kwargs)
+{
+    LOGDEB(("Extractor_init\n"));
+    static const char* kwlist[] = {"doc", NULL};
+    PyObject *pdobj = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!", (char**)kwlist, 
+                                     recoll_DocType, &pdobj))
+        return -1;
+    recoll_DocObject *dobj = (recoll_DocObject *)pdobj;
+
+    // Validate the argument before allocating anything
+    if (dobj->doc == 0) {
+        PyErr_SetString(PyExc_AttributeError, "Null Doc ?");
+        return -1;
+    }
+
+    // __init__ can be called again on a live object: release what a
+    // previous call created. The interner was built on the tempdir, so it
+    // goes first (same order as in Extractor_dealloc).
+    delete self->xtr;
+    self->xtr = 0;
+    delete self->tmpdir;
+    self->tmpdir = 0;
+
+    self->rclconfig = dobj->rclconfig;
+    self->tmpdir = new TempDir;
+    self->xtr = new FileInterner(*dobj->doc, self->rclconfig, *self->tmpdir,
+                                 FileInterner::FIF_forPreview);
+    return 0;
+}
+
+/* Extractor.extract(ipath) -> Doc
+ *
+ * Runs the interner on the given ipath and returns a new Doc object
+ * holding the result. If the interner produced HTML, it replaces the doc
+ * text and the MIME type is set to text/html.
+ * Returns NULL with a Python exception set on error.
+ */
+static PyObject *
+Extractor_extract(rclx_ExtractorObject* self, PyObject *args, PyObject *kwargs)
+{
+    LOGDEB(("Extractor_extract\n"));
+    static const char* kwlist[] = {"ipath", NULL};
+    char *sipath = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es:Extractor_extract", 
+                                     (char**)kwlist, 
+                                     "utf-8", &sipath))
+        return 0;
+
+    // The "es" buffer is allocated by the parser: copy, then release it
+    string ipath(sipath);
+    PyMem_Free(sipath);
+
+    if (self->xtr == 0) {
+        PyErr_SetString(PyExc_AttributeError, "extract: null object");
+        return 0;
+    }
+    /* Call the doc class object to create a new doc. */
+    recoll_DocObject *result = 
+       (recoll_DocObject *)PyObject_CallObject((PyObject *)recoll_DocType, 0);
+    if (!result) {
+        LOGERR(("Extractor_extract: couldn't create doc object for result\n"));
+        return 0;
+    }
+    FileInterner::Status status = self->xtr->internfile(*(result->doc), ipath);
+    if (status != FileInterner::FIDone) {
+        // We own the only reference to the new doc: drop it, else it leaks
+        Py_DECREF((PyObject *)result);
+        PyErr_SetString(PyExc_AttributeError, "internfile failure");
+        return 0;
+    }
+
+    string html = self->xtr->get_html();
+    if (!html.empty()) {
+        result->doc->text = html;
+        result->doc->mimetype = "text/html";
+    }
+
+    // Mirror some fields into the meta map, and store a printable form
+    // of the url there too.
+    Rcl::Doc *doc = result->doc;
+    printableUrl(self->rclconfig->getDefCharset(), doc->url, 
+                 doc->meta[Rcl::Doc::keyurl]);
+    doc->meta[Rcl::Doc::keytp] = doc->mimetype;
+    doc->meta[Rcl::Doc::keyipt] = doc->ipath;
+    doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
+    doc->meta[Rcl::Doc::keyds] = doc->dbytes;
+    return (PyObject *)result;
+}
+
+/* Docstring for Extractor.extract() */
+PyDoc_STRVAR(doc_extract,
+"extract(ipath)\n"
+"Extract document defined by ipath and return a doc object.\n"
+);
+
+/* Method table for the Extractor type: a single method, extract(), which
+ * accepts positional and keyword arguments. */
+static PyMethodDef Extractor_methods[] = {
+    {"extract", (PyCFunction)Extractor_extract, METH_VARARGS|METH_KEYWORDS,
+     doc_extract},
+    {NULL}  /* Sentinel */
+};
+
+/* Docstring for the Extractor type. The constructor takes a Doc object
+ * (see Extractor_init). */
+PyDoc_STRVAR(doc_ExtractorObject,
+"Extractor(doc)\n"
+"\n"
+"An Extractor object extracts document data. It is built from a Doc\n"
+"object, and its extract() method returns a new Doc with the result.\n"
+);
+/* Type object for rclextract.Extractor. Only the dealloc, doc, methods,
+ * init and new slots are set, all other slots are left null. The flags
+ * allow the type to be subclassed. */
+static PyTypeObject rclx_ExtractorType = {
+    PyObject_HEAD_INIT(NULL)
+    0,                         /*ob_size*/
+    "rclextract.Extractor",             /*tp_name*/
+    sizeof(rclx_ExtractorObject), /*tp_basicsize*/
+    0,                         /*tp_itemsize*/
+    (destructor)Extractor_dealloc,    /*tp_dealloc*/
+    0,                         /*tp_print*/
+    0,                         /*tp_getattr*/
+    0,                         /*tp_setattr*/
+    0,                         /*tp_compare*/
+    0,                         /*tp_repr*/
+    0,                         /*tp_as_number*/
+    0,                         /*tp_as_sequence*/
+    0,                         /*tp_as_mapping*/
+    0,                         /*tp_hash */
+    0,                         /*tp_call*/
+    0,                         /*tp_str*/
+    0,                         /*tp_getattro*/
+    0,                         /*tp_setattro*/
+    0,                         /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,  /*tp_flags*/
+    doc_ExtractorObject,      /* tp_doc */
+    0,		               /* tp_traverse */
+    0,		               /* tp_clear */
+    0,		               /* tp_richcompare */
+    0,		               /* tp_weaklistoffset */
+    0,		               /* tp_iter */
+    0,		               /* tp_iternext */
+    Extractor_methods,        /* tp_methods */
+    0,                         /* tp_members */
+    0,                         /* tp_getset */
+    0,                         /* tp_base */
+    0,                         /* tp_dict */
+    0,                         /* tp_descr_get */
+    0,                         /* tp_descr_set */
+    0,                         /* tp_dictoffset */
+    (initproc)Extractor_init, /* tp_init */
+    0,                         /* tp_alloc */
+    Extractor_new,            /* tp_new */
+};
+
+///////////////////////////////////// Module-level stuff
+/* Module-level function table: the module has no functions of its own,
+ * so this only holds the sentinel. */
+static PyMethodDef rclxMethods[] = {
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+/* Module docstring, installed as __doc__ by initrclextract() */
+PyDoc_STRVAR(rclx_doc_string,
+	     "This is an interface to the Recoll text extraction features.");
+
+#ifndef PyMODINIT_FUNC	/* declarations for DLL import/export */
+#define PyMODINIT_FUNC void
+#endif
+/* Module initialization for "rclextract": creates the module, registers
+ * the Extractor type, and fetches the Doc type object exported by the
+ * recoll module through the "recoll.doctype" capsule. */
+PyMODINIT_FUNC
+initrclextract(void)
+{
+    PyObject* m = Py_InitModule("rclextract", rclxMethods);
+    // Module creation can fail: nothing can be added to a null module
+    if (m == 0)
+        return;
+    PyModule_AddStringConstant(m, "__doc__", rclx_doc_string);
+
+    if (PyType_Ready(&rclx_ExtractorType) < 0)
+        return;
+    // PyModule_AddObject steals a reference, and the type object is static
+    Py_INCREF(&rclx_ExtractorType);
+    PyModule_AddObject(m, "Extractor", (PyObject *)&rclx_ExtractorType);
+
+    // On failure this returns null with a Python exception set
+    recoll_DocType = (PyObject*)PyCapsule_Import("recoll.doctype", 0);
+}
--- a/src/python/recoll/pyrecoll.cpp
+++ b/src/python/recoll/pyrecoll.cpp
@ -15,7 +15,6 @@
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

-
 #include <Python.h>
 #include <structmember.h>
 #include <bytearrayobject.h>
@ -37,6 +36,10 @@ using namespace std;
 #include "wasatorcl.h"
 #include "debuglog.h"
 #include "pathut.h"
+#include "plaintorich.h"
+#include "hldata.h"
+
+#include "pyrecoll.h"

 static set<Rcl::Db *> the_dbs;
 static set<Rcl::Query *> the_queries;
@ -45,7 +48,7 @@ static set<Rcl::Doc *> the_docs;
 static RclConfig *rclconfig;

 // This has to exist somewhere in the python api ??
-PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
+static PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs)
 {
    PyObject *result = tp->tp_new(tp, args, kwargs);
    if (result && tp->tp_init(result, args, kwargs) < 0)
@ -252,11 +255,6 @@ SearchData_addclause(recoll_SearchDataObject* self, PyObject *args,

 ///////////////////////////////////////////////////////////////////////
 ///// DOC Doc code
-typedef struct {
-    PyObject_HEAD
-    /* Type-specific fields go here. */
-    Rcl::Doc *doc;
-} recoll_DocObject;

 static void 
 Doc_dealloc(recoll_DocObject *self)
@ -292,11 +290,12 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
    self->doc = new Rcl::Doc;
    if (self->doc == 0)
 	return -1;
+    self->rclconfig = rclconfig;
    the_docs.insert(self->doc);
    return 0;
 }

-PyDoc_STRVAR(doc_getbinurl,
+PyDoc_STRVAR(doc_Doc_getbinurl,
 "getbinurl(none) -> binary url\n"
 "\n"
 "Returns an URL with a path part which is a as bit for bit copy of the \n"
@ -316,7 +315,7 @@ Doc_getbinurl(recoll_DocObject *self)
 					 self->doc->url.size());
 }

-PyDoc_STRVAR(doc_setbinurl,
+PyDoc_STRVAR(doc_Doc_setbinurl,
 "setbinurl(url) -> binary url\n"
 "\n"
 "Set the URL from binary path like file://may/contain/unencodable/bytes\n"
@ -340,12 +339,94 @@ Doc_setbinurl(recoll_DocObject *self, PyObject *value)
    Py_RETURN_NONE;
 }

+PyDoc_STRVAR(doc_Doc_keys,
+"keys() -> list of doc object keys (attribute names)\n"
+);
+/* Doc.keys(): return a new list holding the names found in the doc meta
+ * map, as unicode objects. Returns NULL with an exception set on error. */
+static PyObject *
+Doc_keys(recoll_DocObject *self)
+{
+    LOGDEB(("Doc_keys\n"));
+    if (self->doc == 0 || 
+        the_docs.find(self->doc) == the_docs.end()) {
+        PyErr_SetString(PyExc_AttributeError, "doc");
+        return 0;
+    }
+
+    PyObject *pkeys = PyList_New(0);
+    if (pkeys == 0)
+        return 0;
+    for (map<string,string>::const_iterator it = self->doc->meta.begin();
+         it != self->doc->meta.end(); it++) {
+        PyObject *key = PyUnicode_Decode(it->first.c_str(), 
+                                         it->first.size(), 
+                                         "UTF-8", "replace");
+        if (key == 0 || PyList_Append(pkeys, key) < 0) {
+            Py_XDECREF(key);
+            Py_DECREF(pkeys);
+            return 0;
+        }
+        // PyList_Append took its own reference: release ours
+        Py_DECREF(key);
+    }
+    return pkeys;
+}
+
+PyDoc_STRVAR(doc_Doc_items,
+"items() -> dictionary of doc object keys/values\n"
+);
+/* Doc.items(): return a new dictionary built from the doc meta map, with
+ * unicode keys and values. Returns NULL with an exception set on error. */
+static PyObject *
+Doc_items(recoll_DocObject *self)
+{
+    LOGDEB(("Doc_items\n"));
+    if (self->doc == 0 || 
+        the_docs.find(self->doc) == the_docs.end()) {
+        PyErr_SetString(PyExc_AttributeError, "doc");
+        return 0;
+    }
+
+    PyObject *pdict = PyDict_New();
+    if (pdict == 0)
+        return 0;
+    for (map<string,string>::const_iterator it = self->doc->meta.begin();
+         it != self->doc->meta.end(); it++) {
+        PyObject *key = PyUnicode_Decode(it->first.c_str(), 
+                                         it->first.size(), 
+                                         "UTF-8", "replace");
+        PyObject *value = 0;
+        if (key != 0)
+            value = PyUnicode_Decode(it->second.c_str(), 
+                                     it->second.size(), 
+                                     "UTF-8", "replace");
+        bool failed = key == 0 || value == 0 || 
+            PyDict_SetItem(pdict, key, value) < 0;
+        // PyDict_SetItem does not steal references: always release ours
+        Py_XDECREF(key);
+        Py_XDECREF(value);
+        if (failed) {
+            Py_DECREF(pdict);
+            return 0;
+        }
+    }
+    return pdict;
+}
+
+PyDoc_STRVAR(doc_Doc_get,
+"get(key) -> value\n"
+"Retrieve the named doc attribute\n"
+);
+
+/* Doc.get(key): look the key up in the doc meta map. Returns the value as
+ * a unicode object, or None if the key is not set. */
+static PyObject *
+Doc_get(recoll_DocObject *self, PyObject *args)
+{
+    LOGDEB(("Doc_get\n"));
+    char *keybuf = 0; // allocated by the "es" conversion, released below
+    if (!PyArg_ParseTuple(args, "es:Doc_get", "utf-8", &keybuf))
+        return 0;
+    const string key(keybuf);
+    PyMem_Free(keybuf);
+
+    const bool known = self->doc != 0 && 
+        the_docs.find(self->doc) != the_docs.end();
+    if (!known) {
+        PyErr_SetString(PyExc_AttributeError, "doc??");
+        return 0;
+    }
+
+    if (!self->doc->getmeta(key, 0)) {
+        Py_RETURN_NONE;
+    }
+    const string& found = self->doc->meta[key];
+    return PyUnicode_Decode(found.c_str(), found.size(), "UTF-8", "replace");
+}

 static PyMethodDef Doc_methods[] = {
-    {"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS,
-     doc_getbinurl},
-    {"setbinurl", (PyCFunction)Doc_setbinurl, METH_O,
-     doc_setbinurl},
+    {"getbinurl", (PyCFunction)Doc_getbinurl, METH_NOARGS, doc_Doc_getbinurl},
+    {"setbinurl", (PyCFunction)Doc_setbinurl, METH_O, doc_Doc_setbinurl},
+    {"keys", (PyCFunction)Doc_keys, METH_NOARGS, doc_Doc_keys},
+    {"items", (PyCFunction)Doc_items, METH_NOARGS, doc_Doc_items},
+    {"get", (PyCFunction)Doc_get, METH_VARARGS, doc_Doc_get},
    {NULL}  /* Sentinel */
 };

@ -380,8 +461,6 @@ Doc_getattr(recoll_DocObject *self, char *name)
    case 'f':
 	if (!key.compare(Rcl::Doc::keyfs)) {
 	    value = self->doc->fbytes; found = true;
-	} else if (!key.compare(Rcl::Doc::keyfs)) {
-	    value = self->doc->fbytes; found = true;
 	} else if (!key.compare(Rcl::Doc::keyfmt)) {
 	    value = self->doc->fmtime; found = true;
 	}
@ -419,6 +498,11 @@ Doc_getattr(recoll_DocObject *self, char *name)
 		self->doc->dbytes; found = true;
 	}
 	break;
+    case 't':
+	if (!key.compare("text")) {
+	    value = self->doc->text; found = true;
+	}
+	break;
    }

    if (!found) {
@ -432,9 +516,16 @@ Doc_getattr(recoll_DocObject *self, char *name)

 	if (self->doc->getmeta(key, 0)) {
 	    value = self->doc->meta[key];
+	    found = true;
 	}
    }

+    if (!found) {
+	LOGDEB(("Doc_getattr: name [%s] key [%s] Not found\n",
+		name, key.c_str()));
+	Py_RETURN_NONE;
+    }
+
    LOGDEB(("Doc_getattr: [%s] (%s) -> [%s]\n",
 	    name, key.c_str(), value.c_str()));
    // Return a python unicode object
@ -683,10 +774,17 @@ Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
 {
    LOGDEB(("Query_sortby\n"));
    static const char *kwlist[] = {"field", "ascending", NULL};
+    PyObject *ascobj = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i", (char**)kwlist,
 				     &self->sortfield,
-				     &self->ascending))
+				     &ascobj))
 	return 0;
+
+    if (ascobj != 0 && !PyObject_IsTrue(ascobj))
+	self->ascending = false;
+    else 
+	self->ascending = true;
+
    Py_RETURN_NONE;
 }

@ -707,13 +805,15 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
    char *sutf8 = 0; // needs freeing
    char *sstemlang = 0;
    int dostem = 1;
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|ies:Query_execute", 
+    PyObject *dostemobj = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|Oes:Query_execute", 
 				     (char**)kwlist, "utf-8", &sutf8,
-				     &dostem, 
+				     &dostemobj, 
 				     "utf-8", &sstemlang)) {
 	return 0;
    }
-
+    if (dostemobj != 0 && !PyObject_IsTrue(dostemobj))
+	dostem = 0;

    string utf8(sutf8);
    PyMem_Free(sutf8);
@ -828,15 +928,274 @@ Query_fetchone(recoll_QueryObject* self, PyObject *, PyObject *)
    return (PyObject *)result;
 }

+
+/* Docstring for Query.highlight(). The keyword names match the kwlist
+ * used by Query_highlight: text, ishtml, methods. */
+PyDoc_STRVAR(doc_Query_highlight,
+"highlight(text, ishtml = 0/1, methods = object)\n"
+"Will insert <span class=\"rclmatch\"></span> tags around the match areas\n"
+"in the input text and return the modified text\n"
+"ishtml can be set to indicate that the input text is html and html special\n"
+" characters should not be escaped\n"
+"methods if set should be an object with methods startMatch(i) and endMatch()\n"
+"  which will be called for each match and should return a begin and end tag\n"
+);
+
+/* PlainToRich specialization which asks a Python object for the tags to
+ * insert around matches.
+ *
+ * If no object was supplied, or if calling it fails or returns something
+ * which can't be turned into a byte string, default <span> tags are used
+ * and the Python error, if any, is cleared.
+ */
+class PyPlainToRich: public PlainToRich {
+public:
+    PyPlainToRich(PyObject *methods)
+        : m_methods(methods)
+    {
+    }
+    virtual ~PyPlainToRich()
+    {
+    }
+
+    /* Opening tag for match number idx: methods.startMatch(idx) */
+    virtual string startMatch(unsigned int idx)
+    {
+        static const char dflt[] = "<span class=\"rclmatch\">";
+        if (m_methods == 0)
+            return dflt;
+        PyObject *res = PyObject_CallMethod(m_methods, (char *)"startMatch", 
+                                            (char *)"(i)", idx);
+        return tagFromResult(res, dflt);
+    } 
+
+    /* Closing tag: methods.endMatch() */
+    virtual string endMatch() 
+    {
+        static const char dflt[] = "</span>";
+        if (m_methods == 0)
+            return dflt;
+        PyObject *res = PyObject_CallMethod(m_methods, (char *)"endMatch", 0);
+        return tagFromResult(res, dflt);
+    }
+
+    PyObject *m_methods;
+
+private:
+    /* Turn the result of a Python call into a tag string and release it.
+     * res is a new reference or NULL. Unicode results are encoded to
+     * UTF-8. Falls back to dflt on any failure. */
+    static string tagFromResult(PyObject *res, const char *dflt)
+    {
+        if (res == 0) {
+            // The call failed: don't leave the exception pending, as our
+            // caller has no way to report it.
+            PyErr_Clear();
+            return dflt;
+        }
+        string tag(dflt);
+        PyObject *bytes = res;
+        if (PyUnicode_Check(res))
+            bytes = PyUnicode_AsUTF8String(res); // new reference or NULL
+        const char *cp = bytes ? PyString_AsString(bytes) : 0;
+        if (cp)
+            tag = cp;
+        else
+            PyErr_Clear();
+        if (bytes != res)
+            Py_XDECREF(bytes);
+        Py_DECREF(res);
+        return tag;
+    }
+};
+
+/* Query.highlight(text, ishtml=False, methods=None) -> unicode
+ *
+ * Inserts tags around the query terms found in text and returns the
+ * result. The tags come from the methods object if set (see
+ * PyPlainToRich), else defaults are used.
+ * Returns NULL with a Python exception set on error.
+ */
+static PyObject *
+Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
+{
+    LOGDEB1(("Query_highlight\n"));
+    static const char *kwlist[] = {"text", "ishtml", "methods", NULL};
+    char *sutf8 = 0; // needs freeing
+    PyObject *ishtmlobj = 0;
+    PyObject *methods = 0;
+    // Both optional arguments use the "O" unit, so both must be received
+    // in PyObject pointers.
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OO:Query_highlight",
+                                     (char**)kwlist, 
+                                     "utf-8", &sutf8,
+                                     &ishtmlobj,
+                                     &methods)) {
+        return 0;
+    }
+    string utf8(sutf8);
+    PyMem_Free(sutf8);
+
+    int ishtml = 0;
+    if (ishtmlobj != 0) {
+        int truth = PyObject_IsTrue(ishtmlobj);
+        if (truth < 0)
+            return 0;
+        ishtml = truth;
+    }
+    // An explicit None is the same as no methods object
+    if (methods == Py_None)
+        methods = 0;
+    LOGDEB(("Query_highlight: [%s] ishtml %d\n", utf8.c_str(), ishtml));
+
+    if (self->query == 0 || 
+        the_queries.find(self->query) == the_queries.end()) {
+        PyErr_SetString(PyExc_AttributeError, "query");
+        return 0;
+    }
+
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+        PyErr_SetString(PyExc_ValueError, "Query not initialized");
+        return 0;
+    }
+    HighlightData hldata;
+    sd->getTerms(hldata);
+    PyPlainToRich hler(methods);
+    hler.set_inputhtml(ishtml);
+    list<string> out;
+    hler.plaintorich(utf8, out, hldata, 5000000);
+    if (out.empty()) {
+        PyErr_SetString(PyExc_ValueError, "Plaintorich failed");
+        return 0;
+    }
+    // Return the new unicode object directly: copying it through
+    // Py_BuildValue would leak the original. NULL (with the exception set)
+    // is passed on if the decoding fails.
+    return PyUnicode_FromStringAndSize(out.begin()->c_str(),
+                                       out.begin()->size());
+}
+
+PyDoc_STRVAR(doc_Query_makedocabstract,
+"makedocabstract(doc, methods = object)\n"
+"Will create a snippets abstract for doc by selecting text around the match\n"
+" terms\n"
+"If methods is set, will also perform highlighting. See the highlight method\n"
+);
+/* Query.makedocabstract(doc, methods=None) -> unicode
+ *
+ * Without methods, returns the abstract built by the query object. With
+ * methods, each snippet is run through the highlighter and the results
+ * are concatenated, each followed by "...".
+ * Returns NULL with a Python exception set on error.
+ */
+static PyObject *
+Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
+{
+    LOGDEB(("Query_makedocabstract\n"));
+    static const char *kwlist[] = {"doc", "methods", NULL};
+    recoll_DocObject *pydoc = 0;
+    PyObject *hlmethods = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_makeDocAbstract",
+                                     (char **)kwlist,
+                                     &recoll_DocType, &pydoc,
+                                     &hlmethods)) {
+        return 0;
+    }
+
+    if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
+        LOGERR(("Query_makeDocAbstract: doc not found %p\n", pydoc->doc));
+        PyErr_SetString(PyExc_AttributeError, "doc");
+        return 0;
+    }
+    if (the_queries.find(self->query) == the_queries.end()) {
+        LOGERR(("Query_makeDocAbstract: query not found %p\n", self->query));
+        PyErr_SetString(PyExc_AttributeError, "query");
+        return 0;
+    }
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+        PyErr_SetString(PyExc_ValueError, "Query not initialized");
+        return 0;
+    }
+    string abstract;
+    if (hlmethods == 0) {
+        if (!self->query->makeDocAbstract(*(pydoc->doc), abstract)) {
+            PyErr_SetString(PyExc_EnvironmentError, 
+                            "rcl makeDocAbstract failed");
+            return 0;
+        }
+    } else {
+        HighlightData hldata;
+        sd->getTerms(hldata);
+        PyPlainToRich hler(hlmethods);
+        hler.set_inputhtml(0);
+        vector<string> vabs;
+        self->query->makeDocAbstract(*pydoc->doc, vabs);
+        for (unsigned int i = 0; i < vabs.size(); i++) {
+            if (vabs[i].empty())
+                continue;
+            // There may be data like page numbers before the snippet text,
+            // ending with a closing bracket. It is kept out of the
+            // highlighting and copied as is before the result.
+            string prefix;
+            string snippet = vabs[i];
+            string::size_type bckt = vabs[i].find("]");
+            if (bckt != string::npos) {
+                prefix = vabs[i].substr(0, bckt);
+                snippet = vabs[i].substr(bckt);
+            }
+            list<string> lr;
+            hler.plaintorich(snippet, lr, hldata);
+            // No output for this snippet: skip it, front() on an empty
+            // list is undefined.
+            if (lr.empty())
+                continue;
+            abstract += prefix + lr.front();
+            abstract += "...";
+        }
+    }
+
+    // Return a python unicode object
+    return PyUnicode_Decode(abstract.c_str(), abstract.size(), 
+                            "UTF-8", "replace");
+}
+
+
+PyDoc_STRVAR(doc_Query_getxquery,
+"getxquery(None) -> Unicode string\n"
+"\n"
+"Retrieves the Xapian query description as a Unicode string.\n"
+"Meaningful only after executexx\n"
+);
+/* Query.getxquery(): return the description string of the search data
+ * attached to the query, as a unicode object. */
+static PyObject *
+Query_getxquery(recoll_QueryObject* self, PyObject *, PyObject *)
+{
+    LOGDEB(("Query_getxquery\n"));
+
+    const bool known = self->query != 0 && 
+        the_queries.find(self->query) != the_queries.end();
+    if (!known) {
+        PyErr_SetString(PyExc_AttributeError, "query");
+        return 0;
+    }
+
+    RefCntr<Rcl::SearchData> searchdata = self->query->getSD();
+    if (searchdata.isNull()) {
+        PyErr_SetString(PyExc_ValueError, "Query not initialized");
+        return 0;
+    }
+
+    const string description = searchdata->getDescription();
+    return PyUnicode_Decode(description.c_str(), description.size(), 
+                            "UTF-8", "replace");
+}
+
+/* Build a new Python list of unicode objects from a sequence of UTF-8
+ * strings. Returns a new reference, or NULL with an exception set. */
+template <class StrSeq> static PyObject *
+strseqToPyList(const StrSeq& terms)
+{
+    PyObject *lst = PyList_New(terms.size());
+    if (lst == 0)
+        return 0;
+    for (unsigned int j = 0; j < terms.size(); j++) {
+        PyObject *term = PyUnicode_Decode(terms[j].c_str(), terms[j].size(), 
+                                          "UTF-8", "replace");
+        if (term == 0) {
+            Py_DECREF(lst);
+            return 0;
+        }
+        // PyList_SetItem steals the reference
+        PyList_SetItem(lst, j, term);
+    }
+    return lst;
+}
+
+PyDoc_STRVAR(doc_Query_getgroups,
+"getgroups(None) -> a list of pairs\n"
+"\n"
+"Retrieves the expanded query terms. Meaningful only after executexx\n"
+"In each pair, the first entry is a list of user terms, the second a list of\n"
+"query terms as derived from the user terms and used in the Xapian Query.\n"
+"The size of each list is one for simple terms, or more for group and phrase\n"
+"clauses\n"
+);
+/* Query.getgroups(): see the docstring above.
+ * Returns a new list, or NULL with a Python exception set on error. */
+static PyObject *
+Query_getgroups(recoll_QueryObject* self, PyObject *, PyObject *)
+{
+    LOGDEB(("Query_getgroups\n"));
+
+    if (self->query == 0 || 
+        the_queries.find(self->query) == the_queries.end()) {
+        PyErr_SetString(PyExc_AttributeError, "query");
+        return 0;
+    }
+    RefCntr<Rcl::SearchData> sd = self->query->getSD();
+    if (sd.isNull()) {
+        PyErr_SetString(PyExc_ValueError, "Query not initialized");
+        return 0;
+    }
+    HighlightData hld;
+    sd->getTerms(hld);
+    PyObject *mainlist = PyList_New(0);
+    if (mainlist == 0)
+        return 0;
+    // We walk the groups vector. For each we retrieve the user group,
+    // make a python list of each, then group those in a pair, and
+    // append this to the main list.
+    for (unsigned int i = 0; i < hld.groups.size(); i++) {
+        unsigned int ugidx = hld.grpsugidx[i];
+        PyObject *ulist = strseqToPyList(hld.ugroups[ugidx]);
+        PyObject *xlist = ulist ? strseqToPyList(hld.groups[i]) : 0;
+        // The tuple and the main list both take their own references, so
+        // ours are released once each object has been handed over.
+        PyObject *pair = xlist ? PyTuple_Pack(2, ulist, xlist) : 0;
+        Py_XDECREF(ulist);
+        Py_XDECREF(xlist);
+        if (pair == 0 || PyList_Append(mainlist, pair) < 0) {
+            Py_XDECREF(pair);
+            Py_DECREF(mainlist);
+            return 0;
+        }
+        Py_DECREF(pair);
+    }
+    return mainlist;
+}
+
 static PyMethodDef Query_methods[] = {
    {"execute", (PyCFunction)Query_execute, METH_VARARGS|METH_KEYWORDS, 
     doc_Query_execute},
    {"executesd", (PyCFunction)Query_executesd, METH_VARARGS|METH_KEYWORDS, 
     doc_Query_executesd},
-    {"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,
+    {"fetchone", (PyCFunction)Query_fetchone, METH_NOARGS,
     doc_Query_fetchone},
    {"sortby", (PyCFunction)Query_sortby, METH_VARARGS|METH_KEYWORDS,
     doc_Query_sortby},
+    {"highlight", (PyCFunction)Query_highlight, METH_VARARGS|METH_KEYWORDS,
+     doc_Query_highlight},
+    {"getxquery", (PyCFunction)Query_getxquery, METH_NOARGS,
+     doc_Query_getxquery},
+    {"getgroups", (PyCFunction)Query_getgroups, METH_NOARGS,
+     doc_Query_getgroups},
+    {"makedocabstract", (PyCFunction)Query_makedocabstract, 
+     METH_VARARGS|METH_KEYWORDS, doc_Query_makedocabstract},
    {NULL}  /* Sentinel */
 };

@ -1037,12 +1396,13 @@ Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
        PyErr_SetString(PyExc_AttributeError, "db id not found");
        return 0;
    }
+    LOGDEB(("Db_setAbstractParams: mxchrs %d, ctxwrds %d\n", maxchars, ctxwords));
    self->db->setAbstractParams(-1, maxchars, ctxwords);
    Py_RETURN_NONE;
 }

 static PyObject *
-Db_makeDocAbstract(recoll_DbObject* self, PyObject *args, PyObject *)
+Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
 {
    LOGDEB(("Db_makeDocAbstract\n"));
    recoll_DocObject *pydoc = 0;
@ -1341,4 +1701,8 @@ initrecoll(void)
    PyModule_AddStringConstant(m, "__doc__",
                               pyrecoll_doc_string);

+    
+    PyObject* doctypecapsule = 
+	PyCapsule_New(&recoll_DocType, "recoll.doctype", 0);
+    PyModule_AddObject(m, "doctype", doctypecapsule);
 }
--- a/src/python/recoll/pyrecoll.h
+++ b/src/python/recoll/pyrecoll.h
@ -0,0 +1,31 @@
+/* Copyright (C) 2012 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#ifndef _PYRECOLL_H_INCLUDED_
+#define _PYRECOLL_H_INCLUDED_
+
+#include <Python.h>
+
+/* Forward declarations: only pointers to these are stored below, so the
+ * header can be included without the full definitions. */
+class RclConfig;
+namespace Rcl {
+class Doc;
+}
+
+/* Python object wrapping a Recoll document. Shared by the recoll and
+ * rclextract extension modules. */
+typedef struct {
+    PyObject_HEAD
+    /* Type-specific fields go here. */
+    Rcl::Doc *doc;
+    /* Each doc object has a pointer to the global config, for convenience */
+    RclConfig *rclconfig; 
+} recoll_DocObject;
+
+#endif // _PYRECOLL_H_INCLUDED_
--- a/src/python/recoll/setup.py.in
+++ b/src/python/recoll/setup.py.in
@ -53,6 +53,23 @@ module1 = Extension('recoll',
                    sources = ['pyrecoll.cpp',
                               ])

+# Build description for the 'rclextract' extension module, compiled from
+# pyrclextract.cpp. It links with the libraries listed in libs, found in
+# libdirs.
+module2 = Extension('rclextract',
+                    define_macros = [('MAJOR_VERSION', '1'),
+                                     ('MINOR_VERSION', '0'),
+                                     ('UNAC_VERSION', '"1.0.7"'),
+                                     ('RECOLL_DATADIR', '"@QTRECOLL_DATADIR@"')
+                                     ],
+                    include_dirs = ['/usr/local/include',
+                                    os.path.join(top, 'utils'), 
+                                    os.path.join(top, 'common'), 
+                                    os.path.join(top, 'internfile'), 
+                                    os.path.join(top, 'rcldb'), 
+                                    ],
+                    libraries = libs,
+                    library_dirs = libdirs,
+                    sources = ['pyrclextract.cpp',
+                               ])
+
 setup (name = 'Recoll',
       version = '1.0',
       description = 'Query/Augment a Recoll full text index',
@ -60,4 +77,4 @@ setup (name = 'Recoll',
       author_email = 'jfd@recoll.org',
       long_description = '''
 ''',
-       ext_modules = [module1])
+       ext_modules = [module1, module2])
--- a/src/python/samples/recollq.py
+++ b/src/python/samples/recollq.py
@ -7,6 +7,7 @@ This could actually be useful for something after some customization
 import sys
 from getopt import getopt
 import recoll
+import rclextract

 allmeta = ("title", "keywords", "abstract", "url", "mimetype", "mtime",
           "ipath", "fbytes", "dbytes", "relevancyrating")
@ -15,26 +16,59 @@ def Usage():
    print >> sys.stderr, "Usage: recollq.py [-c conf] [-i extra_index] <recoll query>"
    sys.exit(1);

+class ptrmeths:
+    """Tag generator passed as the 'methods' argument of the query
+    highlighting calls: startMatch() and endMatch() return the tags to
+    insert before and after a match."""
+    def __init__(self, groups):
+        # groups: the list of pairs returned by query.getgroups()
+        self.groups = groups
+    def startMatch(self, idx):
+        # NOTE(review): the attribute is named ugroup but the second element
+        # of the pair is used; getgroups() documents the first element as
+        # the user terms -- confirm which one is wanted.
+        terms = self.groups[idx][1]
+        return '<span class="pyrclstart" idx="%d" ugroup="%s">' % \
+            (idx, " ".join(terms))
+    def endMatch(self):
+        return '</span>'
+    
+def extract(doc):
+    """Return the doc produced by running an rclextract.Extractor built
+    from doc on the doc's own ipath."""
+    return rclextract.Extractor(doc).extract(doc.ipath)
+
 def doquery(db, q):
    # Get query object
    query = db.query()
    # Parse/run input query string
-    nres = query.execute(q, stemming = 1, stemlang="english")
+    nres = query.execute(q, stemming = 0, stemlang="english")
+    qs = u"Xapian query: [%s]" % query.getxquery()
+    print(qs.encode("utf-8"))
+    groups = query.getgroups()
+    print "Groups:", groups
+    m = ptrmeths(groups)

    # Print results:
    print "Result count: ", nres
+    if nres > 20:
+        nres = 20
    while query.next >= 0 and query.next < nres: 
        doc = query.fetchone()
        print query.next, ":",
+#        for k,v in doc.items().items():
+#            print "KEY:", k.encode('utf-8'), "VALUE", v.encode('utf-8')
+#        continue
        for k in ("title", "mtime", "author"):
-            print k, ":", getattr(doc, k).encode('utf-8')
+            value = getattr(doc, k)
+#            value = doc.get(k)
+            if value is None:
+                print k, ":", "(None)"
+            else:
+                print k, ":", value.encode('utf-8')
        #doc.setbinurl(bytearray("toto"))
        #burl = doc.getbinurl(); print "Bin URL :", doc.getbinurl()
-        abs = db.makeDocAbstract(doc, query).encode('utf-8')
-        print abs
+        abs = query.makedocabstract(doc, methods=m)
+        print abs.encode('utf-8')
        print
+#        fulldoc = extract(doc)
+#        print "FULLDOC MIMETYPE", fulldoc.mimetype, "TEXT:", fulldoc.text.encode("utf-8")


+########################################### MAIN
+
 if len(sys.argv) < 2:
    Usage()

@ -68,5 +102,3 @@ db = recoll.connect(confdir=confdir,
 db.setAbstractParams(maxchars=maxchars, contextwords=contextwords)

 doquery(db, q)
-
-