diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index a6c146ed..1b1a050c 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.5 2008-07-01 08:24:30 dockes Exp $ (C) 2007 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.6 2008-08-26 07:36:41 dockes Exp $ (C) 2007 J.F.Dockes"; #endif #include @@ -35,6 +35,8 @@ PyObject *obj_Create(PyTypeObject *tp, PyObject *args, PyObject *kwargs) return result; } +////////////////////////////////////////////////////// +////// Python object definitions for Db, Query, and Doc typedef struct { PyObject_HEAD /* Type-specific fields go here. */ @@ -46,7 +48,7 @@ static PyTypeObject recollq_DbType = { "recollq.Db", /*tp_name*/ sizeof(recollq_DbObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ - 0, /*tp_dealloc*/ + 0, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ @@ -62,26 +64,27 @@ static PyTypeObject recollq_DbType = { 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/ - "Recollq Db objects", /* tp_doc */ + "Recollq Db objects", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ + 0, /* tp_methods */ + 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ + 0, /* tp_init */ 0, /* tp_alloc */ - 0, /* tp_new */ + 0, /* tp_new */ }; + typedef struct { PyObject_HEAD /* Type-specific fields go here. 
*/ @@ -111,24 +114,24 @@ static PyTypeObject recollq_QueryType = { 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/ - "Recollq Query objects", /* tp_doc */ + "Recollq Query object", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ + 0, /* tp_methods */ + 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ + 0, /* tp_init */ 0, /* tp_alloc */ - 0, /* tp_new */ + 0, /* tp_new */ }; typedef struct { PyObject_HEAD @@ -158,26 +161,28 @@ static PyTypeObject recollq_DocType = { 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/ - "Recollq Doc objects", /* tp_doc */ + "Recollq Doc objects", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ + 0, /* tp_methods */ + 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ + 0, /* tp_init */ 0, /* tp_alloc */ - 0, /* tp_new */ + 0, /* tp_new */ }; +/////////////////////////////////////////////// +////// Db object code static void Db_dealloc(recollq_DbObject *self) { @@ -206,12 +211,13 @@ static int Db_init(recollq_DbObject *self, PyObject *args, PyObject *kwargs) { LOGDEB(("Db_init\n")); - static char *kwlist[] = {"confdir", "extra_dbs", NULL}; + static char *kwlist[] = {"confdir", "extra_dbs", "writable", NULL}; PyObject *extradbs = 0; char *confdir = 0; + int writable = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|sO", kwlist, - &confdir, &extradbs)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|sOi", kwlist, + &confdir, &extradbs, &writable)) return -1; // 
If the user creates several dbs, changing the confdir, we call
@@ -239,9 +245,10 @@ Db_init(recollq_DbObject *self, PyObject *args, PyObject *kwargs)
     self->db = new Rcl::Db;
     string dbdir = rclconfig->getDbDir();
     LOGDEB(("Db_init: getdbdir ok: [%s]\n", dbdir.c_str()));
-    if (!self->db->open(dbdir, rclconfig->getStopfile(), Rcl::Db::DbRO)) {
+    if (!self->db->open(dbdir, rclconfig->getStopfile(), writable ?
+                        Rcl::Db::DbUpd : Rcl::Db::DbRO)) {
         LOGDEB(("Db_init: db open error\n"));
-        PyErr_SetString(PyExc_EnvironmentError, "Cant open index");
+        PyErr_SetString(PyExc_EnvironmentError, "Can't open index");
         return -1;
     }
 
@@ -355,20 +362,99 @@ Db_makeDocAbstract(recollq_DbObject* self, PyObject *args, PyObject *)
                                      "UTF-8", "replace");
 }
 
+// Db.needUpdate(udi, sig): returns the result of Rcl::Db::needUpdate()
+// for this document identifier and signature, as a Python integer.
+// Raises AttributeError if self->db is null or not found in the_dbs.
+static PyObject *
+Db_needUpdate(recollq_DbObject* self, PyObject *args, PyObject *kwds)
+{
+    char *udi = 0;
+    char *sig = 0;
+    LOGDEB(("Db_needUpdate\n"));
+    if (!PyArg_ParseTuple(args, "eses:Db_needUpdate",
+                          "utf-8", &udi, "utf-8", &sig)) {
+        return 0;
+    }
+    if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
+        LOGERR(("Db_needUpdate: db not found %p\n", self->db));
+        PyErr_SetString(PyExc_AttributeError, "db");
+        // The "es" buffers belong to us once parsing has succeeded:
+        // release them on the error path too.
+        PyMem_Free(udi);
+        PyMem_Free(sig);
+        return 0;
+    }
+    bool result = self->db->needUpdate(udi, sig);
+    PyMem_Free(udi);
+    PyMem_Free(sig);
+    return Py_BuildValue("i", result);
+}
+
+// Db.addOrUpdate(udi, parent_udi, doc): hands the Doc object's data to
+// Rcl::Db::addOrUpdate(). Returns None. Raises AttributeError if the Db
+// or Doc object is not valid, or if the index operation fails.
+static PyObject *
+Db_addOrUpdate(recollq_DbObject* self, PyObject *args, PyObject *)
+{
+    LOGDEB(("Db_addOrUpdate\n"));
+    char *udi = 0;
+    char *parent_udi = 0;
+
+    recollq_DocObject *pydoc;
+
+    if (!PyArg_ParseTuple(args, "esesO!:Db_addOrUpdate",
+                          "utf-8", &udi, "utf-8", &parent_udi,
+                          &recollq_DocType, &pydoc)) {
+        return 0;
+    }
+    if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
+        LOGERR(("Db_addOrUpdate: db not found %p\n", self->db));
+        PyErr_SetString(PyExc_AttributeError, "db");
+        PyMem_Free(udi);
+        PyMem_Free(parent_udi);
+        return 0;
+    }
+    if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
+        LOGERR(("Db_addOrUpdate: doc not found %p\n", pydoc->doc));
+        PyErr_SetString(PyExc_AttributeError, "doc");
+        PyMem_Free(udi);
+        PyMem_Free(parent_udi);
+        return 0;
+    }
+    if (!self->db->addOrUpdate(udi, parent_udi, *pydoc->doc)) {
+        LOGERR(("Db_addOrUpdate: rcldb error\n"));
+        PyErr_SetString(PyExc_AttributeError, "rcldb error");
+        PyMem_Free(udi);
+        PyMem_Free(parent_udi);
+        return 0;
+    }
+    PyMem_Free(udi);
+    PyMem_Free(parent_udi);
+    Py_RETURN_NONE;
+}
+
 static PyMethodDef Db_methods[] = {
     {"query", (PyCFunction)Db_query, METH_NOARGS,
      "Return a new, blank query for this index"
     },
     {"setAbstractParams", (PyCFunction)Db_setAbstractParams,
      METH_VARARGS|METH_KEYWORDS,
-     "Set abstract build params: maxchars and contextwords"
+     "Set abstract build parameters: maxchars and contextwords"
     },
     {"makeDocAbstract", (PyCFunction)Db_makeDocAbstract, METH_VARARGS,
-     "Return a new, blank query for this index"
+     "Build keyword in context abstract for document and query"
+    },
+    {"needUpdate", (PyCFunction)Db_needUpdate, METH_VARARGS,
+     "Check index up to date"
+    },
+    {"addOrUpdate", (PyCFunction)Db_addOrUpdate, METH_VARARGS,
+     "Add or update document in index"
     },
     {NULL}  /* Sentinel */
 };
 
+/////////////////////////////////////////////
+/// Query object method
 static void
 Query_dealloc(recollq_QueryObject *self)
 {
@@ -394,6 +480,9 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     return (PyObject *)self;
 }
 
+// Query_init creates an unusable object. The only way to create a
+// valid Query Object is through db_query(). 
(or we'd need to add a Db +// parameter to the Query object creation method) static int Query_init(recollq_QueryObject *self, PyObject *, PyObject *) { @@ -411,9 +486,8 @@ static PyObject * Query_execute(recollq_QueryObject* self, PyObject *args, PyObject *kwds) { char *utf8 = 0; - int len = 0; LOGDEB(("Query_execute\n")); - if (!PyArg_ParseTuple(args, "es#:Query_execute", "utf-8", &utf8, &len)) { + if (!PyArg_ParseTuple(args, "es:Query_execute", "utf-8", &utf8)) { return 0; } @@ -425,6 +499,7 @@ Query_execute(recollq_QueryObject* self, PyObject *args, PyObject *kwds) } string reason; Rcl::SearchData *sd = wasaStringToRcl(utf8, reason); + PyMem_Free(utf8); if (!sd) { PyErr_SetString(PyExc_ValueError, reason.c_str()); return 0; @@ -451,24 +526,22 @@ Query_fetchone(recollq_QueryObject* self, PyObject *, PyObject *) PyErr_SetString(PyExc_AttributeError, "query: no results"); return 0; } - Rcl::Doc *doc = new Rcl::Doc; + recollq_DocObject *result = + (recollq_DocObject *)obj_Create(&recollq_DocType, 0, 0); + if (!result) { + LOGERR(("Query_fetchone: couldn't create doc object for result\n")); + return 0; + } int percent; - if (!self->query->getDoc(self->next, *doc, &percent)) { + if (!self->query->getDoc(self->next, *result->doc, &percent)) { PyErr_SetString(PyExc_EnvironmentError, "query: cant fetch result"); self->next = -1; return 0; } self->next++; - recollq_DocObject *result = - (recollq_DocObject *)obj_Create(&recollq_DocType, 0, 0); - if (!result) { - delete doc; - return 0; - } - result->doc = doc; - the_docs.insert(result->doc); // Move some data from the dedicated fields to the meta array to make // fetching attributes easier + Rcl::Doc *doc = result->doc; printableUrl(rclconfig->getDefCharset(), doc->url, doc->meta["url"]); doc->meta["mimetype"] = doc->mimetype; doc->meta["mtime"] = doc->dmtime.empty() ? 
doc->fmtime : doc->dmtime;
@@ -502,7 +575,8 @@ static PyMemberDef Query_members[] = {
     {NULL}  /* Sentinel */
 };
 
-
+///////////////////////////////////////////////////////////////////////
+///// Doc object methods
 static void
 Doc_dealloc(recollq_DocObject *self)
 {
@@ -534,14 +608,21 @@ Doc_init(recollq_DocObject *self, PyObject *, PyObject *)
     if (self->doc)
         the_docs.erase(self->doc);
     delete self->doc;
-    self->doc = 0;
+    self->doc = new Rcl::Doc;
+    if (self->doc == 0)
+        return -1;
+    the_docs.insert(self->doc);
     return 0;
 }
 
+// The "closure" thing is actually the meta field name. This is how
+// python allows one set of get/set functions to get/set different
+// attributes (pass them an additional parameters as from the
+// getseters table and call it a "closure"
 static PyObject *
 Doc_getmeta(recollq_DocObject *self, void *closure)
 {
-    LOGDEB(("Doc_getmeta\n"));
+    LOGDEB(("Doc_getmeta: [%s]\n", (const char *)closure));
     if (self->doc == 0 ||
         the_docs.find(self->doc) == the_docs.end()) {
         PyErr_SetString(PyExc_AttributeError, "doc");
@@ -568,32 +649,134 @@ Doc_getmeta(recollq_DocObject *self, void *closure)
 static int
 Doc_setmeta(recollq_DocObject *self, PyObject *value, void *closure)
 {
-    PyErr_SetString(PyExc_RuntimeError, "Cannot set attributes for now");
-    return -1;
+    // Shared setter for every entry of Doc_getseters: closure is the meta
+    // field name. A str or unicode value is stored UTF-8 encoded in
+    // doc->meta[key], and also copied to the matching dedicated Rcl::Doc
+    // field when there is one. Returns 0, or -1 with an exception set.
+    if (self->doc == 0 ||
+        the_docs.find(self->doc) == the_docs.end()) {
+        PyErr_SetString(PyExc_AttributeError, "doc??");
+        return -1;
+    }
+    LOGDEB2(("Doc_setmeta: doc %p\n", self->doc));
+
+    const char *key = (const char *)closure;
+    if (key == 0) {
+        PyErr_SetString(PyExc_AttributeError, "key??");
+        return -1;
+    }
+    // Python calls a setter with a NULL value for "del obj.attr": refuse
+    // it instead of dereferencing the null pointer below.
+    if (value == 0) {
+        PyErr_SetString(PyExc_TypeError, "Cannot delete Doc attributes");
+        return -1;
+    }
+
+    // Get an owned reference to a unicode object: PyUnicode_FromObject()
+    // returns a new reference, so the unicode case takes one as well.
+    PyObject *uni = 0;
+    if (PyString_Check(value)) {
+        uni = PyUnicode_FromObject(value);
+        if (uni == 0)
+            return -1;
+    } else if (PyUnicode_Check(value)) {
+        uni = value;
+        Py_INCREF(uni);
+    } else {
+        PyErr_SetString(PyExc_AttributeError, "value not str/unicode??");
+        return -1;
+    }
+
+    PyObject* putf8 = PyUnicode_AsUTF8String(uni);
+    Py_DECREF(uni);
+    if (putf8 == 0) {
+        LOGERR(("Doc_setmeta: encoding to utf8 failed\n"));
+        PyErr_SetString(PyExc_AttributeError, "value??");
+        return -1;
+    }
+
+    char* uvalue = PyString_AsString(putf8);
+    if (uvalue == 0) {
+        Py_DECREF(putf8);
+        return -1;
+    }
+
+    LOGDEB(("Doc_setmeta: setting [%s] to [%s]\n", key, uvalue));
+    self->doc->meta[key] = uvalue;
+    switch (key[0]) {
+    case 'd':
+        if (!strcmp(key, "dbytes")) {
+            self->doc->dbytes = uvalue;
+        }
+        break;
+    case 'f':
+        if (!strcmp(key, "fbytes")) {
+            self->doc->fbytes = uvalue;
+        }
+        break;
+    case 'i':
+        if (!strcmp(key, "ipath")) {
+            self->doc->ipath = uvalue;
+        }
+        break;
+    case 'm':
+        if (!strcmp(key, "mimetype")) {
+            self->doc->mimetype = uvalue;
+        } else if (!strcmp(key, "mtime")) {
+            self->doc->dmtime = uvalue;
+        }
+        break;
+    case 's':
+        if (!strcmp(key, "sig")) {
+            self->doc->sig = uvalue;
+        }
+        break;
+    case 't':
+        if (!strcmp(key, "text")) {
+            self->doc->text = uvalue;
+        }
+        break;
+    case 'u':
+        if (!strcmp(key, "url")) {
+            self->doc->url = uvalue;
+        }
+        break;
+    default:
+        break;
+    }
+    // uvalue points into putf8: only release it once the copies are made
+    Py_DECREF(putf8);
+    return 0;
 }
 
 static PyGetSetDef Doc_getseters[] = {
     // Name, get, set, doc, closure
-    {"title", (getter)Doc_getmeta, (setter)Doc_setmeta,
-     "title", (void *)"title"},
-    {"keywords", (getter)Doc_getmeta, (setter)Doc_setmeta,
-     "keywords", (void *)"keywords"},
-    {"abstract", (getter)Doc_getmeta, (setter)Doc_setmeta,
-     "abstract", (void *)"abstract"},
     {"url", (getter)Doc_getmeta, (setter)Doc_setmeta,
      "url", (void *)"url"},
+    {"ipath", (getter)Doc_getmeta, (setter)Doc_setmeta,
+     "ipath", (void *)"ipath"},
     {"mimetype", (getter)Doc_getmeta, (setter)Doc_setmeta,
      "mimetype", (void *)"mimetype"},
     {"mtime", (getter)Doc_getmeta, (setter)Doc_setmeta,
      "mtime", (void *)"mtime"},
-    {"ipath", (getter)Doc_getmeta, (setter)Doc_setmeta,
-     "ipath", (void *)"ipath"},
     {"fbytes", (getter)Doc_getmeta, (setter)Doc_setmeta,
      "fbytes", (void *)"fbytes"},
     {"dbytes", (getter)Doc_getmeta, (setter)Doc_setmeta,
      "dbytes", (void *)"dbytes"},
     {"relevance", (getter)Doc_getmeta, (setter)Doc_setmeta,
      "relevance", (void *)"relevance"},
+    {"title", (getter)Doc_getmeta, (setter)Doc_setmeta,
+     "title", (void *)"title"},
+    {"keywords", (getter)Doc_getmeta, 
(setter)Doc_setmeta, + "keywords", (void *)"keywords"}, + {"abstract", (getter)Doc_getmeta, (setter)Doc_setmeta, + "abstract", (void *)"abstract"}, + {"author", (getter)Doc_getmeta, (setter)Doc_setmeta, + "author", (void *)"author"}, + {"text", (getter)Doc_getmeta, (setter)Doc_setmeta, + "text", (void *)"text"}, + {"sig", (getter)Doc_getmeta, (setter)Doc_setmeta, + "sig", (void *)"sig"}, {NULL} /* Sentinel */ }; diff --git a/src/python/recoll/setup.py b/src/python/recoll/setup.py index a561da10..4da0dc9e 100644 --- a/src/python/recoll/setup.py +++ b/src/python/recoll/setup.py @@ -1,4 +1,5 @@ from distutils.core import setup, Extension +top = '../../' module1 = Extension('recollq', define_macros = [('MAJOR_VERSION', '1'), @@ -9,37 +10,37 @@ module1 = Extension('recollq', '"/usr/local/share/recoll"') ], include_dirs = ['/usr/local/include', - '../utils', - '../common', - '../rcldb', - '../query', - '../unac' + top + 'utils', + top + 'common', + top + 'rcldb', + top + 'query', + top + 'unac' ], libraries = ['xapian', 'iconv'], library_dirs = ['/usr/local/lib'], sources = ['recoll_query.cpp', - '../common/rclconfig.cpp', - '../common/rclinit.cpp', - '../common/textsplit.cpp', - '../common/unacpp.cpp', - '../query/wasastringtoquery.cpp', - '../query/wasatorcl.cpp', - '../rcldb/pathhash.cpp', - '../rcldb/rcldb.cpp', - '../rcldb/rclquery.cpp', - '../rcldb/searchdata.cpp', - '../rcldb/stemdb.cpp', - '../rcldb/stoplist.cpp', - '../unac/unac.c', - '../utils/base64.cpp', - '../utils/conftree.cpp', - '../utils/debuglog.cpp', - '../utils/md5.cpp', - '../utils/pathut.cpp', - '../utils/readfile.cpp', - '../utils/smallut.cpp', - '../utils/transcode.cpp', - '../utils/wipedir.cpp' + top + 'common/rclconfig.cpp', + top + 'common/rclinit.cpp', + top + 'common/textsplit.cpp', + top + 'common/unacpp.cpp', + top + 'query/wasastringtoquery.cpp', + top + 'query/wasatorcl.cpp', + top + 'rcldb/pathhash.cpp', + top + 'rcldb/rcldb.cpp', + top + 'rcldb/rclquery.cpp', + top + 
'rcldb/searchdata.cpp',
+                               top + 'rcldb/stemdb.cpp',
+                               top + 'rcldb/stoplist.cpp',
+                               top + 'unac/unac.c',
+                               top + 'utils/base64.cpp',
+                               top + 'utils/conftree.cpp',
+                               top + 'utils/debuglog.cpp',
+                               top + 'utils/md5.cpp',
+                               top + 'utils/pathut.cpp',
+                               top + 'utils/readfile.cpp',
+                               top + 'utils/smallut.cpp',
+                               top + 'utils/transcode.cpp',
+                               top + 'utils/wipedir.cpp'
                                ])
 
 
diff --git a/src/python/samples/rcldlkp.py b/src/python/samples/rcldlkp.py
new file mode 100755
index 00000000..040d1098
--- /dev/null
+++ b/src/python/samples/rcldlkp.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+__doc__ = """
+''Lookup'' notes file indexing
+
+The file format has text notes separated by lines with a single '%' character
+
+If the script is called with just the file name as an argument, it will
+(re)index the contents.
+
+If the script is called with second numeric argument, it will retrieve the
+specified record and output it in html
+"""
+
+import os
+import stat
+import sys
+import re
+
+rclconf = "/Users/dockes/.recoll-test"
+
+# A separator line holds a single '%', possibly followed by blanks
+separator = re.compile("^%[ \t]*$")
+
+def udi(docfile, numrec):
+    """Return the unique document identifier for record numrec of docfile"""
+    return docfile + "#" + str(numrec)
+
+###############################################################
+def index_rec(db, numrec, rec):
+    """Add or update the index entry for record number numrec (text: rec).
+
+    Record 0 is the one which gets the file modification time as signature:
+    this is what the needUpdate() call in the main code tests."""
+    doc = recollq.Doc()
+    # url
+    doc.url = "file://" + docfile
+    # utf8fn
+    # ipath
+    doc.ipath = str(numrec)
+    # mimetype
+    doc.mimetype = "text/plain"
+    # mtime
+    # origcharset
+    # title: second line of the record, or the third if the second is blank
+    lines = rec.split("\n")
+    if len(lines) >= 2:
+        title = unicode(lines[1], "iso-8859-1")
+        if len(title.strip()) == 0 and len(lines) >= 3:
+            title = unicode(lines[2], "iso-8859-1")
+        doc.title = title
+    # keywords
+    # abstract
+    # author
+    # fbytes
+    doc.fbytes = str(fbytes)
+    # text
+    doc.text = unicode(rec, "iso-8859-1")
+    # dbytes
+    doc.dbytes = str(len(rec))
+    # sig
+    if numrec == 0:
+        doc.sig = str(fmtime)
+    db.addOrUpdate(udi(docfile, numrec), u"", doc)
+
+def output_rec(rec):
+    """Print record text rec as an HTML page (UTF-8 encoded)"""
+    # Escape html. '&' has to go first, else it would also change the
+    # entities produced by the other replacements
+    rec = unicode(rec, "iso-8859-1").encode("utf-8")
+    rec = rec.replace("&", "&amp;")
+    rec = rec.replace("<", "&lt;")
+    rec = rec.replace('"', "&quot;")
+    print '<html><head>'
+    print '<meta http-equiv="content-type" content="text/html; charset=utf-8">'
+    print '</head><body><pre>'
+    print rec
+    print '</pre></body></html>'
+
+
+################################################################
+
+def usage():
+    sys.stderr.write("Usage: doclookup.py <filename> [<recnum>]\n")
+    sys.exit(1)
+
+if len(sys.argv) < 2:
+    usage()
+
+docfile = sys.argv[1]
+
+if len(sys.argv) > 2:
+    targetnum = int(sys.argv[2])
+else:
+    targetnum = None
+
+#print docfile, targetnum
+
+stdata = os.stat(docfile)
+fmtime = stdata[stat.ST_MTIME]
+fbytes = stdata[stat.ST_SIZE]
+f = open(docfile, 'r')
+
+if targetnum is None:
+    import recollq
+    db = recollq.connect(confdir=rclconf, writable=1)
+    if not db.needUpdate(udi(docfile, 0), str(fmtime)):
+        sys.exit(0)
+
+rec = ""
+numrec = 1
+for line in f:
+    if separator.match(line):
+        if targetnum is None:
+            index_rec(db, numrec, rec)
+        elif targetnum == numrec:
+            output_rec(rec)
+            sys.exit(0)
+        numrec += 1
+        rec = ""
+    else:
+        rec += line
+
+# A last note which is not followed by a separator line is a record too
+if rec.strip():
+    if targetnum is None:
+        index_rec(db, numrec, rec)
+    elif targetnum == numrec:
+        output_rec(rec)
+        sys.exit(0)
+
+if targetnum is None:
+    index_rec(db, 0, "")
+
diff --git a/src/python/samples/rclmbox.py b/src/python/samples/rclmbox.py
new file mode 100644
index 00000000..3e0e3a80
--- /dev/null
+++ b/src/python/samples/rclmbox.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+import mailbox
+import email.header
+import email.utils
+#import sys
+import recollq
+import os
+import stat
+
+#mbfile = "/Users/dockes/projets/fulltext/testrecoll/mail/fred"
+mbfile = "/Users/dockes/mail/outmail"
+rclconf = "/Users/dockes/.recoll-test"
+
+def header_value(msg, nm, to_utf = False):
+    """Return header nm of msg as a unicode string (or as an utf-8 encoded
+    str if to_utf is set), with the parts found by decode_header() decoded.
+    An empty str is returned if the header is absent."""
+    value = msg.get(nm)
+    if value is None:
+        return ""
+    value = value.replace("\n", "")
+    value = value.replace("\r", "")
+    #print value
+    parts = email.header.decode_header(value)
+    #print parts
+    univalue = u""
+    for part in parts:
+        if part[1] is not None:
+            univalue += unicode(part[0], part[1], "replace") + " "
+        else:
+            # No charset given: raw 8 bit bytes would make the implicit
+            # ascii conversion fail, so decode explicitly
+            univalue += unicode(part[0], "iso-8859-1") + " "
+    if to_utf:
+        return univalue.encode('utf-8')
+    else:
+        return univalue
+
+class mbox_indexer:
+    def __init__(self, mbfile):
+        self.mbfile = mbfile
+        stdata = os.stat(mbfile)
+        self.fmtime = stdata[stat.ST_MTIME]
+        self.fbytes = stdata[stat.ST_SIZE]
+        self.msgnum = 1
+
+    def sig(self):  # signature shared by all messages: file mtime and size
+        return str(self.fmtime) + ":" + str(self.fbytes)
+    def udi(self, msgnum):  # unique document identifier for message msgnum
+        return self.mbfile + ":" + str(msgnum)
+
+    def index(self, db):  # index all messages unless needUpdate() says no
+        if not db.needUpdate(self.udi(1), self.sig()):
+            return None
+        mb = mailbox.mbox(self.mbfile)
+        for msg in mb.values():
+            self.index_message(db, msg)
+            self.msgnum += 1
+
+    def index_message(self, db, msg):  # index msg as number self.msgnum
+        doc = recollq.Doc()
+        doc.author = header_value(msg, "From")
+        # url
+        doc.url = "file://" + self.mbfile
+        # utf8fn
+        # ipath
+        doc.ipath = str(self.msgnum)
+        # mimetype
+        doc.mimetype = "message/rfc822"
+        # mtime: from the Date header, else the file modification time
+        dte = header_value(msg, "Date")
+        tm = email.utils.parsedate_tz(dte)
+        if tm is None:
+            doc.mtime = str(self.fmtime)
+        else:
+            doc.mtime = str(email.utils.mktime_tz(tm))
+        # origcharset
+        # title
+        doc.title = header_value(msg, "Subject")
+        # keywords
+        # abstract
+        # author
+        # fbytes
+        doc.fbytes = str(self.fbytes)
+        # text
+        text = u""
+        text += u"From: " + header_value(msg, "From") + u"\n"
+        text += u"To: " + header_value(msg, "To") + u"\n"
+        text += u"Subject: " + header_value(msg, "Subject") + u"\n"
+        #text += u"Message-ID: " + header_value(msg, "Message-ID") + u"\n"
+        text += u"\n"
+        for part in msg.walk():
+            if part.is_multipart():
+                pass #print "Multipart: " + part.get_content_type()
+            else:
+                ct = part.get_content_type()
+                #print "Simple: " + ct
+                if ct.lower() == "text/plain":
+                    charset = part.get_content_charset("iso-8859-1")
+                    text += unicode(part.get_payload(None, True), charset, "replace")
+        doc.text = text
+        # dbytes: size of the text once encoded, not its character count
+        doc.dbytes = str(len(text.encode("utf-8")))
+        # sig
+        doc.sig = self.sig()
+        udi = self.udi(self.msgnum)
+        db.addOrUpdate(udi, u"", doc)
+
+
+db = recollq.connect(confdir=rclconf, writable=1)
+
+mbidx = mbox_indexer(mbfile)
+mbidx.index(db)
diff --git a/src/python/samples/recollq.py b/src/python/samples/recollq.py
new file mode 100755
index 00000000..099e9141
--- /dev/null
+++ b/src/python/samples/recollq.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+# NOTE(review): this script has the same name as the recollq extension
+# module which it imports. Presumably the import only finds the extension
+# when the latter comes first on the module search path: confirm, or
+# rename the script.
+import sys
+import recollq
+allmeta = ("title", "keywords", "abstract", "url", "mimetype", "mtime",
+           "ipath", "fbytes", "dbytes", "relevance")
+
+
+def dotest(db, q):
+    """Run query string q against db, then print the title and abstract of
+    the first results (10 at most)"""
+    query = db.query()
+#query1 = db.query()
+
+    nres = query.execute(q)
+    print "Result count: ", nres
+    if nres > 10:
+        nres = 10
+    while query.next >= 0 and query.next < nres:
+        doc = query.fetchone()
+        print query.next
+        for k in ("title",):
+            print k, ":", getattr(doc, k).encode('utf-8')
+        abstract = db.makeDocAbstract(doc, query).encode('utf-8')
+        print abstract
+        print
+
+# End dotest
+
+q = "essaouira"
+
+print "TESTING WITH .recoll"
+db = recollq.connect()
+db.setAbstractParams(maxchars=80, contextwords=2)
+dotest(db, q)
+
+sys.exit(0)
+
+print "TESTING WITH .recoll-test"
+db = recollq.connect(confdir="/Users/dockes/.recoll-test")
+dotest(db, q)
+
+print "TESTING WITH .recoll-doc"
+db = recollq.connect(confdir="/y/home/dockes/.recoll-doc")
+dotest(db, q)
+
+print "TESTING WITH .recoll and .recoll-doc"
+db = recollq.connect(confdir="/Users/dockes/.recoll",
+                     extra_dbs=("/y/home/dockes/.recoll-doc",))
+dotest(db, q)
+