diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 6a438728..ed50768e 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.57 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.58 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -467,27 +467,21 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) // Build a direct map avoiding all indirections for field to // prefix translation - // Add direct prefixes + // Add direct prefixes from the [prefixes] section listtps = m_fields->getNames("prefixes"); for (list::const_iterator it = tps.begin(); it != tps.end();it++) { string val; m_fields->get(*it, val, "prefixes"); - m_fldtopref[*it] = val; + m_fldtopfx[stringtolower(*it)] = val; } - // Add prefixes for aliases: + // Add prefixes for aliases (build alias-to-canonic map while we're at it) tps = m_fields->getNames("aliases"); for (list::const_iterator it = tps.begin(); it != tps.end();it++) { - string canonic = *it; // canonic name + string canonic = stringtolower(*it); // canonic name string pfx; - map::const_iterator pit = m_fldtopref.find(canonic); - if (pit != m_fldtopref.end()) { + map::const_iterator pit = m_fldtopfx.find(canonic); + if (pit != m_fldtopfx.end()) { pfx = pit->second; - } else { - // Note: it's perfectly normal to have no prefix for the canonic - // name, this could be a stored, not indexed field - LOGDEB2(("RclConfig::readFieldsConfig: no pfx for canonic [%s]\n", - canonic.c_str())); - continue; } string aliases; m_fields->get(canonic, aliases, "aliases"); @@ -495,12 +489,14 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) stringToStrings(aliases, l); for (list::const_iterator ait = l.begin(); ait != l.end(); ait++) { - m_fldtopref[*ait] = pfx; + if (!pfx.empty()) + m_fldtopfx[stringtolower(*ait)] = pfx; + m_aliastocanon[stringtolower(*ait)] = canonic; } } #if 0 - for (map::const_iterator it = m_fldtopref.begin(); - it != m_fldtopref.end(); it++) { + for (map::const_iterator it = m_fldtopfx.begin(); + it != m_fldtopfx.end(); it++) { LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n", it->first.c_str(), it->second.c_str())); } @@ -512,8 +508,9 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) stringToStrings(ss, sl); for (list::const_iterator it = sl.begin(); it != sl.end(); it++) { - LOGDEB(("Inserting [%s] in stored list\n", (*it).c_str())); - m_storedFields.insert(*it); + string fld = fieldCanon(stringtolower(*it)); + LOGDEB(("Inserting [%s] in stored list\n", fld.c_str())); + m_storedFields.insert(fld); } } @@ -521,10 +518,11 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) } // Return term indexing prefix for field name (ie: "filename" -> "XSFN") +// The input must be a canonical field name (alias translation done already) bool RclConfig::getFieldPrefix(const string& fld, string &pfx) { - map::const_iterator pit = m_fldtopref.find(fld); - if (pit != m_fldtopref.end()) { + map::const_iterator pit = m_fldtopfx.find(fld); + if (pit != m_fldtopfx.end()) { pfx = pit->second; return true; } else { @@ -572,10 +570,13 @@ bool RclConfig::getFieldSpecialisationPrefixes(const string& fld, pfxes.unique(); return true; } -bool RclConfig::fieldIsStored(const string& fld) + +string RclConfig::fieldCanon(const string& fld) { - set::const_iterator it = m_storedFields.find(fld); - return it != m_storedFields.end(); + map::const_iterator it = m_aliastocanon.find(fld); + if (it != m_aliastocanon.end()) + return it->second; + return fld; } string RclConfig::getMimeViewerDef(const string &mtype) diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index faeb6edf..fc002520 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -16,7 +16,7 @@ */ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.40 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.41 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -144,7 +144,7 @@ class RclConfig { /** mimeconf: get list of mime types for category */ bool getMimeCatTypes(const string& cat, list&); - /** mimeconf: get field prefix from field name */ + /** fields: get field prefix from field name */ bool getFieldPrefix(const string& fldname, string &pfx); /** Get implied meanings for field name (ie: author->[author, from]) */ bool getFieldSpecialisations(const string& fld, @@ -152,8 +152,9 @@ class RclConfig { /** Get prefixes for specialisations of field name */ bool getFieldSpecialisationPrefixes(const string& fld, list& pfxes); - bool fieldIsStored(const string& fld); const set& getStoredFields() {return m_storedFields;} + /** Get canonic name for possible alias */ + string fieldCanon(const string& fld); /** mimeview: get/set external viewer exec string(s) for mimetype(s) */ string getMimeViewerDef(const string &mimetype); @@ -196,7 +197,8 @@ class RclConfig { ConfStack *mimeconf; // but their content may depend on it. ConfStack *mimeview; // ConfStack *m_fields; - map m_fldtopref; + map m_fldtopfx; + map m_aliastocanon; set m_storedFields; void *m_stopsuffixes; diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index a3fb3b03..540bd0ab 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.41 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.42 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -296,7 +296,6 @@ static const string keyds("description"); static const string keyfn("filename"); static const string keymd("modificationdate"); static const string keymt("mimetype"); -static const string keyoc("origcharset"); static const string keytt("title"); bool FileInterner::dijontorcl(Rcl::Doc& doc) @@ -310,7 +309,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) doc.text = it->second; } else if (it->first == keymd) { doc.dmtime = it->second; - } else if (it->first == keyoc) { + } else if (it->first == Rcl::Doc::keyoc) { doc.origcharset = it->second; } else if (it->first == keymt || it->first == keycs) { // don't need these. diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index 71669f75..d7f825be 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.11 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.12 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes"; #endif @@ -84,31 +84,33 @@ SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs) return 0; } +/* Note: addclause necessite And/Or vient du fait que le string peut avoir + plusieurs mots. A transferer dans l'i/f Python ou pas ? */ PyDoc_STRVAR(doc_addClause, "addClause(type='and'|'or'|'excl'|'phrase'|'near'|'sub', qstring=string,\n" -" slack=int, field=string, subSearch=SearchData,\n" +" slack=int, field=string, subSearch=SearchData)\n" "Adds a simple clause to the SearchData And/Or chain, or a subquery\n" "defined by another SearchData object\n" ); -/* Note: necessite And/Or vient du fait que le string peut avoir - plusieurs mots. A transferer dans l'i/f Python ou pas ? */ -/* Forward decl, def needs recoll_searchDataTyep */ +/* Forward declaration only, definition needs recoll_searchDataType */ static PyObject * SearchData_addClause(recoll_SearchDataObject* self, PyObject *args, PyObject *kwargs); + + static PyMethodDef SearchData_methods[] = { {"addClause", (PyCFunction)SearchData_addClause, METH_VARARGS|METH_KEYWORDS, - doc_addClause - }, + doc_addClause}, {NULL} /* Sentinel */ }; PyDoc_STRVAR(doc_SearchDataObject, "SearchData()\n" "\n" -"A SearchData object describes a query.\n" +"A SearchData object describes a query. It has a number of global parameters\n" +"and a chain of search clauses.\n" ); static PyTypeObject recoll_SearchDataType = { PyObject_HEAD_INIT(NULL) @@ -165,9 +167,9 @@ SearchData_addClause(recoll_SearchDataObject* self, PyObject *args, static char *kwlist[] = {"type", "qstring", "slack", "field", "subsearch", NULL}; char *tp = 0; - char *qs = 0; + char *qs = 0; // needs freeing int slack = 0; - char *fld = 0; + char *fld = 0; // needs freeing recoll_SearchDataObject *sub = 0; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ses|iesO!", kwlist, &tp, "utf-8", &qs, &slack, @@ -221,11 +223,13 @@ SearchData_addClause(recoll_SearchDataObject* self, PyObject *args, PyErr_SetString(PyExc_AttributeError, "Bad tp arg"); return 0; } - + PyMem_Free(qs); + PyMem_Free(fld); self->sd->addClause(cl); Py_RETURN_NONE; } + /////////////////////////////////////////////////////////////////////// ///// Doc code typedef struct { @@ -272,38 +276,86 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *) return 0; } -// The "closure" thing is actually the meta field name. This is how -// python allows one set of get/set functions to get/set different -// attributes (pass them an additional parameters as from the -// getseters table and call it a "closure" static PyObject * -Doc_getmeta(recoll_DocObject *self, void *closure) +Doc_getattr(recoll_DocObject *self, char *name) { - LOGDEB0(("Doc_getmeta: [%s]\n", (const char *)closure)); + LOGDEB(("Doc_getattr: name [%s]\n", name)); if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { PyErr_SetString(PyExc_AttributeError, "doc"); return 0; } - #if 0 for (map::const_iterator it = self->doc->meta.begin(); it != self->doc->meta.end(); it++) { LOGDEB(("meta[%s] -> [%s]\n", it->first.c_str(), it->second.c_str())); } #endif + string key = rclconfig->fieldCanon(stringtolower(string(name))); - // Retrieve utf-8 coded value for meta field (if it doesnt exist, - // this inserts a null value in the array, we could be nicer. - string meta = self->doc->meta[(const char *)closure]; + // Handle special cases, then try retrieving key value from meta + // array + string value; + switch (key.at(0)) { + case 'f': + if (!key.compare(Rcl::Doc::keyfs)) { + value = self->doc->fbytes; + } else if (!key.compare(Rcl::Doc::keyfn)) { + value = self->doc->utf8fn; + } else if (!key.compare(Rcl::Doc::keyfs)) { + value = self->doc->fbytes; + } else if (!key.compare(Rcl::Doc::keyfmt)) { + value = self->doc->fmtime; + } + break; + case 'd': + if (!key.compare(Rcl::Doc::keyds)) { + value = self->doc->dbytes; + } else if (!key.compare(Rcl::Doc::keydmt)) { + value = self->doc->dmtime; + } + break; + case 'i': + if (!key.compare(Rcl::Doc::keyipt)) { + value = self->doc->ipath; + } + break; + case 'm': + if (!key.compare(Rcl::Doc::keytp)) { + value = self->doc->mimetype; + } else if (!key.compare(Rcl::Doc::keymt)) { + value = self->doc->dmtime.empty() ? self->doc->fmtime : + self->doc->dmtime; + } + break; + case 'o': + if (!key.compare(Rcl::Doc::keyoc)) { + value = self->doc->origcharset; + } + break; + case 's': + if (!key.compare(Rcl::Doc::keysig)) { + value = self->doc->sig; + } else if (!key.compare(Rcl::Doc::keysz)) { + value = self->doc->dbytes.empty() ? self->doc->fbytes : + self->doc->dbytes; + } + + break; + default: + value = self->doc->meta[key]; + } + + LOGDEB(("Doc_getattr: [%s] (%s) -> [%s]\n", + name, key.c_str(), value.c_str())); // Return a python unicode object - PyObject* res = PyUnicode_Decode(meta.c_str(), meta.size(), "UTF-8", + PyObject* res = PyUnicode_Decode(value.c_str(), value.size(), "UTF-8", "replace"); return res; } static int -Doc_setmeta(recoll_DocObject *self, PyObject *value, void *closure) +Doc_setattr(recoll_DocObject *self, char *name, PyObject *value) { if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { @@ -330,56 +382,55 @@ Doc_setmeta(recoll_DocObject *self, PyObject *value, void *closure) } char* uvalue = PyString_AsString(putf8); - const char *key = (const char *)closure; - if (key == 0) { - PyErr_SetString(PyExc_AttributeError, "key??"); + if (name == 0) { + PyErr_SetString(PyExc_AttributeError, "name??"); return -1; } - LOGDEB0(("Doc_setmeta: setting [%s] to [%s]\n", key, uvalue)); - self->doc->meta[key] = uvalue; - switch (key[0]) { + LOGDEB0(("Doc_setattr: setting [%s] to [%s]\n", name, uvalue)); + self->doc->meta[name] = uvalue; + switch (name[0]) { case 'd': - if (!strcmp(key, "dbytes")) { + if (!strcmp(name, "dbytes")) { self->doc->dbytes = uvalue; } break; case 'f': - if (!strcmp(key, "fbytes")) { + if (!strcmp(name, "fbytes")) { self->doc->fbytes = uvalue; } break; case 'i': - if (!strcmp(key, "ipath")) { + if (!strcmp(name, "ipath")) { self->doc->ipath = uvalue; } break; case 'm': - if (!strcmp(key, "mimetype")) { + if (!strcmp(name, "mimetype")) { self->doc->mimetype = uvalue; - } else if (!strcmp(key, "mtime")) { + } else if (!strcmp(name, "mtime")) { self->doc->dmtime = uvalue; } break; case 's': - if (!strcmp(key, "sig")) { + if (!strcmp(name, "sig")) { self->doc->sig = uvalue; } break; case 't': - if (!strcmp(key, "text")) { + if (!strcmp(name, "text")) { self->doc->text = uvalue; } break; case 'u': - if (!strcmp(key, "url")) { + if (!strcmp(name, "url")) { self->doc->url = uvalue; } break; } return 0; } - +#if 0 static PyGetSetDef Doc_getseters[] = { // Name, get, set, doc, closure {"url", (getter)Doc_getmeta, (setter)Doc_setmeta, @@ -410,6 +461,7 @@ static PyGetSetDef Doc_getseters[] = { "sig", (void *)"sig"}, {NULL} /* Sentinel */ }; +#endif PyDoc_STRVAR(doc_DocObject, "Doc()\n" @@ -427,8 +479,8 @@ static PyTypeObject recoll_DocType = { 0, /*tp_itemsize*/ (destructor)Doc_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ + (getattrfunc)Doc_getattr, /*tp_getattr*/ + (setattrfunc)Doc_setattr, /*tp_setattr*/ 0, /*tp_compare*/ 0, /*tp_repr*/ 0, /*tp_as_number*/ @@ -450,7 +502,7 @@ static PyTypeObject recoll_DocType = { 0, /* tp_iternext */ 0, /* tp_methods */ 0, /* tp_members */ - Doc_getseters, /* tp_getset */ + 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ @@ -470,7 +522,10 @@ typedef struct { /* Type-specific fields go here. */ Rcl::Query *query; int next; // Index of result to be fetched next or -1 if uninit + char *sortfield; + int ascending; } recoll_QueryObject; + ///////////////////////////////////////////// /// Query object static void @@ -481,6 +536,7 @@ Query_dealloc(recoll_QueryObject *self) the_queries.erase(self->query); delete self->query; self->query = 0; + self->sortfield = 0; self->ob_type->tp_free((PyObject*)self); } @@ -495,6 +551,7 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) return 0; self->query = 0; self->next = -1; + self->sortfield = 0; return (PyObject *)self; } @@ -511,9 +568,29 @@ Query_init(recoll_QueryObject *self, PyObject *, PyObject *) delete self->query; self->query = 0; self->next = -1; + self->sortfield = 0; + self->ascending = true; return 0; } +PyDoc_STRVAR(doc_Query_sortby, +"sortby(field=fieldname, ascending=true)\n" +"Sort results by 'fieldname', in ascending or descending order.\n" +"Only one field can be used, no subsorts for now.\n" +); + +static PyObject * +Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) +{ + LOGDEB(("Query_sortby\n")); + static char *kwlist[] = {"field", "ascending", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i", kwlist, + &self->sortfield, + &self->ascending)) + return 0; + Py_RETURN_NONE; +} + PyDoc_STRVAR(doc_Query_execute, "execute(query_string, stemmming=1|0)\n" "\n" @@ -527,14 +604,17 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { LOGDEB(("Query_execute\n")); static char *kwlist[] = {"query_string", "stemming", NULL}; - char *utf8 = 0; + char *sutf8 = 0; // needs freeing int dostem = 1; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|i:Query_execute", - kwlist, "utf-8", &utf8, + kwlist, "utf-8", &sutf8, &dostem)) { return 0; } - LOGDEB(("Query_execute: [%s]\n", utf8)); + LOGDEB(("Query_execute: [%s]\n", sutf8)); + + string utf8(sutf8); + PyMem_Free(sutf8); if (self->query == 0 || the_queries.find(self->query) == the_queries.end()) { PyErr_SetString(PyExc_AttributeError, "query"); @@ -542,11 +622,12 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) } string reason; Rcl::SearchData *sd = wasaStringToRcl(utf8, reason); - PyMem_Free(utf8); + if (!sd) { PyErr_SetString(PyExc_ValueError, reason.c_str()); return 0; } + sd->setSortBy(self->sortfield, self->ascending); RefCntr rq(sd); self->query->setQuery(rq, dostem?Rcl::Query::QO_STEM:Rcl::Query::QO_NONE); int cnt = self->query->getResCnt(); @@ -557,7 +638,7 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) PyDoc_STRVAR(doc_Query_executesd, "execute(SearchData, stemming=1|0)\n" "\n" -"Starts a search for the query defined by SearchData.\n" +"Starts a search for the query defined by the SearchData object.\n" ); static PyObject * @@ -576,6 +657,7 @@ Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_AttributeError, "query"); return 0; } + pysd->sd->setSortBy(self->sortfield, self->ascending); self->query->setQuery(pysd->sd, dostem ? Rcl::Query::QO_STEM : Rcl::Query::QO_NONE); int cnt = self->query->getResCnt(); @@ -616,18 +698,22 @@ Query_fetchone(recoll_QueryObject* self, PyObject *, PyObject *) return 0; } self->next++; + // Move some data from the dedicated fields to the meta array to make - // fetching attributes easier + // fetching attributes easier. Is this actually needed ? Useful for + // url and relevancy rating which are also formatted . Rcl::Doc *doc = result->doc; - printableUrl(rclconfig->getDefCharset(), doc->url, doc->meta["url"]); - doc->meta["mimetype"] = doc->mimetype; - doc->meta["mtime"] = doc->dmtime.empty() ? doc->fmtime : doc->dmtime; - doc->meta["ipath"] = doc->ipath; - doc->meta["fbytes"] = doc->fbytes; - doc->meta["dbytes"] = doc->dbytes; + printableUrl(rclconfig->getDefCharset(), doc->url, + doc->meta[Rcl::Doc::keyurl]); + doc->meta[Rcl::Doc::keytp] = doc->mimetype; + doc->meta[Rcl::Doc::keymt] = doc->dmtime.empty() ? + doc->fmtime : doc->dmtime; + doc->meta[Rcl::Doc::keyipt] = doc->ipath; + doc->meta[Rcl::Doc::keyfs] = doc->fbytes; + doc->meta[Rcl::Doc::keyds] = doc->dbytes; char pc[20]; sprintf(pc, "%02d %%", percent); - doc->meta["relevance"] = pc; + doc->meta[Rcl::Doc::keyrr] = pc; return (PyObject *)result; } @@ -637,7 +723,10 @@ static PyMethodDef Query_methods[] = { doc_Query_execute}, {"executesd", (PyCFunction)Query_executesd, METH_VARARGS|METH_KEYWORDS, doc_Query_executesd}, - {"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,doc_Query_fetchone}, + {"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS, + doc_Query_fetchone}, + {"sortby", (PyCFunction)Query_sortby, METH_VARARGS|METH_KEYWORDS, + doc_Query_sortby}, {NULL} /* Sentinel */ }; @@ -881,8 +970,8 @@ Db_makeDocAbstract(recoll_DbObject* self, PyObject *args, PyObject *) static PyObject * Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds) { - char *udi = 0; - char *sig = 0; + char *udi = 0; // needs freeing + char *sig = 0; // needs freeing LOGDEB(("Db_needUpdate\n")); if (!PyArg_ParseTuple(args, "eses:Db_needUpdate", "utf-8", &udi, "utf-8", &sig)) { @@ -891,6 +980,8 @@ Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds) if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { LOGERR(("Db_needUpdate: db not found %p\n", self->db)); PyErr_SetString(PyExc_AttributeError, "db"); + PyMem_Free(udi); + PyMem_Free(sig); return 0; } bool result = self->db->needUpdate(udi, sig); @@ -903,16 +994,20 @@ static PyObject * Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *) { LOGDEB(("Db_addOrUpdate\n")); - char *udi = 0; - char *parent_udi = 0; - + char *sudi = 0; // needs freeing + char *sparent_udi = 0; // needs freeing recoll_DocObject *pydoc; if (!PyArg_ParseTuple(args, "esO!|es:Db_addOrUpdate", - "utf-8", &udi, &recoll_DocType, &pydoc, - "utf-8", &parent_udi)) { + "utf-8", &sudi, &recoll_DocType, &pydoc, + "utf-8", &sparent_udi)) { return 0; } + string udi(sudi); + string parent_udi(sparent_udi ? sparent_udi : ""); + PyMem_Free(sudi); + PyMem_Free(sparent_udi); + if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { LOGERR(("Db_addOrUpdate: db not found %p\n", self->db)); PyErr_SetString(PyExc_AttributeError, "db"); @@ -923,16 +1018,11 @@ Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *) PyErr_SetString(PyExc_AttributeError, "doc"); return 0; } - if (!self->db->addOrUpdate(udi, parent_udi?parent_udi:"", *pydoc->doc)) { + if (!self->db->addOrUpdate(udi, parent_udi, *pydoc->doc)) { LOGERR(("Db_addOrUpdate: rcldb error\n")); PyErr_SetString(PyExc_AttributeError, "rcldb error"); - PyMem_Free(udi); - PyMem_Free(parent_udi); return 0; } - PyMem_Free(udi); - if (parent_udi) - PyMem_Free(parent_udi); Py_RETURN_NONE; } diff --git a/src/python/recoll/setup.py b/src/python/recoll/setup.py index 1a3f673a..cac6e8a7 100644 --- a/src/python/recoll/setup.py +++ b/src/python/recoll/setup.py @@ -27,6 +27,7 @@ module1 = Extension('recoll', top + 'query/wasatorcl.cpp', top + 'rcldb/pathhash.cpp', top + 'rcldb/rcldb.cpp', + top + 'rcldb/rcldoc.cpp', top + 'rcldb/rclquery.cpp', top + 'rcldb/searchdata.cpp', top + 'rcldb/stemdb.cpp', diff --git a/src/python/samples/recollqsd.py b/src/python/samples/recollqsd.py new file mode 100644 index 00000000..a25d72f3 --- /dev/null +++ b/src/python/samples/recollqsd.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +import sys +import recoll + +def dotest(db, q): + query = db.query() + query.sortby("title", 1) + + nres = query.executesd(q, stemming = 1) + print "Result count: ", nres + if nres > 10: + nres = 10 + while query.next >= 0 and query.next < nres: + doc = query.fetchone() + print query.next + for k in ("url", "mtime", "title", "author", "abstract"): + print k, ":", getattr(doc, k).encode('utf-8') + #abs = db.makeDocAbstract(doc, query).encode('utf-8') + #print abs + print +# End dotest + +sd = recoll.SearchData() +sd.addClause("and", "essaouira maroc") +#sd.addClause("and", "dockes", field="author") +#sd.addClause("phrase", "jean francois", 1) +#sd.addClause("excl", "plage") + +db = recoll.connect() +dotest(db, sd) + +sys.exit(0) diff --git a/src/python/xesam/xesam-recoll-service b/src/python/xesam/xesam-recoll-service index 735f96f3..a9aafdcf 100755 --- a/src/python/xesam/xesam-recoll-service +++ b/src/python/xesam/xesam-recoll-service @@ -1,24 +1,25 @@ #!/usr/bin/env python """ -Demo implementation of a xesam server. Run it like +Recoll implementation of a xesam server. +Based on the example in the xesam-tools package by: + Mikkel Kamstrup Erlandsen - demo/xesam-dummy-service [-s|--state-messages] +Run it like + +xesam-recoll-service And launch a search on it via - ./xesam-tool search hello + xesam-tool search hello -You can use the -s or --state-messages switch to enable StateChanged -signal monitoring in xesam-tool as well as in xesam-dummy-service. """ - # Sets up path to uninstalled xesam module import demo import xesam import xesam.query -from xesam.server import * +import xesam.server import gobject import sys @@ -34,15 +35,16 @@ class RecollServer (xesam.server.Searcher): """ def __init__ (self): - h_fact = HandleFactory () - fact = ClientFactory (self, h_fact, RecollSession, RecollSearch) + h_fact = xesam.server.HandleFactory () + fact = xesam.server.ClientFactory (self, h_fact, + RecollSession, RecollSearch) xesam.server.Searcher.__init__ (self, h_fact, fact) self.set_echo_queries (True) self.rcldb = recoll.connect() def start (self): # Export our selves via a SearchServerStub - SearchServerStub(self).start() + xesam.server.SearchServerStub(self).start() def GetProperty (self, shandle, name): prop = xesam.server.Searcher.GetProperty(self, shandle, name) @@ -54,33 +56,24 @@ class RecollServer (xesam.server.Searcher): xesam.debug ("Set property request for '%s=%s', on session '%s', returning %s" % (name, value, shandle,val)) return val -class RecollSession (Session): +class RecollSession (xesam.server.Session): """ """ def __init__ (self, searcher, session_handle): - Session.__init__ (self, searcher, session_handle) + xesam.server.Session.__init__ (self, searcher, session_handle) self.set_property ("recoll.org", "xesam-recoll-service") -class RecollSearch (Search): +class RecollSearch (xesam.server.Search): """ """ - # Translation from known xesam/whatever field names to Recoll Doc elements - FLDTRANS = \ - { - "xesam:title" : lambda doc : doc.title, - "xesam:summary" : lambda doc : doc.abstract, - "xesam:mimeType" : lambda doc : doc.mimetype, - "xesam:contentModified" : lambda doc : \ - timestampToIso8601(doc.dmtime or doc.fmtime), - "xesam:url" : lambda doc : doc.url - } + SLICE = 10 def __init__ (self, searcher, session, search_handle, \ query=None, xml=None) : - Search.__init__ (self, searcher, session, search_handle, \ + xesam.server.Search.__init__ (self, searcher, session, search_handle, \ query=query, xml=xml) self._hit_fields = session.get_property (xesam.SESSION_HIT_FIELDS) @@ -88,10 +81,7 @@ class RecollSearch (Search): xesam.error ("Got property hit.fields as None." " Setting default xesam:url") self._hit_fields = ["xesam:url"] - print "RecollSearch: fields:", self._hit_fields - # TOBEDONE: if fields includes "snippet" we need to generate - # the synthetic abstract for each returned doc - # Also relevancyRating, ContentCategory et SourceCategory + xesam.debug("RecollSearch: fields:" % self._hit_fields) xesam.debug ("Created %s with handle %s and query:\n%s" % (self.__class__, self.get_handle(), self.get_query())) @@ -99,6 +89,21 @@ class RecollSearch (Search): if not isinstance(self.get_query(), xesam.query.UserQuery): raise Exception ("Only UserQuery supported ATM, sorry.") self.rclquery = self._searcher.rcldb.query() + + # In the latest version (>0.95), primary/secondary is replaced by + # a field list. + sortfield = session.get_property(xesam.SESSION_SORT_PRIMARY) + order = session.get_property(xesam.SESSION_SORT_ORDER) + + # xesam-tool does not know how to set these for now, so let's + # TEST here + sortfield = "contentModified" + order = "descending" + xesam.debug("Session sort primary %s order %s" % (sortfield, order)) + # END TEST + + if sortfield: + self.rclquery.sortby(sortfield, order == "ascending" and 1 or 0) def start (self): xesam.debug ("RecollSearch '%s' got [%s]" % @@ -110,10 +115,16 @@ class RecollSearch (Search): doc = self.rclquery.fetchone() data = [] for fld in self._hit_fields: - if self.FLDTRANS.has_key (fld): - data.append(self.FLDTRANS[fld](doc)) + # Need to handle ContentCategory and SourceCategory + fld = fld.lower().replace("xesam:", "") + xesam.debug("Adding data for fld %s" % (fld)) + if fld == "snippet": + data.append(self._searcher.rcldb.makeDocAbstract(doc, + self.rclquery)) + elif fld == "contentmodified": + data.append(timestampToIso8601(getattr(doc, "mtime"))) else: - data.append("") + data.append(getattr(doc, fld, "")) self.add_new_hit (self._hit_fields, data) hits += 1 if hits >= self.SLICE: @@ -135,7 +146,7 @@ class RecollSearch (Search): xesam.debug ("RecollSearch get_hits") if self._stopped: - return Search.get_hits(self, num_hits) + return xesam.server.Search.get_hits(self, num_hits) hits = 0 done = 0; @@ -163,7 +174,7 @@ class RecollSearch (Search): xesam.debug ("Search '%s' emitted 'done'" % self.get_handle()) self.stop() - return Search.get_hits(self, num_hits) + return xesam.server.Search.get_hits(self, num_hits) if __name__ == "__main__": diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index b7510a89..d2dde3ce 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollq.cpp,v 1.14 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollq.cpp,v 1.15 2008-09-16 08:18:30 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -42,6 +42,33 @@ using namespace std; #include "internfile.h" #include "wipedir.h" +bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& doc) +{ + string fn = doc.url.substr(7); + struct stat st; + if (stat(fn.c_str(), &st) != 0) { + cout << "No such file: " << fn << endl; + return true; + } + if (tmpdir.empty() || access(tmpdir.c_str(), 0) < 0) { + string reason; + if (!maketmpdir(tmpdir, reason)) { + cerr << "Cannot create temporary directory: " + << reason << endl; + return false; + } + } + wipedir(tmpdir); + FileInterner interner(fn, &st, rclconfig, tmpdir, &doc.mimetype); + if (interner.internfile(doc, doc.ipath)) { + cout << doc.text << endl; + } else { + cout << "Cant intern: " << fn << endl; + } + return true; +} + + static char *thisprog; static char usage [] = " [-o|-a|-f] \n" @@ -60,6 +87,8 @@ static char usage [] = " -n limit the maximum number of results (0->no limit, default 2000)\n" " -b : basic. Just output urls, no mime types or titles\n" " -m : dump the whole document meta[] array\n" +" -S fld : sort by field name\n" +" -D : sort descending\n" ; static void Usage(void) @@ -82,10 +111,15 @@ static int op_flags; #define OPT_q 0x200 #define OPT_t 0x400 #define OPT_m 0x800 +#define OPT_D 0x1000 +#define OPT_S 0x2000 + int recollq(RclConfig **cfp, int argc, char **argv) { string a_config; + string sortfield; + int limit = 2000; thisprog = argv[0]; argc--; argv++; @@ -103,6 +137,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) a_config = *(++argv); argc--; goto b1; case 'd': op_flags |= OPT_d; break; + case 'D': op_flags |= OPT_D; break; case 'f': op_flags |= OPT_f; break; case 'l': op_flags |= OPT_l; break; case 'm': op_flags |= OPT_m; break; @@ -112,6 +147,9 @@ int recollq(RclConfig **cfp, int argc, char **argv) argc--; goto b1; case 'o': op_flags |= OPT_o; break; case 'q': op_flags |= OPT_q; break; + case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); + sortfield = *(++argv); + argc--; goto b1; case 't': op_flags |= OPT_t; break; default: Usage(); break; } @@ -168,6 +206,10 @@ int recollq(RclConfig **cfp, int argc, char **argv) return 1; } + if (op_flags & OPT_S) { + sd->setSortBy(sortfield, (op_flags & OPT_D) ? false : true); + } + RefCntr rq(sd); Rcl::Query query(&rcldb); query.setQuery(rq, Rcl::Query::QO_STEM); @@ -197,6 +239,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) << "[" << doc.url.c_str() << "]" << "\t" << "[" << doc.meta[Rcl::Doc::keytt].c_str() << "]" << "\t" << doc.fbytes.c_str() << "\tbytes" << "\t" + << doc.dmtime.c_str() << "\tSecs" << "\t" << endl; if (op_flags & OPT_m) { for (map::const_iterator it = doc.meta.begin(); @@ -204,32 +247,11 @@ int recollq(RclConfig **cfp, int argc, char **argv) cout << it->first << " = " << it->second << endl; } } - cout << endl; } + if (op_flags & OPT_d) { - string fn = doc.url.substr(7); - struct stat st; - if (stat(fn.c_str(), &st) != 0) { - cout << "No such file: " << fn << endl; - continue; - } - if (tmpdir.empty() || access(tmpdir.c_str(), 0) < 0) { - string reason; - if (!maketmpdir(tmpdir, reason)) { - cerr << "Cannot create temporary directory: " - << reason << endl; - return 1; - } - } - wipedir(tmpdir); - FileInterner interner(fn, &st, rclconfig, tmpdir, &doc.mimetype); - if (interner.internfile(doc, doc.ipath)) { - cout << doc.text << endl; - } else { - cout << "Cant intern: " << fn << endl; - } - } - + dump_contents(rclconfig, tmpdir, doc); + } } // Maybe clean up temporary directory diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index fa7fe568..70322dec 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.144 2008-09-09 12:58:23 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.145 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -129,12 +129,10 @@ bool Db::Native::subDocs(const string &udi, vector& docids) return false; } +// Only ONE field name inside the index data record differs from the +// Rcl::Doc ones: caption<->title, for a remnant of compatibility with +// omega static const string keycap("caption"); -static const string keymtp("mtype"); -static const string keyfmt("fmtime"); -static const string keydmt("dmtime"); -static const string keyoc("origcharset"); -static const string keyurl("url"); // Turn data record from db into document fields bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, @@ -144,11 +142,11 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, ConfSimple parms(&data); if (!parms.ok()) return false; - parms.get(keyurl, doc.url); - parms.get(keymtp, doc.mimetype); - parms.get(keyfmt, doc.fmtime); - parms.get(keydmt, doc.dmtime); - parms.get(keyoc, doc.origcharset); + parms.get(Doc::keyurl, doc.url); + parms.get(Doc::keytp, doc.mimetype); + parms.get(Doc::keyfmt, doc.fmtime); + parms.get(Doc::keydmt, doc.dmtime); + parms.get(Doc::keyoc, doc.origcharset); parms.get(keycap, doc.meta[Doc::keytt]); parms.get(Doc::keykw, doc.meta[Doc::keykw]); parms.get(Doc::keyabs, doc.meta[Doc::keyabs]); @@ -162,10 +160,10 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, char buf[20]; sprintf(buf,"%.2f", float(percent) / 100.0); doc.meta[Doc::keyrr] = buf; - parms.get(string("ipath"), doc.ipath); - parms.get(string("fbytes"), doc.fbytes); - parms.get(string("dbytes"), doc.dbytes); - parms.get(string("sig"), doc.sig); + parms.get(Doc::keyipt, doc.ipath); + parms.get(Doc::keyfs, doc.fbytes); + parms.get(Doc::keyds, doc.dbytes); + parms.get(Doc::keysig, doc.sig); doc.xdocid = docid; // Other, not predefined meta fields: @@ -691,24 +689,25 @@ bool Db::isopen() // indexed with no prefix (ie: abstract) bool Db::fieldToPrefix(const string& fldname, string &pfx) { - // This is the default table + // This is the default table. We prefer the data from rclconfig if + // available static map fldToPrefs; if (fldToPrefs.empty()) { fldToPrefs[Doc::keyabs] = string(); fldToPrefs["ext"] = "XE"; - fldToPrefs["filename"] = "XSFN"; + fldToPrefs[Doc::keyfn] = "XSFN"; - fldToPrefs["title"] = "S"; fldToPrefs[keycap] = "S"; + fldToPrefs[Doc::keytt] = "S"; fldToPrefs["subject"] = "S"; fldToPrefs[Doc::keyau] = "A"; fldToPrefs["creator"] = "A"; fldToPrefs["from"] = "A"; + fldToPrefs[Doc::keykw] = "K"; fldToPrefs["keyword"] = "K"; fldToPrefs["tag"] = "K"; - fldToPrefs[Doc::keykw] = "K"; fldToPrefs["tags"] = "K"; } @@ -719,6 +718,7 @@ bool Db::fieldToPrefix(const string& fldname, string &pfx) if (config && config->getFieldPrefix(fld, pfx)) return true; + // No data in rclconfig? Check default values map::const_iterator it = fldToPrefs.find(fld); if (it != fldToPrefs.end()) { pfx = it->second; @@ -816,9 +816,17 @@ void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) m_synthAbsWordCtxLen = syntctxlen; } +static inline void leftzeropad(string& s, unsigned len) +{ + if (s.length() && s.length() < len) + s = s.insert(0, len-s.length(), '0'); +} + static const int MB = 1024 * 1024; static const string nc("\n\r\x0c"); +#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";} + // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type ... , create the document data record (more @@ -958,39 +966,43 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, // reasonable lengths and suppress newlines (so that the data // record can keep a simple syntax) - string record = "url=" + doc.url; - record += "\nmtype=" + doc.mimetype; - record += "\nfmtime=" + doc.fmtime; + string record; + RECORD_APPEND(record, Doc::keyurl, doc.url); + RECORD_APPEND(record, Doc::keytp, doc.mimetype); + // We left-zero-pad the times so that they are lexico-sortable + leftzeropad(doc.fmtime, 11); + RECORD_APPEND(record, Doc::keyfmt, doc.fmtime); if (!doc.dmtime.empty()) { - record += "\ndmtime=" + doc.dmtime; + leftzeropad(doc.dmtime, 11); + RECORD_APPEND(record, Doc::keydmt, doc.dmtime); } - record += "\norigcharset=" + doc.origcharset; + RECORD_APPEND(record, Doc::keyoc, doc.origcharset); if (!doc.fbytes.empty()) - record += string("\nfbytes=") + doc.fbytes; + RECORD_APPEND(record, Doc::keyfs, doc.fbytes); // Note that we add the signature both as a value and in the data record if (!doc.sig.empty()) - record += string("\nsig=") + doc.sig; + RECORD_APPEND(record, Doc::keysig, doc.sig); newdocument.add_value(VALUE_SIG, doc.sig); char sizebuf[30]; sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); - record += string("\ndbytes=") + sizebuf; + RECORD_APPEND(record, Doc::keyds, sizebuf); if (!doc.ipath.empty()) - record += "\nipath=" + doc.ipath; + RECORD_APPEND(record, Doc::keyipt, doc.ipath); if (doc.meta[Doc::keytt].empty()) doc.meta[Doc::keytt] = doc.utf8fn; doc.meta[Doc::keytt] = neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc); if (!doc.meta[Doc::keytt].empty()) - record += "\n" + keycap + "=" + doc.meta[Doc::keytt]; + RECORD_APPEND(record, keycap, doc.meta[Doc::keytt]); doc.meta[Doc::keykw] = neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc); if (!doc.meta[Doc::keykw].empty()) - record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw]; + RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]); // If abstract is empty, we make up one with the beginning of the // document. This is then not indexed, but part of the doc data so @@ -1010,22 +1022,23 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, nc); } if (!doc.meta[Doc::keyabs].empty()) - record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs]; + RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]); RclConfig *config = RclConfig::getMainConfig(); if (config) { const set& stored = config->getStoredFields(); for (set::const_iterator it = stored.begin(); it != stored.end(); it++) { + string nm = stringtolower(config->fieldCanon(*it)); if (!doc.meta[*it].empty()) { string value = neutchars(truncate_to_word(doc.meta[*it], 150), nc); - record += "\n" + *it + "=" + value; + RECORD_APPEND(record, nm, value); } } } - record += "\n"; - LOGDEB0(("Rcl::Db::add: new doc record:\n %s\n", record.c_str())); + + LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str())); newdocument.set_data(record); const char *fnc = udi.c_str(); @@ -1105,21 +1118,6 @@ bool Db::needUpdate(const string &udi, const string& sig) // Retrieve old file/doc signature from value string osig = doc.get_value(VALUE_SIG); -#if 0 - // Get old sig from data record - string data = doc.get_data(); - string::size_type i1, i2; - i1 = data.find("sig="); - if (i1 == string::npos) - return true; - i1 += 4; - if (i1 >= data.length()) - return true; - i2 = data.find_first_of("\n\r", i1); - if (i2 == string::npos) - return true; - string osig = data.substr(i1, i2-i1); -#endif LOGDEB2(("Db::needUpdate: oldsig [%s] new [%s]\n", osig.c_str(), sig.c_str())); // Compare new/old sig @@ -1287,14 +1285,12 @@ bool Db::purgeFile(const string &udi) return false; } +// File name wild card expansion. This is a specialisation ot termMatch bool Db::filenameWildExp(const string& fnexp, list& names) { - // File name search, with possible wildcards. - // We expand wildcards by scanning the filename terms (prefixed - // with XSFN) from the database. - // We build an OR query with the expanded values if any. string pattern; dumb_string(fnexp, pattern); + names.clear(); // If pattern is not quoted, and has no wildcards, we add * at // each end: match any substring @@ -1303,33 +1299,14 @@ bool Db::filenameWildExp(const string& fnexp, list& names) } else if (pattern.find_first_of("*?[") == string::npos) { pattern = "*" + pattern + "*"; } // else let it be + LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str())); - LOGDEB((" pattern: [%s]\n", pattern.c_str())); - - // Match pattern against all file names in the db - string ermsg; - try { - Xapian::TermIterator it = m_ndb->db.allterms_begin(); - it.skip_to("XSFN"); - for (;it != m_ndb->db.allterms_end(); it++) { - if ((*it).find("XSFN") != 0) - break; - string fn = (*it).substr(4); - LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str())); - if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) { - names.push_back((*it).c_str()); - } - // Limit the match count - if (names.size() > 1000) { - LOGERR(("Db::filenameWildExp: too many matched file names\n")); - break; - } - } - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR(("filenameWildExp: xapian error: %s\n", ermsg.c_str())); + list entries; + if (!termMatch(ET_WILD, string(), pattern, entries, 1000, Doc::keyfn)) return false; - } + for (list::const_iterator it = entries.begin(); + it != entries.end(); it++) + names.push_back("XSFN"+it->term); if (names.empty()) { // Build an impossible query: we know its impossible because we @@ -1385,11 +1362,11 @@ const string regSpecChars = "(.[{"; bool Db::termMatch(MatchType typ, const string &lang, const string &root, list& res, - int max) + int max, + const string& field) { if (!m_ndb || !m_ndb->m_isopen) return false; - Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db; res.clear(); @@ -1399,6 +1376,11 @@ bool Db::termMatch(MatchType typ, const string &lang, dumb_string(root, droot); string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars; + string prefix; + if (!field.empty()) { + (void)fieldToPrefix(field, prefix); + } + if (typ == ET_STEM) { if (!stemExpand(lang, root, res, max)) return false; @@ -1429,33 +1411,43 @@ bool Db::termMatch(MatchType typ, const string &lang, string::size_type es = droot.find_first_of(nochars); string is; switch (es) { - case string::npos: is = droot;break; - case 0: break; - default: is = droot.substr(0, es);break; + case string::npos: is = prefix + droot; break; + case 0: is = prefix; break; + default: is = prefix + droot.substr(0, es); break; } LOGDEB(("termMatch: initsec: [%s]\n", is.c_str())); - Xapian::TermIterator it = db.allterms_begin(); - if (!is.empty()) - it.skip_to(is.c_str()); - for (int n = 0;it != db.allterms_end(); it++) { - // If we're beyond the terms matching the initial string, end - if (!is.empty() && (*it).find(is) != 0) - break; - // Don't match special internal terms beginning with uppercase ascii - if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z') - continue; - if (typ == ET_WILD) { - if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH) - continue; - } else { - if (regexec(®, (*it).c_str(), 0, 0, 0)) - continue; + string ermsg; + try { + Xapian::TermIterator it = db.allterms_begin(); + if (!is.empty()) + it.skip_to(is.c_str()); + for (int n = 0; it != db.allterms_end(); it++) { + // If we're beyond the terms matching the initial string, end + if (!is.empty() && (*it).find(is) != 0) + break; + string term; + if (!prefix.empty()) + term = (*it).substr(prefix.length()); + else + term = *it; + if (typ == ET_WILD) { + if (fnmatch(droot.c_str(), term.c_str(), 0) == FNM_NOMATCH) + continue; + } else { + if (regexec(®, term.c_str(), 0, 0, 0)) + continue; + } + // Do we want stem expansion here? We don't do it for now + res.push_back(TermMatchEntry(term, it.get_termfreq())); + ++n; } - // Do we want stem expansion here? We don't do it for now - res.push_back(TermMatchEntry(*it, it.get_termfreq())); - ++n; + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("termMatch: %s\n", ermsg.c_str())); + return false; } + if (typ == ET_REGEXP) { regfree(®); } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 73d945cc..10a73977 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.61 2008-08-26 07:38:29 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.62 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -156,9 +156,11 @@ class Db { * Stem expansion is performed if lang is not empty */ enum MatchType {ET_WILD, ET_REGEXP, ET_STEM}; bool termMatch(MatchType typ, const string &lang, const string &s, - list& result, int max = -1); + list& result, int max = -1, + const string& field = ""); - /** Specific filename wildcard expansion */ + /** Special filename wildcard to XSFN terms expansion. + internal/searchdata use only */ bool filenameWildExp(const string& exp, list& names); /** Set parameters for synthetic abstract generation */ diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index 72a0f10e..bf73915d 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -4,7 +4,7 @@ #include "xapian.h" namespace Rcl { -/* @(#$Id: rcldb_p.h,v 1.4 2008-09-05 10:34:17 dockes Exp $ (C) 2007 J.F.Dockes */ +/* @(#$Id: rcldb_p.h,v 1.5 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes */ // Generic Xapian exception catching code. We do this quite often, // and I have no idea how to do this except for a macro @@ -70,5 +70,14 @@ class Db::Native { bool subDocs(const string &udi, vector& docids); }; + +// Field names inside the index data record may differ from the rcldoc ones +// (esp.: caption / title) +inline const string& docfToDatf(const string& df) +{ + static const string keycap("caption"); + return df.compare(Doc::keytt) ? df : keycap; +} + } #endif /* _rcldb_p_h_included_ */ diff --git a/src/rcldb/rcldoc.cpp b/src/rcldb/rcldoc.cpp index 2ace4310..8ff4f285 100644 --- a/src/rcldb/rcldoc.cpp +++ b/src/rcldb/rcldoc.cpp @@ -1,14 +1,25 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldoc.cpp,v 1.1 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldoc.cpp,v 1.2 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes"; #endif #include "rcldoc.h" namespace Rcl { -const string Doc::keyabs("abstract"); -const string Doc::keyau("author"); -const string Doc::keyfn("filename"); -const string Doc::keykw("keywords"); -const string Doc::keyrr("relevancyrating"); -const string Doc::keytt("title"); + const string Doc::keyurl("url"); + const string Doc::keyfn("filename"); + const string Doc::keyipt("ipath"); + const string Doc::keytp("mtype"); + const string Doc::keyfmt("fmtime"); + const string Doc::keydmt("dmtime"); + const string Doc::keymt("mtime"); + const string Doc::keyoc("origcharset"); + const string Doc::keyfs("fbytes"); + const string Doc::keyds("dbytes"); + const string Doc::keysz("size"); + const string Doc::keysig("sig"); + const string Doc::keyrr("relevancyrating"); + const string Doc::keyabs("abstract"); + const string Doc::keyau("author"); + const string Doc::keytt("title"); + const string Doc::keykw("keywords"); } diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index d0e840ee..6e15a98f 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -16,7 +16,7 @@ */ #ifndef _RCLDOC_H_INCLUDED_ #define _RCLDOC_H_INCLUDED_ -/* @(#$Id: rcldoc.h,v 1.9 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: rcldoc.h,v 1.10 2008-09-16 08:18:30 dockes Exp $ (C) 2006 J.F.Dockes */ #include #include @@ -44,9 +44,9 @@ class Doc { // can be accessed after a query without fetching the actual document). // We indicate the routine that sets them up during indexing - // This is just "file://" + binary filename. No transcoding: this - // is used to access files - // Index: computed from fn by Db::add caller. Query: from doc data. + // This is just "file://" + binary or url-encoded filename. No + // transcoding: this is used to access files Index: computed from + // fn by Db::add caller. Query: from doc data. string url; // Transcoded version of the simple file name for SFN-prefixed @@ -134,12 +134,29 @@ class Doc { pc = 0; xdocid = 0; } - static const string keyfn; - static const string keyrr; - static const string keyabs; - static const string keyau; - static const string keytt; - static const string keykw; + + // The official names for recoll native fields when used in a text + // context (ie: the python interface duplicates some of the fixed + // fields in the meta array, these are the names used). Defined in + // rcldoc.cpp. For fields stored in the meta[] array (ie, title, + // author), filters _must_ use these values + static const string keyurl; // url + static const string keyfn; // file name + static const string keyipt; // ipath + static const string keytp; // mime type + static const string keyfmt; // file mtime + static const string keydmt; // document mtime + static const string keymt; // mtime dmtime if set else fmtime + static const string keyoc; // original charset + static const string keyfs; // file size + static const string keyds; // document size + static const string keysz; // dbytes if set else fbytes + static const string keysig; // sig + static const string keyrr; // relevancy rating + static const string keyabs; // abstract + static const string keyau; // author + static const string keytt; // title + static const string keykw; // keywords }; diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index bf5c44ea..b7127eb5 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.5 2008-09-05 11:45:16 dockes Exp $ (C) 2008 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.6 2008-09-16 08:18:30 dockes Exp $ (C) 2008 J.F.Dockes"; #endif #include @@ -8,6 +8,8 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.5 2008-09-05 11:45:16 dockes Exp #include #include +#include "xapian/sorter.h" + #include "rcldb.h" #include "rcldb_p.h" #include "rclquery.h" @@ -20,6 +22,8 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.5 2008-09-05 11:45:16 dockes Exp #ifndef NO_NAMESPACES namespace Rcl { #endif + + class FilterMatcher : public Xapian::MatchDecider { public: FilterMatcher(const string &topdir) @@ -41,7 +45,7 @@ public: // The only filtering for now is on file path (subtree) string url; - parms.get(string("url"), url); + parms.get(Doc::keyurl, url); LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n", m_topdir.c_str(), url.c_str())); if (url.find(m_topdir, 7) == 7) { @@ -55,14 +59,46 @@ private: string m_topdir; }; +// Sort helper class +class QSorter : public Xapian::Sorter { +public: + QSorter(const string& f) : m_fld(docfToDatf(f) + "=") {} + + virtual std::string operator()(const Xapian::Document& xdoc) const { + string data = xdoc.get_data(); + + // It would be simpler to do the record->Rcl::Doc thing, but + // hand-doing this will be faster. It makes more assumptions + // about the format than a ConfTree though: + string::size_type i1, i2; + i1 = data.find(m_fld); + if (i1 == string::npos) + return string(); + i1 += m_fld.length(); + if (i1 >= data.length()) + return string(); + i2 = data.find_first_of("\n\r", i1); + if (i2 == string::npos) + return string(); + return data.substr(i1, i2-i1); + } + +private: + string m_fld; +}; + Query::Query(Db *db) - : m_nq(new Native(this)), m_db(db) + : m_nq(new Native(this)), m_db(db), m_sorter(0) { } Query::~Query() { deleteZ(m_nq); + if (m_sorter) { + delete (QSorter*)m_sorter; + m_sorter = 0; + } } string Query::getReason() const @@ -75,6 +111,7 @@ Db *Query::whatDb() return m_db; } + //#define ISNULL(X) (X).isNull() #define ISNULL(X) !(X) @@ -114,6 +151,17 @@ bool Query::setQuery(RefCntr sdata, int opts, try { m_nq->enquire = new Xapian::Enquire(m_db->m_ndb->db); m_nq->enquire->set_query(m_nq->query); + if (!sdata->getSortBy().empty()) { + if (m_sorter) { + delete (QSorter*)m_sorter; + m_sorter = 0; + } + m_sorter = new QSorter(sdata->getSortBy()); + // It really seems there is a xapian bug about sort order, we + // invert here. + m_nq->enquire->set_sort_by_key((QSorter*)m_sorter, + !sdata->getSortAscending()); + } m_nq->mset = Xapian::MSet(); // Get the query description and trim the "Xapian::Query" d = m_nq->query.get_description(); diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index f78dc077..1f3909dd 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -1,6 +1,6 @@ #ifndef _rclquery_h_included_ #define _rclquery_h_included_ -/* @(#$Id: rclquery.h,v 1.2 2008-07-01 08:31:08 dockes Exp $ (C) 2008 J.F.Dockes */ +/* @(#$Id: rclquery.h,v 1.3 2008-09-16 08:18:30 dockes Exp $ (C) 2008 J.F.Dockes */ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -87,6 +87,7 @@ private: string m_filterTopDir; // Current query filter on subtree top directory string m_reason; // Error explanation Db *m_db; + void *m_sorter; unsigned int m_qOpts; /* Copyconst and assignement private and forbidden */ Query(const Query &) {} diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 44f18b6c..81c46df9 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.22 2008-08-28 15:42:43 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.23 2008-09-16 08:18:30 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -153,6 +153,14 @@ void SearchData::erase() { m_reason.erase(); } +void SearchData::setSortBy(const string& fld, bool ascending) { + RclConfig *cfg = RclConfig::getMainConfig(); + m_sortField = cfg->fieldCanon(stringtolower(fld)); + m_sortAscending = ascending; + LOGDEB0(("SearchData::setSortBy: [%s] %s\n", m_sortField.c_str(), + m_sortAscending ? "ascending" : "descending")); +} + // Am I a file name only search ? This is to turn off term highlighting bool SearchData::fileNameOnly() { @@ -572,9 +580,9 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, list names; for (list::iterator it = patterns.begin(); it != patterns.end(); it++) { - // This relies on filenameWildExp not resetting and always - // adding to the input - db.filenameWildExp(*it, names); + list more; + db.filenameWildExp(*it, more); + names.splice(names.end(), more); } // Build a query out of the matching file name terms. *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index bdb23686..51db2de7 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -16,7 +16,7 @@ */ #ifndef _SEARCHDATA_H_INCLUDED_ #define _SEARCHDATA_H_INCLUDED_ -/* @(#$Id: searchdata.h,v 1.17 2008-09-08 15:47:44 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: searchdata.h,v 1.18 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes */ /** * Structures to hold data coming almost directly from the gui @@ -110,11 +110,17 @@ public: /** Add file type for filtering results */ void addFiletype(const string& ft) {m_filetypes.push_back(ft);} + /** Choose sort order. Should this be in RclQuery instead ? */ + void setSortBy(const string& fld, bool ascending = true); + const string& getSortBy() const {return m_sortField;} + bool getSortAscending() const {return m_sortAscending;} private: - SClType m_tp; // Only SCLT_AND or SCLT_OR here - vector m_query; - vector m_filetypes; // Restrict to filetypes if set. - string m_topdir; // Restrict to subtree. + SClType m_tp; // Only SCLT_AND or SCLT_OR here + vector m_query; + vector m_filetypes; // Restrict to filetypes if set. + string m_topdir; // Restrict to subtree. + string m_sortField; + bool m_sortAscending; // Printable expanded version of the complete query, retrieved/set // from rcldb after the Xapian::setQuery() call string m_description; diff --git a/src/sampleconf/fields b/src/sampleconf/fields index 831cb48d..711b7b70 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -1,4 +1,4 @@ -# @(#$Id: fields,v 1.2 2008-09-15 08:03:37 dockes Exp $ (C) 2007 J.F.Dockes +# @(#$Id: fields,v 1.3 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes # Field names configuration. This defines how one may search ie for # author:Hemingway # Important: @@ -43,13 +43,18 @@ stored = author ########################## # This section defines field names aliases or synonyms. Any right hand side # value will be turned into the lhs canonic name before further treatment -# Left-hand values must match names in the prefixes section or -# data-record fields. -# Note to filter writers: only canonic names should be used when indexing. +# +# The left-hand values in the recoll distribution file are well known and +# must match names used in the c++ code, or even the index data +# record. They can't change! But you can add others. +# +# Filters should only add canonic names to the meta array when indexing, +# not aliases. + [aliases] abstract = summary dc:summary description xesam:description author = creator dc:creator xesam:author xesam:creator -caption = title dc:title subject +title = title dc:title subject # catg = dc:type contentCategory dbytes = size xesam:size dmtime = date dc:date dc:datemodified datemodified contentmodified \ @@ -64,5 +69,6 @@ url = dc:identifier xesam:url ######################### # This section defines a hierarchy for field names. Searching for a lhs # ancestor will be expanded to a search for itself and all rhs descendants +# This is not used for now [specialisations] author = from