general field name handling cleanup + sort facility in rclquery

This commit is contained in:
dockes 2008-09-16 08:18:30 +00:00
parent 5cc1de9aad
commit 7d30485f87
18 changed files with 556 additions and 297 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.57 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.58 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -467,27 +467,21 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
// Build a direct map avoiding all indirections for field to
// prefix translation
// Add direct prefixes
// Add direct prefixes from the [prefixes] section
list<string>tps = m_fields->getNames("prefixes");
for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
string val;
m_fields->get(*it, val, "prefixes");
m_fldtopref[*it] = val;
m_fldtopfx[stringtolower(*it)] = val;
}
// Add prefixes for aliases:
// Add prefixes for aliases (build alias-to-canonic map while we're at it)
tps = m_fields->getNames("aliases");
for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
string canonic = *it; // canonic name
string canonic = stringtolower(*it); // canonic name
string pfx;
map<string,string>::const_iterator pit = m_fldtopref.find(canonic);
if (pit != m_fldtopref.end()) {
map<string,string>::const_iterator pit = m_fldtopfx.find(canonic);
if (pit != m_fldtopfx.end()) {
pfx = pit->second;
} else {
// Note: it's perfectly normal to have no prefix for the canonic
// name, this could be a stored, not indexed field
LOGDEB2(("RclConfig::readFieldsConfig: no pfx for canonic [%s]\n",
canonic.c_str()));
continue;
}
string aliases;
m_fields->get(canonic, aliases, "aliases");
@ -495,12 +489,14 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
stringToStrings(aliases, l);
for (list<string>::const_iterator ait = l.begin();
ait != l.end(); ait++) {
m_fldtopref[*ait] = pfx;
if (!pfx.empty())
m_fldtopfx[stringtolower(*ait)] = pfx;
m_aliastocanon[stringtolower(*ait)] = canonic;
}
}
#if 0
for (map<string,string>::const_iterator it = m_fldtopref.begin();
it != m_fldtopref.end(); it++) {
for (map<string,string>::const_iterator it = m_fldtopfx.begin();
it != m_fldtopfx.end(); it++) {
LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n",
it->first.c_str(), it->second.c_str()));
}
@ -512,8 +508,9 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
stringToStrings(ss, sl);
for (list<string>::const_iterator it = sl.begin();
it != sl.end(); it++) {
LOGDEB(("Inserting [%s] in stored list\n", (*it).c_str()));
m_storedFields.insert(*it);
string fld = fieldCanon(stringtolower(*it));
LOGDEB(("Inserting [%s] in stored list\n", fld.c_str()));
m_storedFields.insert(fld);
}
}
@ -521,10 +518,11 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
}
// Return term indexing prefix for field name (ie: "filename" -> "XSFN")
// The input must be a canonical field name (alias translation done already)
bool RclConfig::getFieldPrefix(const string& fld, string &pfx)
{
map<string,string>::const_iterator pit = m_fldtopref.find(fld);
if (pit != m_fldtopref.end()) {
map<string,string>::const_iterator pit = m_fldtopfx.find(fld);
if (pit != m_fldtopfx.end()) {
pfx = pit->second;
return true;
} else {
@ -572,10 +570,13 @@ bool RclConfig::getFieldSpecialisationPrefixes(const string& fld,
pfxes.unique();
return true;
}
bool RclConfig::fieldIsStored(const string& fld)
string RclConfig::fieldCanon(const string& fld)
{
set<string>::const_iterator it = m_storedFields.find(fld);
return it != m_storedFields.end();
map<string, string>::const_iterator it = m_aliastocanon.find(fld);
if (it != m_aliastocanon.end())
return it->second;
return fld;
}
string RclConfig::getMimeViewerDef(const string &mtype)

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.40 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rclconfig.h,v 1.41 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes */
#include <list>
#include <string>
@ -144,7 +144,7 @@ class RclConfig {
/** mimeconf: get list of mime types for category */
bool getMimeCatTypes(const string& cat, list<string>&);
/** mimeconf: get field prefix from field name */
/** fields: get field prefix from field name */
bool getFieldPrefix(const string& fldname, string &pfx);
/** Get implied meanings for field name (ie: author->[author, from]) */
bool getFieldSpecialisations(const string& fld,
@ -152,8 +152,9 @@ class RclConfig {
/** Get prefixes for specialisations of field name */
bool getFieldSpecialisationPrefixes(const string& fld,
list<string>& pfxes);
bool fieldIsStored(const string& fld);
const set<string>& getStoredFields() {return m_storedFields;}
/** Get canonic name for possible alias */
string fieldCanon(const string& fld);
/** mimeview: get/set external viewer exec string(s) for mimetype(s) */
string getMimeViewerDef(const string &mimetype);
@ -196,7 +197,8 @@ class RclConfig {
ConfStack<ConfSimple> *mimeconf; // but their content may depend on it.
ConfStack<ConfSimple> *mimeview; //
ConfStack<ConfSimple> *m_fields;
map<string, string> m_fldtopref;
map<string, string> m_fldtopfx;
map<string, string> m_aliastocanon;
set<string> m_storedFields;
void *m_stopsuffixes;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.41 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.42 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -296,7 +296,6 @@ static const string keyds("description");
static const string keyfn("filename");
static const string keymd("modificationdate");
static const string keymt("mimetype");
static const string keyoc("origcharset");
static const string keytt("title");
bool FileInterner::dijontorcl(Rcl::Doc& doc)
@ -310,7 +309,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
doc.text = it->second;
} else if (it->first == keymd) {
doc.dmtime = it->second;
} else if (it->first == keyoc) {
} else if (it->first == Rcl::Doc::keyoc) {
doc.origcharset = it->second;
} else if (it->first == keymt || it->first == keycs) {
// don't need these.

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.11 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes";
static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.12 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes";
#endif
@ -84,31 +84,33 @@ SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs)
return 0;
}
/* Note: addclause necessite And/Or vient du fait que le string peut avoir
plusieurs mots. A transferer dans l'i/f Python ou pas ? */
PyDoc_STRVAR(doc_addClause,
"addClause(type='and'|'or'|'excl'|'phrase'|'near'|'sub', qstring=string,\n"
" slack=int, field=string, subSearch=SearchData,\n"
" slack=int, field=string, subSearch=SearchData)\n"
"Adds a simple clause to the SearchData And/Or chain, or a subquery\n"
"defined by another SearchData object\n"
);
/* Note: the And/Or type is needed because the query string may hold
several words. Should this be moved to the Python interface or not? */
/* Forward decl, def needs recoll_searchDataTyep */
/* Forward declaration only, definition needs recoll_searchDataType */
static PyObject *
SearchData_addClause(recoll_SearchDataObject* self, PyObject *args,
PyObject *kwargs);
static PyMethodDef SearchData_methods[] = {
{"addClause", (PyCFunction)SearchData_addClause, METH_VARARGS|METH_KEYWORDS,
doc_addClause
},
doc_addClause},
{NULL} /* Sentinel */
};
PyDoc_STRVAR(doc_SearchDataObject,
"SearchData()\n"
"\n"
"A SearchData object describes a query.\n"
"A SearchData object describes a query. It has a number of global parameters\n"
"and a chain of search clauses.\n"
);
static PyTypeObject recoll_SearchDataType = {
PyObject_HEAD_INIT(NULL)
@ -165,9 +167,9 @@ SearchData_addClause(recoll_SearchDataObject* self, PyObject *args,
static char *kwlist[] = {"type", "qstring", "slack", "field",
"subsearch", NULL};
char *tp = 0;
char *qs = 0;
char *qs = 0; // needs freeing
int slack = 0;
char *fld = 0;
char *fld = 0; // needs freeing
recoll_SearchDataObject *sub = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ses|iesO!", kwlist,
&tp, "utf-8", &qs, &slack,
@ -221,11 +223,13 @@ SearchData_addClause(recoll_SearchDataObject* self, PyObject *args,
PyErr_SetString(PyExc_AttributeError, "Bad tp arg");
return 0;
}
PyMem_Free(qs);
PyMem_Free(fld);
self->sd->addClause(cl);
Py_RETURN_NONE;
}
///////////////////////////////////////////////////////////////////////
///// Doc code
typedef struct {
@ -272,38 +276,86 @@ Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
return 0;
}
// The "closure" thing is actually the meta field name. This is how
// python allows one set of get/set functions to get/set different
// attributes (pass them an additional parameters as from the
// getseters table and call it a "closure"
// Attribute getter for Doc objects (installed as tp_getattr).
//
// The requested name is lowercased and translated to its canonic field
// name through rclconfig->fieldCanon(). Fields which live in dedicated
// Rcl::Doc members are served from there; every other name is looked up
// in the doc meta[] array.
//
// Returns a new Python unicode object decoded from UTF-8 (empty when the
// field has no value), or 0 with AttributeError set if the underlying
// doc pointer is null or not registered in the_docs.
static PyObject *
Doc_getattr(recoll_DocObject *self, char *name)
{
    LOGDEB(("Doc_getattr: name [%s]\n", name));
    if (self->doc == 0 ||
        the_docs.find(self->doc) == the_docs.end()) {
        PyErr_SetString(PyExc_AttributeError, "doc");
        return 0;
    }
#if 0
    for (map<string,string>::const_iterator it = self->doc->meta.begin();
         it != self->doc->meta.end(); it++) {
        LOGDEB(("meta[%s] -> [%s]\n", it->first.c_str(), it->second.c_str()));
    }
#endif
    string key = rclconfig->fieldCanon(stringtolower(string(name)));

    // Handle the special cases (values kept in dedicated members) first.
    // This is a plain if/else chain rather than a switch on the first
    // letter of the key, so that:
    //  - an empty key cannot throw (no key.at(0)),
    //  - any key which is not special ends up in the meta[] lookup, even
    //    if it shares its first letter with one of the special names.
    string value;
    if (!key.compare(Rcl::Doc::keyfs)) {
        value = self->doc->fbytes;
    } else if (!key.compare(Rcl::Doc::keyfn)) {
        value = self->doc->utf8fn;
    } else if (!key.compare(Rcl::Doc::keyfmt)) {
        value = self->doc->fmtime;
    } else if (!key.compare(Rcl::Doc::keyds)) {
        value = self->doc->dbytes;
    } else if (!key.compare(Rcl::Doc::keydmt)) {
        value = self->doc->dmtime;
    } else if (!key.compare(Rcl::Doc::keyipt)) {
        value = self->doc->ipath;
    } else if (!key.compare(Rcl::Doc::keytp)) {
        value = self->doc->mimetype;
    } else if (!key.compare(Rcl::Doc::keymt)) {
        // Prefer the document time, fall back to the file time
        value = self->doc->dmtime.empty() ? self->doc->fmtime :
            self->doc->dmtime;
    } else if (!key.compare(Rcl::Doc::keyoc)) {
        value = self->doc->origcharset;
    } else if (!key.compare(Rcl::Doc::keysig)) {
        value = self->doc->sig;
    } else if (!key.compare(Rcl::Doc::keysz)) {
        // Prefer the document size, fall back to the file size
        value = self->doc->dbytes.empty() ? self->doc->fbytes :
            self->doc->dbytes;
    } else {
        // Use find() and not operator[]: reading a missing attribute
        // must not insert an empty entry in the meta array.
        map<string,string>::const_iterator mit = self->doc->meta.find(key);
        if (mit != self->doc->meta.end())
            value = mit->second;
    }

    LOGDEB(("Doc_getattr: [%s] (%s) -> [%s]\n",
            name, key.c_str(), value.c_str()));
    // Return a python unicode object
    PyObject* res = PyUnicode_Decode(value.c_str(), value.size(), "UTF-8",
                                     "replace");
    return res;
}
static int
Doc_setmeta(recoll_DocObject *self, PyObject *value, void *closure)
Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
{
if (self->doc == 0 ||
the_docs.find(self->doc) == the_docs.end()) {
@ -330,56 +382,55 @@ Doc_setmeta(recoll_DocObject *self, PyObject *value, void *closure)
}
char* uvalue = PyString_AsString(putf8);
const char *key = (const char *)closure;
if (key == 0) {
PyErr_SetString(PyExc_AttributeError, "key??");
if (name == 0) {
PyErr_SetString(PyExc_AttributeError, "name??");
return -1;
}
LOGDEB0(("Doc_setmeta: setting [%s] to [%s]\n", key, uvalue));
self->doc->meta[key] = uvalue;
switch (key[0]) {
LOGDEB0(("Doc_setattr: setting [%s] to [%s]\n", name, uvalue));
self->doc->meta[name] = uvalue;
switch (name[0]) {
case 'd':
if (!strcmp(key, "dbytes")) {
if (!strcmp(name, "dbytes")) {
self->doc->dbytes = uvalue;
}
break;
case 'f':
if (!strcmp(key, "fbytes")) {
if (!strcmp(name, "fbytes")) {
self->doc->fbytes = uvalue;
}
break;
case 'i':
if (!strcmp(key, "ipath")) {
if (!strcmp(name, "ipath")) {
self->doc->ipath = uvalue;
}
break;
case 'm':
if (!strcmp(key, "mimetype")) {
if (!strcmp(name, "mimetype")) {
self->doc->mimetype = uvalue;
} else if (!strcmp(key, "mtime")) {
} else if (!strcmp(name, "mtime")) {
self->doc->dmtime = uvalue;
}
break;
case 's':
if (!strcmp(key, "sig")) {
if (!strcmp(name, "sig")) {
self->doc->sig = uvalue;
}
break;
case 't':
if (!strcmp(key, "text")) {
if (!strcmp(name, "text")) {
self->doc->text = uvalue;
}
break;
case 'u':
if (!strcmp(key, "url")) {
if (!strcmp(name, "url")) {
self->doc->url = uvalue;
}
break;
}
return 0;
}
#if 0
static PyGetSetDef Doc_getseters[] = {
// Name, get, set, doc, closure
{"url", (getter)Doc_getmeta, (setter)Doc_setmeta,
@ -410,6 +461,7 @@ static PyGetSetDef Doc_getseters[] = {
"sig", (void *)"sig"},
{NULL} /* Sentinel */
};
#endif
PyDoc_STRVAR(doc_DocObject,
"Doc()\n"
@ -427,8 +479,8 @@ static PyTypeObject recoll_DocType = {
0, /*tp_itemsize*/
(destructor)Doc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
(getattrfunc)Doc_getattr, /*tp_getattr*/
(setattrfunc)Doc_setattr, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
@ -450,7 +502,7 @@ static PyTypeObject recoll_DocType = {
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
Doc_getseters, /* tp_getset */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
@ -470,7 +522,10 @@ typedef struct {
/* Type-specific fields go here. */
Rcl::Query *query;
int next; // Index of result to be fetched next or -1 if uninit
char *sortfield;
int ascending;
} recoll_QueryObject;
/////////////////////////////////////////////
/// Query object
static void
@ -481,6 +536,7 @@ Query_dealloc(recoll_QueryObject *self)
the_queries.erase(self->query);
delete self->query;
self->query = 0;
self->sortfield = 0;
self->ob_type->tp_free((PyObject*)self);
}
@ -495,6 +551,7 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
return 0;
self->query = 0;
self->next = -1;
self->sortfield = 0;
return (PyObject *)self;
}
@ -511,9 +568,29 @@ Query_init(recoll_QueryObject *self, PyObject *, PyObject *)
delete self->query;
self->query = 0;
self->next = -1;
self->sortfield = 0;
self->ascending = true;
return 0;
}
PyDoc_STRVAR(doc_Query_sortby,
"sortby(field=fieldname, ascending=true)\n"
"Sort results by 'fieldname', in ascending or descending order.\n"
"Only one field can be used, no subsorts for now.\n"
);
// Python method Query.sortby(field, ascending=1): record the field name
// and direction which the next execute()/executesd() call will pass to
// SearchData::setSortBy().
//
// The "s" format unit yields a pointer into the argument object's own
// buffer, which is only valid while that object is alive. As the value is
// kept in self->sortfield beyond this call, a private copy is stored
// instead of the borrowed pointer.
//
// Returns None, or 0 with a Python exception set on bad arguments or
// memory exhaustion (in which case the previous sort settings are kept).
static PyObject *
Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
    LOGDEB(("Query_sortby\n"));
    static char *kwlist[] = {"field", "ascending", NULL};
    char *fld = 0; // borrowed from the argument object: do not free
    int ascending = self->ascending;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|i", kwlist,
                                     &fld, &ascending))
        return 0;

    size_t sz = strlen(fld) + 1;
    char *copy = (char *)PyMem_Malloc(sz);
    if (copy == 0)
        return PyErr_NoMemory();
    memcpy(copy, fld, sz);

    // Release the copy made by a previous call (PyMem_Free(0) is a no-op).
    // NOTE(review): the other places which reset sortfield to 0 should
    // PyMem_Free() it first, else the last copy is leaked.
    PyMem_Free(self->sortfield);
    self->sortfield = copy;
    self->ascending = ascending;
    Py_RETURN_NONE;
}
PyDoc_STRVAR(doc_Query_execute,
"execute(query_string, stemmming=1|0)\n"
"\n"
@ -527,14 +604,17 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB(("Query_execute\n"));
static char *kwlist[] = {"query_string", "stemming", NULL};
char *utf8 = 0;
char *sutf8 = 0; // needs freeing
int dostem = 1;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|i:Query_execute",
kwlist, "utf-8", &utf8,
kwlist, "utf-8", &sutf8,
&dostem)) {
return 0;
}
LOGDEB(("Query_execute: [%s]\n", utf8));
LOGDEB(("Query_execute: [%s]\n", sutf8));
string utf8(sutf8);
PyMem_Free(sutf8);
if (self->query == 0 ||
the_queries.find(self->query) == the_queries.end()) {
PyErr_SetString(PyExc_AttributeError, "query");
@ -542,11 +622,12 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
}
string reason;
Rcl::SearchData *sd = wasaStringToRcl(utf8, reason);
PyMem_Free(utf8);
if (!sd) {
PyErr_SetString(PyExc_ValueError, reason.c_str());
return 0;
}
sd->setSortBy(self->sortfield, self->ascending);
RefCntr<Rcl::SearchData> rq(sd);
self->query->setQuery(rq, dostem?Rcl::Query::QO_STEM:Rcl::Query::QO_NONE);
int cnt = self->query->getResCnt();
@ -557,7 +638,7 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyDoc_STRVAR(doc_Query_executesd,
"execute(SearchData, stemming=1|0)\n"
"\n"
"Starts a search for the query defined by SearchData.\n"
"Starts a search for the query defined by the SearchData object.\n"
);
static PyObject *
@ -576,6 +657,7 @@ Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "query");
return 0;
}
pysd->sd->setSortBy(self->sortfield, self->ascending);
self->query->setQuery(pysd->sd, dostem ? Rcl::Query::QO_STEM :
Rcl::Query::QO_NONE);
int cnt = self->query->getResCnt();
@ -616,18 +698,22 @@ Query_fetchone(recoll_QueryObject* self, PyObject *, PyObject *)
return 0;
}
self->next++;
// Move some data from the dedicated fields to the meta array to make
// fetching attributes easier
// fetching attributes easier. Is this actually needed ? Useful for
// url and relevancy rating which are also formatted .
Rcl::Doc *doc = result->doc;
printableUrl(rclconfig->getDefCharset(), doc->url, doc->meta["url"]);
doc->meta["mimetype"] = doc->mimetype;
doc->meta["mtime"] = doc->dmtime.empty() ? doc->fmtime : doc->dmtime;
doc->meta["ipath"] = doc->ipath;
doc->meta["fbytes"] = doc->fbytes;
doc->meta["dbytes"] = doc->dbytes;
printableUrl(rclconfig->getDefCharset(), doc->url,
doc->meta[Rcl::Doc::keyurl]);
doc->meta[Rcl::Doc::keytp] = doc->mimetype;
doc->meta[Rcl::Doc::keymt] = doc->dmtime.empty() ?
doc->fmtime : doc->dmtime;
doc->meta[Rcl::Doc::keyipt] = doc->ipath;
doc->meta[Rcl::Doc::keyfs] = doc->fbytes;
doc->meta[Rcl::Doc::keyds] = doc->dbytes;
char pc[20];
sprintf(pc, "%02d %%", percent);
doc->meta["relevance"] = pc;
doc->meta[Rcl::Doc::keyrr] = pc;
return (PyObject *)result;
}
@ -637,7 +723,10 @@ static PyMethodDef Query_methods[] = {
doc_Query_execute},
{"executesd", (PyCFunction)Query_executesd, METH_VARARGS|METH_KEYWORDS,
doc_Query_executesd},
{"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,doc_Query_fetchone},
{"fetchone", (PyCFunction)Query_fetchone, METH_VARARGS,
doc_Query_fetchone},
{"sortby", (PyCFunction)Query_sortby, METH_VARARGS|METH_KEYWORDS,
doc_Query_sortby},
{NULL} /* Sentinel */
};
@ -881,8 +970,8 @@ Db_makeDocAbstract(recoll_DbObject* self, PyObject *args, PyObject *)
static PyObject *
Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds)
{
char *udi = 0;
char *sig = 0;
char *udi = 0; // needs freeing
char *sig = 0; // needs freeing
LOGDEB(("Db_needUpdate\n"));
if (!PyArg_ParseTuple(args, "eses:Db_needUpdate",
"utf-8", &udi, "utf-8", &sig)) {
@ -891,6 +980,8 @@ Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds)
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR(("Db_needUpdate: db not found %p\n", self->db));
PyErr_SetString(PyExc_AttributeError, "db");
PyMem_Free(udi);
PyMem_Free(sig);
return 0;
}
bool result = self->db->needUpdate(udi, sig);
@ -903,16 +994,20 @@ static PyObject *
Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *)
{
LOGDEB(("Db_addOrUpdate\n"));
char *udi = 0;
char *parent_udi = 0;
char *sudi = 0; // needs freeing
char *sparent_udi = 0; // needs freeing
recoll_DocObject *pydoc;
if (!PyArg_ParseTuple(args, "esO!|es:Db_addOrUpdate",
"utf-8", &udi, &recoll_DocType, &pydoc,
"utf-8", &parent_udi)) {
"utf-8", &sudi, &recoll_DocType, &pydoc,
"utf-8", &sparent_udi)) {
return 0;
}
string udi(sudi);
string parent_udi(sparent_udi ? sparent_udi : "");
PyMem_Free(sudi);
PyMem_Free(sparent_udi);
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR(("Db_addOrUpdate: db not found %p\n", self->db));
PyErr_SetString(PyExc_AttributeError, "db");
@ -923,16 +1018,11 @@ Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *)
PyErr_SetString(PyExc_AttributeError, "doc");
return 0;
}
if (!self->db->addOrUpdate(udi, parent_udi?parent_udi:"", *pydoc->doc)) {
if (!self->db->addOrUpdate(udi, parent_udi, *pydoc->doc)) {
LOGERR(("Db_addOrUpdate: rcldb error\n"));
PyErr_SetString(PyExc_AttributeError, "rcldb error");
PyMem_Free(udi);
PyMem_Free(parent_udi);
return 0;
}
PyMem_Free(udi);
if (parent_udi)
PyMem_Free(parent_udi);
Py_RETURN_NONE;
}

View File

@ -27,6 +27,7 @@ module1 = Extension('recoll',
top + 'query/wasatorcl.cpp',
top + 'rcldb/pathhash.cpp',
top + 'rcldb/rcldb.cpp',
top + 'rcldb/rcldoc.cpp',
top + 'rcldb/rclquery.cpp',
top + 'rcldb/searchdata.cpp',
top + 'rcldb/stemdb.cpp',

View File

@ -0,0 +1,33 @@
#!/usr/bin/env python
import sys
import recoll
def dotest(db, q):
query = db.query()
query.sortby("title", 1)
nres = query.executesd(q, stemming = 1)
print "Result count: ", nres
if nres > 10:
nres = 10
while query.next >= 0 and query.next < nres:
doc = query.fetchone()
print query.next
for k in ("url", "mtime", "title", "author", "abstract"):
print k, ":", getattr(doc, k).encode('utf-8')
#abs = db.makeDocAbstract(doc, query).encode('utf-8')
#print abs
print
# End dotest
sd = recoll.SearchData()
sd.addClause("and", "essaouira maroc")
#sd.addClause("and", "dockes", field="author")
#sd.addClause("phrase", "jean francois", 1)
#sd.addClause("excl", "plage")
db = recoll.connect()
dotest(db, sd)
sys.exit(0)

View File

@ -1,24 +1,25 @@
#!/usr/bin/env python
"""
Demo implementation of a xesam server. Run it like
Recoll implementation of a xesam server.
Based on the example in the xesam-tools package by:
Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>
demo/xesam-dummy-service [-s|--state-messages]
Run it like
xesam-recoll-service
And launch a search on it via
./xesam-tool search hello
xesam-tool search hello
You can use the -s or --state-messages switch to enable StateChanged
signal monitoring in xesam-tool as well as in xesam-dummy-service.
"""
# Sets up path to uninstalled xesam module
import demo
import xesam
import xesam.query
from xesam.server import *
import xesam.server
import gobject
import sys
@ -34,15 +35,16 @@ class RecollServer (xesam.server.Searcher):
"""
def __init__ (self):
h_fact = HandleFactory ()
fact = ClientFactory (self, h_fact, RecollSession, RecollSearch)
h_fact = xesam.server.HandleFactory ()
fact = xesam.server.ClientFactory (self, h_fact,
RecollSession, RecollSearch)
xesam.server.Searcher.__init__ (self, h_fact, fact)
self.set_echo_queries (True)
self.rcldb = recoll.connect()
def start (self):
# Export our selves via a SearchServerStub
SearchServerStub(self).start()
xesam.server.SearchServerStub(self).start()
def GetProperty (self, shandle, name):
prop = xesam.server.Searcher.GetProperty(self, shandle, name)
@ -54,33 +56,24 @@ class RecollServer (xesam.server.Searcher):
xesam.debug ("Set property request for '%s=%s', on session '%s', returning %s" % (name, value, shandle,val))
return val
class RecollSession (Session):
class RecollSession (xesam.server.Session):
"""
"""
def __init__ (self, searcher, session_handle):
Session.__init__ (self, searcher, session_handle)
xesam.server.Session.__init__ (self, searcher, session_handle)
self.set_property ("recoll.org", "xesam-recoll-service")
class RecollSearch (Search):
class RecollSearch (xesam.server.Search):
"""
"""
# Translation from known xesam/whatever field names to Recoll Doc elements
FLDTRANS = \
{
"xesam:title" : lambda doc : doc.title,
"xesam:summary" : lambda doc : doc.abstract,
"xesam:mimeType" : lambda doc : doc.mimetype,
"xesam:contentModified" : lambda doc : \
timestampToIso8601(doc.dmtime or doc.fmtime),
"xesam:url" : lambda doc : doc.url
}
SLICE = 10
def __init__ (self, searcher, session, search_handle, \
query=None, xml=None) :
Search.__init__ (self, searcher, session, search_handle, \
xesam.server.Search.__init__ (self, searcher, session, search_handle, \
query=query, xml=xml)
self._hit_fields = session.get_property (xesam.SESSION_HIT_FIELDS)
@ -88,10 +81,7 @@ class RecollSearch (Search):
xesam.error ("Got property hit.fields as None."
" Setting default xesam:url")
self._hit_fields = ["xesam:url"]
print "RecollSearch: fields:", self._hit_fields
# TOBEDONE: if fields includes "snippet" we need to generate
# the synthetic abstract for each returned doc
# Also relevancyRating, ContentCategory et SourceCategory
xesam.debug("RecollSearch: fields:" % self._hit_fields)
xesam.debug ("Created %s with handle %s and query:\n%s" %
(self.__class__, self.get_handle(), self.get_query()))
@ -99,6 +89,21 @@ class RecollSearch (Search):
if not isinstance(self.get_query(), xesam.query.UserQuery):
raise Exception ("Only UserQuery supported ATM, sorry.")
self.rclquery = self._searcher.rcldb.query()
# In the latest version (>0.95), primary/secondary is replaced by
# a field list.
sortfield = session.get_property(xesam.SESSION_SORT_PRIMARY)
order = session.get_property(xesam.SESSION_SORT_ORDER)
# xesam-tool does not know how to set these for now, so let's
# TEST here
sortfield = "contentModified"
order = "descending"
xesam.debug("Session sort primary %s order %s" % (sortfield, order))
# END TEST
if sortfield:
self.rclquery.sortby(sortfield, order == "ascending" and 1 or 0)
def start (self):
xesam.debug ("RecollSearch '%s' got [%s]" %
@ -110,10 +115,16 @@ class RecollSearch (Search):
doc = self.rclquery.fetchone()
data = []
for fld in self._hit_fields:
if self.FLDTRANS.has_key (fld):
data.append(self.FLDTRANS[fld](doc))
# Need to handle ContentCategory and SourceCategory
fld = fld.lower().replace("xesam:", "")
xesam.debug("Adding data for fld %s" % (fld))
if fld == "snippet":
data.append(self._searcher.rcldb.makeDocAbstract(doc,
self.rclquery))
elif fld == "contentmodified":
data.append(timestampToIso8601(getattr(doc, "mtime")))
else:
data.append("")
data.append(getattr(doc, fld, ""))
self.add_new_hit (self._hit_fields, data)
hits += 1
if hits >= self.SLICE:
@ -135,7 +146,7 @@ class RecollSearch (Search):
xesam.debug ("RecollSearch get_hits")
if self._stopped:
return Search.get_hits(self, num_hits)
return xesam.server.Search.get_hits(self, num_hits)
hits = 0
done = 0;
@ -163,7 +174,7 @@ class RecollSearch (Search):
xesam.debug ("Search '%s' emitted 'done'" % self.get_handle())
self.stop()
return Search.get_hits(self, num_hits)
return xesam.server.Search.get_hits(self, num_hits)
if __name__ == "__main__":

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.14 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.15 2008-09-16 08:18:30 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -42,6 +42,33 @@ using namespace std;
#include "internfile.h"
#include "wipedir.h"
// Re-extract the text of a result document from its original file and
// print it on stdout.
//
// @param rclconfig configuration, handed to the FileInterner
// @param tmpdir temporary directory used for the extraction. If it is
//        empty or not accessible, maketmpdir() is called on it. wipedir()
//        is called on it before each extraction.
// @param doc the result document: url, mimetype and ipath are read, and
//        internfile() is called on it (the text which gets printed).
// @return false only if the temporary directory could not be created.
//        Problems specific to this document (not a file url, missing
//        file, extraction failure) are reported on stdout and return true.
bool dump_contents(RclConfig *rclconfig, string& tmpdir, Rcl::Doc& doc)
{
    // Check the scheme before stripping it: substr() throws
    // std::out_of_range if the url is shorter than the prefix, and any
    // other kind of url would yield a meaningless path.
    static const string filescheme("file://");
    if (doc.url.compare(0, filescheme.length(), filescheme) != 0) {
        cout << "Not a file url: " << doc.url << endl;
        return true;
    }
    string fn = doc.url.substr(filescheme.length());

    struct stat st;
    if (stat(fn.c_str(), &st) != 0) {
        cout << "No such file: " << fn << endl;
        return true;
    }
    if (tmpdir.empty() || access(tmpdir.c_str(), 0) < 0) {
        string reason;
        if (!maketmpdir(tmpdir, reason)) {
            cerr << "Cannot create temporary directory: "
                 << reason << endl;
            return false;
        }
    }
    wipedir(tmpdir);
    FileInterner interner(fn, &st, rclconfig, tmpdir, &doc.mimetype);
    if (interner.internfile(doc, doc.ipath)) {
        cout << doc.text << endl;
    } else {
        cout << "Cant intern: " << fn << endl;
    }
    return true;
}
static char *thisprog;
static char usage [] =
" [-o|-a|-f] <query string>\n"
@ -60,6 +87,8 @@ static char usage [] =
" -n <cnt> limit the maximum number of results (0->no limit, default 2000)\n"
" -b : basic. Just output urls, no mime types or titles\n"
" -m : dump the whole document meta[] array\n"
" -S fld : sort by field name\n"
" -D : sort descending\n"
;
static void
Usage(void)
@ -82,10 +111,15 @@ static int op_flags;
#define OPT_q 0x200
#define OPT_t 0x400
#define OPT_m 0x800
#define OPT_D 0x1000
#define OPT_S 0x2000
int recollq(RclConfig **cfp, int argc, char **argv)
{
string a_config;
string sortfield;
int limit = 2000;
thisprog = argv[0];
argc--; argv++;
@ -103,6 +137,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
a_config = *(++argv);
argc--; goto b1;
case 'd': op_flags |= OPT_d; break;
case 'D': op_flags |= OPT_D; break;
case 'f': op_flags |= OPT_f; break;
case 'l': op_flags |= OPT_l; break;
case 'm': op_flags |= OPT_m; break;
@ -112,6 +147,9 @@ int recollq(RclConfig **cfp, int argc, char **argv)
argc--; goto b1;
case 'o': op_flags |= OPT_o; break;
case 'q': op_flags |= OPT_q; break;
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
sortfield = *(++argv);
argc--; goto b1;
case 't': op_flags |= OPT_t; break;
default: Usage(); break;
}
@ -168,6 +206,10 @@ int recollq(RclConfig **cfp, int argc, char **argv)
return 1;
}
if (op_flags & OPT_S) {
sd->setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
}
RefCntr<Rcl::SearchData> rq(sd);
Rcl::Query query(&rcldb);
query.setQuery(rq, Rcl::Query::QO_STEM);
@ -197,6 +239,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
<< "[" << doc.url.c_str() << "]" << "\t"
<< "[" << doc.meta[Rcl::Doc::keytt].c_str() << "]" << "\t"
<< doc.fbytes.c_str() << "\tbytes" << "\t"
<< doc.dmtime.c_str() << "\tSecs" << "\t"
<< endl;
if (op_flags & OPT_m) {
for (map<string,string>::const_iterator it = doc.meta.begin();
@ -204,32 +247,11 @@ int recollq(RclConfig **cfp, int argc, char **argv)
cout << it->first << " = " << it->second << endl;
}
}
cout << endl;
}
if (op_flags & OPT_d) {
string fn = doc.url.substr(7);
struct stat st;
if (stat(fn.c_str(), &st) != 0) {
cout << "No such file: " << fn << endl;
continue;
}
if (tmpdir.empty() || access(tmpdir.c_str(), 0) < 0) {
string reason;
if (!maketmpdir(tmpdir, reason)) {
cerr << "Cannot create temporary directory: "
<< reason << endl;
return 1;
}
}
wipedir(tmpdir);
FileInterner interner(fn, &st, rclconfig, tmpdir, &doc.mimetype);
if (interner.internfile(doc, doc.ipath)) {
cout << doc.text << endl;
} else {
cout << "Cant intern: " << fn << endl;
}
}
dump_contents(rclconfig, tmpdir, doc);
}
}
// Maybe clean up temporary directory

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.144 2008-09-09 12:58:23 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.145 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -129,12 +129,10 @@ bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids)
return false;
}
// Only ONE field name inside the index data record differs from the
// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
// omega
static const string keycap("caption");
static const string keymtp("mtype");
static const string keyfmt("fmtime");
static const string keydmt("dmtime");
static const string keyoc("origcharset");
static const string keyurl("url");
// Turn data record from db into document fields
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
@ -144,11 +142,11 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
ConfSimple parms(&data);
if (!parms.ok())
return false;
parms.get(keyurl, doc.url);
parms.get(keymtp, doc.mimetype);
parms.get(keyfmt, doc.fmtime);
parms.get(keydmt, doc.dmtime);
parms.get(keyoc, doc.origcharset);
parms.get(Doc::keyurl, doc.url);
parms.get(Doc::keytp, doc.mimetype);
parms.get(Doc::keyfmt, doc.fmtime);
parms.get(Doc::keydmt, doc.dmtime);
parms.get(Doc::keyoc, doc.origcharset);
parms.get(keycap, doc.meta[Doc::keytt]);
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
@ -162,10 +160,10 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
char buf[20];
sprintf(buf,"%.2f", float(percent) / 100.0);
doc.meta[Doc::keyrr] = buf;
parms.get(string("ipath"), doc.ipath);
parms.get(string("fbytes"), doc.fbytes);
parms.get(string("dbytes"), doc.dbytes);
parms.get(string("sig"), doc.sig);
parms.get(Doc::keyipt, doc.ipath);
parms.get(Doc::keyfs, doc.fbytes);
parms.get(Doc::keyds, doc.dbytes);
parms.get(Doc::keysig, doc.sig);
doc.xdocid = docid;
// Other, not predefined meta fields:
@ -691,24 +689,25 @@ bool Db::isopen()
// indexed with no prefix (ie: abstract)
bool Db::fieldToPrefix(const string& fldname, string &pfx)
{
// This is the default table
// This is the default table. We prefer the data from rclconfig if
// available
static map<string, string> fldToPrefs;
if (fldToPrefs.empty()) {
fldToPrefs[Doc::keyabs] = string();
fldToPrefs["ext"] = "XE";
fldToPrefs["filename"] = "XSFN";
fldToPrefs[Doc::keyfn] = "XSFN";
fldToPrefs["title"] = "S";
fldToPrefs[keycap] = "S";
fldToPrefs[Doc::keytt] = "S";
fldToPrefs["subject"] = "S";
fldToPrefs[Doc::keyau] = "A";
fldToPrefs["creator"] = "A";
fldToPrefs["from"] = "A";
fldToPrefs[Doc::keykw] = "K";
fldToPrefs["keyword"] = "K";
fldToPrefs["tag"] = "K";
fldToPrefs[Doc::keykw] = "K";
fldToPrefs["tags"] = "K";
}
@ -719,6 +718,7 @@ bool Db::fieldToPrefix(const string& fldname, string &pfx)
if (config && config->getFieldPrefix(fld, pfx))
return true;
// No data in rclconfig? Check default values
map<string, string>::const_iterator it = fldToPrefs.find(fld);
if (it != fldToPrefs.end()) {
pfx = it->second;
@ -816,9 +816,17 @@ void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
m_synthAbsWordCtxLen = syntctxlen;
}
// Left-pad a non-empty string with '0' characters up to a total length of
// len. This is used on the time fields so that their string values sort
// lexicographically in numeric order.
// Empty strings, and strings which are already at least len characters
// long, are left untouched.
static inline void leftzeropad(string& s, unsigned len)
{
    if (s.empty() || s.length() >= len)
        return;
    // insert() modifies s in place: no need to assign the result back to s
    s.insert(0, len - s.length(), '0');
}
// 1024 * 1024: number of bytes in a megabyte
static const int MB = 1024 * 1024;
// Newline, carriage return and form feed: the character set passed to
// neutchars() for the values which go into the document data record
static const string nc("\n\r\x0c");
// Append a newline-terminated "NM=VAL" line to the data record string R.
// The arguments are pasted unparenthesized into a string concatenation,
// so NM is expected to be a string object and VAL a simple expression.
#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}
// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
@ -958,39 +966,43 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
// reasonable lengths and suppress newlines (so that the data
// record can keep a simple syntax)
string record = "url=" + doc.url;
record += "\nmtype=" + doc.mimetype;
record += "\nfmtime=" + doc.fmtime;
string record;
RECORD_APPEND(record, Doc::keyurl, doc.url);
RECORD_APPEND(record, Doc::keytp, doc.mimetype);
// We left-zero-pad the times so that they are lexico-sortable
leftzeropad(doc.fmtime, 11);
RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
if (!doc.dmtime.empty()) {
record += "\ndmtime=" + doc.dmtime;
leftzeropad(doc.dmtime, 11);
RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
}
record += "\norigcharset=" + doc.origcharset;
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
if (!doc.fbytes.empty())
record += string("\nfbytes=") + doc.fbytes;
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
// Note that we add the signature both as a value and in the data record
if (!doc.sig.empty())
record += string("\nsig=") + doc.sig;
RECORD_APPEND(record, Doc::keysig, doc.sig);
newdocument.add_value(VALUE_SIG, doc.sig);
char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
record += string("\ndbytes=") + sizebuf;
RECORD_APPEND(record, Doc::keyds, sizebuf);
if (!doc.ipath.empty())
record += "\nipath=" + doc.ipath;
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
if (doc.meta[Doc::keytt].empty())
doc.meta[Doc::keytt] = doc.utf8fn;
doc.meta[Doc::keytt] =
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);
if (!doc.meta[Doc::keytt].empty())
record += "\n" + keycap + "=" + doc.meta[Doc::keytt];
RECORD_APPEND(record, keycap, doc.meta[Doc::keytt]);
doc.meta[Doc::keykw] =
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);
if (!doc.meta[Doc::keykw].empty())
record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw];
RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
// If abstract is empty, we make up one with the beginning of the
// document. This is then not indexed, but part of the doc data so
@ -1010,22 +1022,23 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
nc);
}
if (!doc.meta[Doc::keyabs].empty())
record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs];
RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
RclConfig *config = RclConfig::getMainConfig();
if (config) {
const set<string>& stored = config->getStoredFields();
for (set<string>::const_iterator it = stored.begin();
it != stored.end(); it++) {
string nm = stringtolower(config->fieldCanon(*it));
if (!doc.meta[*it].empty()) {
string value =
neutchars(truncate_to_word(doc.meta[*it], 150), nc);
record += "\n" + *it + "=" + value;
RECORD_APPEND(record, nm, value);
}
}
}
record += "\n";
LOGDEB0(("Rcl::Db::add: new doc record:\n %s\n", record.c_str()));
LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
newdocument.set_data(record);
const char *fnc = udi.c_str();
@ -1105,21 +1118,6 @@ bool Db::needUpdate(const string &udi, const string& sig)
// Retrieve old file/doc signature from value
string osig = doc.get_value(VALUE_SIG);
#if 0
// Get old sig from data record
string data = doc.get_data();
string::size_type i1, i2;
i1 = data.find("sig=");
if (i1 == string::npos)
return true;
i1 += 4;
if (i1 >= data.length())
return true;
i2 = data.find_first_of("\n\r", i1);
if (i2 == string::npos)
return true;
string osig = data.substr(i1, i2-i1);
#endif
LOGDEB2(("Db::needUpdate: oldsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
// Compare new/old sig
@ -1287,14 +1285,12 @@ bool Db::purgeFile(const string &udi)
return false;
}
// File name wild card expansion. This is a specialisation of termMatch
bool Db::filenameWildExp(const string& fnexp, list<string>& names)
{
// File name search, with possible wildcards.
// We expand wildcards by scanning the filename terms (prefixed
// with XSFN) from the database.
// We build an OR query with the expanded values if any.
string pattern;
dumb_string(fnexp, pattern);
names.clear();
// If pattern is not quoted, and has no wildcards, we add * at
// each end: match any substring
@ -1303,33 +1299,14 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
} else if (pattern.find_first_of("*?[") == string::npos) {
pattern = "*" + pattern + "*";
} // else let it be
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
LOGDEB((" pattern: [%s]\n", pattern.c_str()));
// Match pattern against all file names in the db
string ermsg;
try {
Xapian::TermIterator it = m_ndb->db.allterms_begin();
it.skip_to("XSFN");
for (;it != m_ndb->db.allterms_end(); it++) {
if ((*it).find("XSFN") != 0)
break;
string fn = (*it).substr(4);
LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str()));
if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) {
names.push_back((*it).c_str());
}
// Limit the match count
if (names.size() > 1000) {
LOGERR(("Db::filenameWildExp: too many matched file names\n"));
break;
}
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("filenameWildExp: xapian error: %s\n", ermsg.c_str()));
list<TermMatchEntry> entries;
if (!termMatch(ET_WILD, string(), pattern, entries, 1000, Doc::keyfn))
return false;
}
for (list<TermMatchEntry>::const_iterator it = entries.begin();
it != entries.end(); it++)
names.push_back("XSFN"+it->term);
if (names.empty()) {
// Build an impossible query: we know it's impossible because we
@ -1385,11 +1362,11 @@ const string regSpecChars = "(.[{";
bool Db::termMatch(MatchType typ, const string &lang,
const string &root,
list<TermMatchEntry>& res,
int max)
int max,
const string& field)
{
if (!m_ndb || !m_ndb->m_isopen)
return false;
Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db;
res.clear();
@ -1399,6 +1376,11 @@ bool Db::termMatch(MatchType typ, const string &lang,
dumb_string(root, droot);
string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
string prefix;
if (!field.empty()) {
(void)fieldToPrefix(field, prefix);
}
if (typ == ET_STEM) {
if (!stemExpand(lang, root, res, max))
return false;
@ -1429,33 +1411,43 @@ bool Db::termMatch(MatchType typ, const string &lang,
string::size_type es = droot.find_first_of(nochars);
string is;
switch (es) {
case string::npos: is = droot;break;
case 0: break;
default: is = droot.substr(0, es);break;
case string::npos: is = prefix + droot; break;
case 0: is = prefix; break;
default: is = prefix + droot.substr(0, es); break;
}
LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
Xapian::TermIterator it = db.allterms_begin();
if (!is.empty())
it.skip_to(is.c_str());
for (int n = 0;it != db.allterms_end(); it++) {
// If we're beyond the terms matching the initial string, end
if (!is.empty() && (*it).find(is) != 0)
break;
// Don't match special internal terms beginning with uppercase ascii
if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
continue;
if (typ == ET_WILD) {
if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
continue;
} else {
if (regexec(&reg, (*it).c_str(), 0, 0, 0))
continue;
string ermsg;
try {
Xapian::TermIterator it = db.allterms_begin();
if (!is.empty())
it.skip_to(is.c_str());
for (int n = 0; it != db.allterms_end(); it++) {
// If we're beyond the terms matching the initial string, end
if (!is.empty() && (*it).find(is) != 0)
break;
string term;
if (!prefix.empty())
term = (*it).substr(prefix.length());
else
term = *it;
if (typ == ET_WILD) {
if (fnmatch(droot.c_str(), term.c_str(), 0) == FNM_NOMATCH)
continue;
} else {
if (regexec(&reg, term.c_str(), 0, 0, 0))
continue;
}
// Do we want stem expansion here? We don't do it for now
res.push_back(TermMatchEntry(term, it.get_termfreq()));
++n;
}
// Do we want stem expansion here? We don't do it for now
res.push_back(TermMatchEntry(*it, it.get_termfreq()));
++n;
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("termMatch: %s\n", ermsg.c_str()));
return false;
}
if (typ == ET_REGEXP) {
regfree(&reg);
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.61 2008-08-26 07:38:29 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.62 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -156,9 +156,11 @@ class Db {
* Stem expansion is performed if lang is not empty */
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
bool termMatch(MatchType typ, const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1);
list<TermMatchEntry>& result, int max = -1,
const string& field = "");
/** Specific filename wildcard expansion */
/** Special filename wildcard to XSFN terms expansion.
internal/searchdata use only */
bool filenameWildExp(const string& exp, list<string>& names);
/** Set parameters for synthetic abstract generation */

View File

@ -4,7 +4,7 @@
#include "xapian.h"
namespace Rcl {
/* @(#$Id: rcldb_p.h,v 1.4 2008-09-05 10:34:17 dockes Exp $ (C) 2007 J.F.Dockes */
/* @(#$Id: rcldb_p.h,v 1.5 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes */
// Generic Xapian exception catching code. We do this quite often,
// and I have no idea how to do this except for a macro
@ -70,5 +70,14 @@ class Db::Native {
bool subDocs(const string &udi, vector<Xapian::docid>& docids);
};
// Translate an Rcl::Doc field name into the name used for the same field
// inside the index data record. The names only differ for the title,
// which is stored as "caption" in the data record.
// The returned reference is either the data record name or df itself, so
// it must not be used after the argument has gone away.
inline const string& docfToDatf(const string& df)
{
    static const string keycap("caption");
    if (df == Doc::keytt)
        return keycap;
    return df;
}
}
#endif /* _rcldb_p_h_included_ */

View File

@ -1,14 +1,25 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldoc.cpp,v 1.1 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldoc.cpp,v 1.2 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes";
#endif
#include "rcldoc.h"
namespace Rcl {
const string Doc::keyabs("abstract");
const string Doc::keyau("author");
const string Doc::keyfn("filename");
const string Doc::keykw("keywords");
const string Doc::keyrr("relevancyrating");
const string Doc::keytt("title");
const string Doc::keyurl("url");
const string Doc::keyfn("filename");
const string Doc::keyipt("ipath");
const string Doc::keytp("mtype");
const string Doc::keyfmt("fmtime");
const string Doc::keydmt("dmtime");
const string Doc::keymt("mtime");
const string Doc::keyoc("origcharset");
const string Doc::keyfs("fbytes");
const string Doc::keyds("dbytes");
const string Doc::keysz("size");
const string Doc::keysig("sig");
const string Doc::keyrr("relevancyrating");
const string Doc::keyabs("abstract");
const string Doc::keyau("author");
const string Doc::keytt("title");
const string Doc::keykw("keywords");
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.9 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: rcldoc.h,v 1.10 2008-09-16 08:18:30 dockes Exp $ (C) 2006 J.F.Dockes */
#include <string>
#include <map>
@ -44,9 +44,9 @@ class Doc {
// can be accessed after a query without fetching the actual document).
// We indicate the routine that sets them up during indexing
// This is just "file://" + binary filename. No transcoding: this
// is used to access files
// Index: computed from fn by Db::add caller. Query: from doc data.
// This is just "file://" + binary or url-encoded filename. No
// transcoding: this is used to access files.
// Index: computed from fn by Db::add caller. Query: from doc data.
string url;
// Transcoded version of the simple file name for SFN-prefixed
@ -134,12 +134,29 @@ class Doc {
pc = 0;
xdocid = 0;
}
static const string keyfn;
static const string keyrr;
static const string keyabs;
static const string keyau;
static const string keytt;
static const string keykw;
// The official names for recoll native fields when used in a text
// context (ie: the python interface duplicates some of the fixed
// fields in the meta array, these are the names used). Defined in
// rcldoc.cpp. For fields stored in the meta[] array (ie, title,
// author), filters _must_ use these values
static const string keyurl; // url
static const string keyfn; // file name
static const string keyipt; // ipath
static const string keytp; // mime type
static const string keyfmt; // file mtime
static const string keydmt; // document mtime
static const string keymt; // mtime dmtime if set else fmtime
static const string keyoc; // original charset
static const string keyfs; // file size
static const string keyds; // document size
static const string keysz; // dbytes if set else fbytes
static const string keysig; // sig
static const string keyrr; // relevancy rating
static const string keyabs; // abstract
static const string keyau; // author
static const string keytt; // title
static const string keykw; // keywords
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.5 2008-09-05 11:45:16 dockes Exp $ (C) 2008 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.6 2008-09-16 08:18:30 dockes Exp $ (C) 2008 J.F.Dockes";
#endif
#include <stdlib.h>
@ -8,6 +8,8 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.5 2008-09-05 11:45:16 dockes Exp
#include <list>
#include <vector>
#include "xapian/sorter.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
@ -20,6 +22,8 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.5 2008-09-05 11:45:16 dockes Exp
#ifndef NO_NAMESPACES
namespace Rcl {
#endif
class FilterMatcher : public Xapian::MatchDecider {
public:
FilterMatcher(const string &topdir)
@ -41,7 +45,7 @@ public:
// The only filtering for now is on file path (subtree)
string url;
parms.get(string("url"), url);
parms.get(Doc::keyurl, url);
LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
m_topdir.c_str(), url.c_str()));
if (url.find(m_topdir, 7) == 7) {
@ -55,14 +59,46 @@ private:
string m_topdir;
};
// Sort helper class: computes the sort key for a document by extracting
// the value of one field from the document data record.
class QSorter : public Xapian::Sorter {
public:
    // f is the Rcl::Doc name of the sort field. It is translated to the
    // name used inside the data record, and the "=" separator is appended
    // so that we can search for "name=" directly.
    QSorter(const string& f) : m_fld(docfToDatf(f) + "=") {}

    // Return the value of the sort field for xdoc. An empty string is
    // returned if the field is not found in the data record, if it has no
    // value, or if its line is not newline-terminated.
    virtual std::string operator()(const Xapian::Document& xdoc) const {
        string data = xdoc.get_data();

        // It would be simpler to do the record->Rcl::Doc thing, but
        // hand-doing this will be faster. It makes more assumptions
        // about the format than a ConfTree though:
        // we expect "name=value" lines, with the name at the line start.
        //
        // Only accept a match at the beginning of the record or right
        // after a line end: else "mtime=" would match inside "fmtime=",
        // or the name could be found inside another field's value.
        string::size_type i1 = data.find(m_fld);
        while (i1 != string::npos && i1 != 0 &&
               data[i1 - 1] != '\n' && data[i1 - 1] != '\r') {
            i1 = data.find(m_fld, i1 + 1);
        }
        if (i1 == string::npos)
            return string();

        // Skip "name=": the value starts here
        i1 += m_fld.length();
        if (i1 >= data.length())
            return string();

        // The value extends to the end of the line
        string::size_type i2 = data.find_first_of("\n\r", i1);
        if (i2 == string::npos)
            return string();
        return data.substr(i1, i2 - i1);
    }

private:
    // Data record field name with "=" appended
    string m_fld;
};
Query::Query(Db *db)
: m_nq(new Native(this)), m_db(db)
: m_nq(new Native(this)), m_db(db), m_sorter(0)
{
}
// Release the native query object and the sort helper, if any.
Query::~Query()
{
    deleteZ(m_nq);
    // m_sorter is declared as a void* in the class: it has to be cast back
    // to its real type for the delete. Deleting a null pointer is a no-op,
    // so no test is needed.
    delete static_cast<QSorter*>(m_sorter);
    m_sorter = 0;
}
string Query::getReason() const
@ -75,6 +111,7 @@ Db *Query::whatDb()
return m_db;
}
//#define ISNULL(X) (X).isNull()
#define ISNULL(X) !(X)
@ -114,6 +151,17 @@ bool Query::setQuery(RefCntr<SearchData> sdata, int opts,
try {
m_nq->enquire = new Xapian::Enquire(m_db->m_ndb->db);
m_nq->enquire->set_query(m_nq->query);
if (!sdata->getSortBy().empty()) {
if (m_sorter) {
delete (QSorter*)m_sorter;
m_sorter = 0;
}
m_sorter = new QSorter(sdata->getSortBy());
// It really seems there is a xapian bug about sort order, we
// invert here.
m_nq->enquire->set_sort_by_key((QSorter*)m_sorter,
!sdata->getSortAscending());
}
m_nq->mset = Xapian::MSet();
// Get the query description and trim the "Xapian::Query"
d = m_nq->query.get_description();

View File

@ -1,6 +1,6 @@
#ifndef _rclquery_h_included_
#define _rclquery_h_included_
/* @(#$Id: rclquery.h,v 1.2 2008-07-01 08:31:08 dockes Exp $ (C) 2008 J.F.Dockes */
/* @(#$Id: rclquery.h,v 1.3 2008-09-16 08:18:30 dockes Exp $ (C) 2008 J.F.Dockes */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -87,6 +87,7 @@ private:
string m_filterTopDir; // Current query filter on subtree top directory
string m_reason; // Error explanation
Db *m_db;
void *m_sorter;
unsigned int m_qOpts;
/* Copy constructor and assignment are private and forbidden */
Query(const Query &) {}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.22 2008-08-28 15:42:43 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.23 2008-09-16 08:18:30 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -153,6 +153,14 @@ void SearchData::erase() {
m_reason.erase();
}
void SearchData::setSortBy(const string& fld, bool ascending) {
RclConfig *cfg = RclConfig::getMainConfig();
m_sortField = cfg->fieldCanon(stringtolower(fld));
m_sortAscending = ascending;
LOGDEB0(("SearchData::setSortBy: [%s] %s\n", m_sortField.c_str(),
m_sortAscending ? "ascending" : "descending"));
}
// Am I a file name only search ? This is to turn off term highlighting
bool SearchData::fileNameOnly()
{
@ -572,9 +580,9 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
list<string> names;
for (list<string>::iterator it = patterns.begin();
it != patterns.end(); it++) {
// This relies on filenameWildExp not resetting and always
// adding to the input
db.filenameWildExp(*it, names);
list<string> more;
db.filenameWildExp(*it, more);
names.splice(names.end(), more);
}
// Build a query out of the matching file name terms.
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());

View File

@ -16,7 +16,7 @@
*/
#ifndef _SEARCHDATA_H_INCLUDED_
#define _SEARCHDATA_H_INCLUDED_
/* @(#$Id: searchdata.h,v 1.17 2008-09-08 15:47:44 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: searchdata.h,v 1.18 2008-09-16 08:18:30 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* Structures to hold data coming almost directly from the gui
@ -110,11 +110,17 @@ public:
/** Add file type for filtering results */
void addFiletype(const string& ft) {m_filetypes.push_back(ft);}
/** Choose sort order. Should this be in RclQuery instead ? */
void setSortBy(const string& fld, bool ascending = true);
const string& getSortBy() const {return m_sortField;}
bool getSortAscending() const {return m_sortAscending;}
private:
SClType m_tp; // Only SCLT_AND or SCLT_OR here
vector<SearchDataClause *> m_query;
vector<string> m_filetypes; // Restrict to filetypes if set.
string m_topdir; // Restrict to subtree.
SClType m_tp; // Only SCLT_AND or SCLT_OR here
vector<SearchDataClause*> m_query;
vector<string> m_filetypes; // Restrict to filetypes if set.
string m_topdir; // Restrict to subtree.
string m_sortField;
bool m_sortAscending;
// Printable expanded version of the complete query, retrieved/set
// from rcldb after the Xapian::setQuery() call
string m_description;

View File

@ -1,4 +1,4 @@
# @(#$Id: fields,v 1.2 2008-09-15 08:03:37 dockes Exp $ (C) 2007 J.F.Dockes
# @(#$Id: fields,v 1.3 2008-09-16 08:18:30 dockes Exp $ (C) 2007 J.F.Dockes
# Field names configuration. This defines how one may search ie for
# author:Hemingway
# Important:
@ -43,13 +43,18 @@ stored = author
##########################
# This section defines field names aliases or synonyms. Any right hand side
# value will be turned into the lhs canonic name before further treatment
# Left-hand values must match names in the prefixes section or
# data-record fields.
# Note to filter writers: only canonic names should be used when indexing.
#
# The left-hand values in the recoll distribution file are well known and
# must match names used in the c++ code, or even the index data
# record. They can't change! But you can add others.
#
# Filters should only add canonic names to the meta array when indexing,
# not aliases.
[aliases]
abstract = summary dc:summary description xesam:description
author = creator dc:creator xesam:author xesam:creator
caption = title dc:title subject
title = title dc:title subject
# catg = dc:type contentCategory
dbytes = size xesam:size
dmtime = date dc:date dc:datemodified datemodified contentmodified \
@ -64,5 +69,6 @@ url = dc:identifier xesam:url
#########################
# This section defines a hierarchy for field names. Searching for a lhs
# ancestor will be expanded to a search for itself and all rhs descendants
# This is not used for now
[specialisations]
author = from