When storing doc text, always use a metadata entry. Get rid of the code to
store it in the data record. Make storing the default. Add a "fetchtext"
parameter to getDoc() to fetch and store the text in doc.text. Make this
accessible from Python. Misc comments and indents.
This commit is contained in:
Jean-Francois Dockes 2018-01-25 13:20:02 +01:00
parent 2eaefa2b5d
commit 3d4fd3c62e
7 changed files with 233 additions and 292 deletions

View File

@ -63,7 +63,8 @@ typedef struct {
static void
SearchData_dealloc(recoll_SearchDataObject *self)
{
LOGDEB("SearchData_dealloc. Releasing. Count before: " << (self->sd.use_count()) << "\n" );
LOGDEB("SearchData_dealloc. Releasing. Count before: " <<
self->sd.use_count() << "\n");
self->sd.reset();
Py_TYPE(self)->tp_free((PyObject*)self);
}
@ -71,7 +72,7 @@ SearchData_dealloc(recoll_SearchDataObject *self)
static PyObject *
SearchData_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
LOGDEB("SearchData_new\n" );
LOGDEB("SearchData_new\n");
recoll_SearchDataObject *self;
self = (recoll_SearchDataObject *)type->tp_alloc(type, 0);
@ -90,7 +91,7 @@ PyDoc_STRVAR(doc_SearchDataObject,
static int
SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs)
{
LOGDEB("SearchData_init\n" );
LOGDEB("SearchData_init\n");
static const char* kwlist[] = {"type", "stemlang", NULL};
char *stp = 0;
char *steml = 0;
@ -180,9 +181,9 @@ static PyObject *
SearchData_addclause(recoll_SearchDataObject* self, PyObject *args,
PyObject *kwargs)
{
LOGDEB0("SearchData_addclause\n" );
LOGDEB0("SearchData_addclause\n");
if (!self->sd) {
LOGERR("SearchData_addclause: not init??\n" );
LOGERR("SearchData_addclause: not init??\n");
PyErr_SetString(PyExc_AttributeError, "sd");
return 0;
}
@ -294,7 +295,7 @@ SearchData_addclause(recoll_SearchDataObject* self, PyObject *args,
static void
Doc_dealloc(recoll_DocObject *self)
{
LOGDEB("Doc_dealloc\n" );
LOGDEB("Doc_dealloc\n");
if (self->doc)
the_docs.erase(self->doc);
deleteZ(self->doc);
@ -304,7 +305,7 @@ Doc_dealloc(recoll_DocObject *self)
static PyObject *
Doc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
LOGDEB("Doc_new\n" );
LOGDEB("Doc_new\n");
recoll_DocObject *self;
self = (recoll_DocObject *)type->tp_alloc(type, 0);
@ -318,7 +319,7 @@ Doc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
static int
Doc_init(recoll_DocObject *self, PyObject *, PyObject *)
{
LOGDEB("Doc_init\n" );
LOGDEB("Doc_init\n");
if (self->doc)
the_docs.erase(self->doc);
delete self->doc;
@ -340,7 +341,7 @@ PyDoc_STRVAR(doc_Doc_getbinurl,
static PyObject *
Doc_getbinurl(recoll_DocObject *self)
{
LOGDEB0("Doc_getbinurl\n" );
LOGDEB0("Doc_getbinurl\n");
if (self->doc == 0 ||
the_docs.find(self->doc) == the_docs.end()) {
PyErr_SetString(PyExc_AttributeError, "doc");
@ -359,7 +360,7 @@ PyDoc_STRVAR(doc_Doc_setbinurl,
static PyObject *
Doc_setbinurl(recoll_DocObject *self, PyObject *value)
{
LOGDEB0("Doc_setbinurl\n" );
LOGDEB0("Doc_setbinurl\n");
if (self->doc == 0 ||
the_docs.find(self->doc) == the_docs.end()) {
PyErr_SetString(PyExc_AttributeError, "doc??");
@ -381,7 +382,7 @@ PyDoc_STRVAR(doc_Doc_keys,
static PyObject *
Doc_keys(recoll_DocObject *self)
{
LOGDEB0("Doc_keys\n" );
LOGDEB0("Doc_keys\n");
if (self->doc == 0 ||
the_docs.find(self->doc) == the_docs.end()) {
PyErr_SetString(PyExc_AttributeError, "doc");
@ -406,7 +407,7 @@ PyDoc_STRVAR(doc_Doc_items,
static PyObject *
Doc_items(recoll_DocObject *self)
{
LOGDEB0("Doc_items\n" );
LOGDEB0("Doc_items\n");
if (self->doc == 0 ||
the_docs.find(self->doc) == the_docs.end()) {
PyErr_SetString(PyExc_AttributeError, "doc");
@ -516,7 +517,7 @@ PyDoc_STRVAR(doc_Doc_get,
static PyObject *
Doc_get(recoll_DocObject *self, PyObject *args)
{
LOGDEB1("Doc_get\n" );
LOGDEB1("Doc_get\n");
if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) {
PyErr_SetString(PyExc_AttributeError, "doc??");
return 0;
@ -567,7 +568,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
if (PyUnicode_Check(nameobj)) {
PyObject* utf8o = PyUnicode_AsUTF8String(nameobj);
if (utf8o == 0) {
LOGERR("Doc_getattro: encoding name to utf8 failed\n" );
LOGERR("Doc_getattro: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
@ -583,7 +584,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
string key = rclconfig->fieldQCanon(string(name));
string value;
if (idocget(self, key, value)) {
LOGDEB1("Doc_getattro: [" << key << "] -> [" << value << "]\n");
LOGDEB1("Doc_getattro: [" << key << "] -> [" << value << "]\n");
// Return a python unicode object
return PyUnicode_Decode(value.c_str(), value.size(), "utf-8","replace");
}
@ -621,7 +622,7 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value)
PyObject* putf8 = PyUnicode_AsUTF8String(value);
if (putf8 == 0) {
LOGERR("Doc_setmeta: encoding to utf8 failed\n" );
LOGERR("Doc_setmeta: encoding to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "value??");
return -1;
}
@ -709,7 +710,7 @@ Doc_subscript(recoll_DocObject *self, PyObject *key)
if (PyUnicode_Check(key)) {
PyObject* utf8o = PyUnicode_AsUTF8String(key);
if (utf8o == 0) {
LOGERR("Doc_getitemo: encoding name to utf8 failed\n" );
LOGERR("Doc_getitemo: encoding name to utf8 failed\n");
PyErr_SetString(PyExc_AttributeError, "name??");
Py_RETURN_NONE;
}
@ -830,6 +831,7 @@ typedef struct {
int ascending;
int arraysize; // Default size for fetchmany
recoll_DbObject* connection;
bool fetchtext;
} recoll_QueryObject;
PyDoc_STRVAR(doc_Query_close,
@ -838,7 +840,7 @@ PyDoc_STRVAR(doc_Query_close,
static PyObject *
Query_close(recoll_QueryObject *self)
{
LOGDEB("Query_close\n" );
LOGDEB("Query_close\n");
if (self->query) {
the_queries.erase(self->query);
deleteZ(self->query);
@ -854,7 +856,7 @@ Query_close(recoll_QueryObject *self)
static void
Query_dealloc(recoll_QueryObject *self)
{
LOGDEB("Query_dealloc\n" );
LOGDEB("Query_dealloc\n");
PyObject *ret = Query_close(self);
Py_DECREF(ret);
Py_TYPE(self)->tp_free((PyObject*)self);
@ -863,7 +865,7 @@ Query_dealloc(recoll_QueryObject *self)
static PyObject *
Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
LOGDEB("Query_new\n" );
LOGDEB("Query_new\n");
recoll_QueryObject *self;
self = (recoll_QueryObject *)type->tp_alloc(type, 0);
@ -876,6 +878,7 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
self->ascending = 1;
self->arraysize = 1;
self->connection = 0;
self->fetchtext = false;
return (PyObject *)self;
}
@ -885,7 +888,7 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
static int
Query_init(recoll_QueryObject *self, PyObject *, PyObject *)
{
LOGDEB("Query_init\n" );
LOGDEB("Query_init\n");
if (self->query)
the_queries.erase(self->query);
@ -913,7 +916,7 @@ PyDoc_STRVAR(doc_Query_sortby,
static PyObject *
Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Query_sortby\n" );
LOGDEB0("Query_sortby\n");
static const char *kwlist[] = {"field", "ascending", NULL};
char *sfield = 0;
PyObject *ascobj = 0;
@ -936,7 +939,8 @@ Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
}
PyDoc_STRVAR(doc_Query_execute,
"execute(query_string, stemming=1|0, stemlang=\"stemming language\")\n"
"execute(query_string, stemming=1|0, stemlang=\"stemming language\", "
"fetchtext=False)\n"
"\n"
"Starts a search for query_string, a Recoll search language string\n"
"(mostly Xesam-compatible).\n"
@ -947,20 +951,28 @@ PyDoc_STRVAR(doc_Query_execute,
static PyObject *
Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Query_execute\n" );
static const char *kwlist[] = {"query_string", "stemming", "stemlang", NULL};
LOGDEB0("Query_execute\n");
static const char *kwlist[] = {"query_string", "stemming", "stemlang",
"fetchtext", NULL};
char *sutf8 = 0; // needs freeing
char *sstemlang = 0;
int dostem = 1;
PyObject *dostemobj = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|Oes:Query_execute",
PyObject *fetchtextobj = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OesO:Query_execute",
(char**)kwlist, "utf-8", &sutf8,
&dostemobj,
"utf-8", &sstemlang)) {
"utf-8", &sstemlang, &fetchtextobj)) {
return 0;
}
bool dostem{true};
if (dostemobj != 0 && !PyObject_IsTrue(dostemobj))
dostem = 0;
dostem = false;
if (fetchtextobj != 0 && PyObject_IsTrue(fetchtextobj)) {
self->fetchtext = true;
} else {
self->fetchtext = false;
}
string utf8(sutf8);
PyMem_Free(sutf8);
@ -970,7 +982,8 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyMem_Free(sstemlang);
}
LOGDEB0("Query_execute: [" << (utf8) << "] dostem " << (dostem) << " stemlang [" << (stemlang) << "]\n" );
LOGDEB0("Query_execute: [" << utf8 << "] dostem " << dostem <<
" stemlang [" << stemlang << "]\n");
if (self->query == 0 ||
the_queries.find(self->query) == the_queries.end()) {
@ -999,7 +1012,7 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
}
PyDoc_STRVAR(doc_Query_executesd,
"executesd(SearchData)\n"
"executesd(SearchData, fetchtext=False)\n"
"\n"
"Starts a search for the query defined by the SearchData object.\n"
);
@ -1007,12 +1020,13 @@ PyDoc_STRVAR(doc_Query_executesd,
static PyObject *
Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Query_executeSD\n" );
static const char *kwlist[] = {"searchdata", NULL};
LOGDEB0("Query_executeSD\n");
static const char *kwlist[] = {"searchdata", "fetchtext", NULL};
recoll_SearchDataObject *pysd = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:Query_execute",
(char **)kwlist,
&recoll_SearchDataType, &pysd)) {
PyObject *fetchtextobj = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_execute",
(char **)kwlist, &recoll_SearchDataType,
&pysd, &fetchtextobj)) {
return 0;
}
if (pysd == 0 || self->query == 0 ||
@ -1020,6 +1034,11 @@ Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "query");
return 0;
}
if (fetchtextobj != 0 && PyObject_IsTrue(fetchtextobj)) {
self->fetchtext = true;
} else {
self->fetchtext = false;
}
self->query->setSortBy(*self->sortfield, self->ascending);
self->query->setQuery(pysd->sd);
int cnt = self->query->getResCnt();
@ -1049,7 +1068,7 @@ PyDoc_STRVAR(doc_Query_fetchone,
static PyObject *
Query_fetchone(PyObject *_self)
{
LOGDEB0("Query_fetchone/next\n" );
LOGDEB0("Query_fetchone/next\n");
recoll_QueryObject* self = (recoll_QueryObject*)_self;
if (self->query == 0 ||
@ -1072,7 +1091,7 @@ Query_fetchone(PyObject *_self)
// We used to check against rowcount here, but this was wrong:
// xapian result count estimate are sometimes wrong, we must go on
// fetching until we fail
if (!self->query->getDoc(self->next, *result->doc)) {
if (!self->query->getDoc(self->next, *result->doc, self->fetchtext)) {
PyErr_SetNone(PyExc_StopIteration);
return 0;
}
@ -1090,7 +1109,7 @@ PyDoc_STRVAR(doc_Query_fetchmany,
static PyObject *
Query_fetchmany(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Query_fetchmany\n" );
LOGDEB0("Query_fetchmany\n");
static const char *kwlist[] = {"size", NULL};
int size = 0;
@ -1120,7 +1139,7 @@ Query_fetchmany(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_EnvironmentError, "doc create failed");
return 0;
}
if (!self->query->getDoc(self->next, *docobj->doc)) {
if (!self->query->getDoc(self->next, *docobj->doc, self->fetchtext)) {
PyErr_SetNone(PyExc_StopIteration);
break;
}
@ -1140,7 +1159,7 @@ PyDoc_STRVAR(doc_Query_scroll,
static PyObject *
Query_scroll(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Query_scroll\n" );
LOGDEB0("Query_scroll\n");
static const char *kwlist[] = {"position", "mode", NULL};
int pos = 0;
char *smode = 0;
@ -1227,7 +1246,7 @@ public:
static PyObject *
Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Query_highlight\n" );
LOGDEB0("Query_highlight\n");
static const char *kwlist[] = {"text", "ishtml", "eolbr", "methods", NULL};
char *sutf8 = 0; // needs freeing
int ishtml = 0;
@ -1249,7 +1268,7 @@ Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
ishtml = 1;
if (eolbrobj && !PyObject_IsTrue(eolbrobj))
eolbr = 0;
LOGDEB0("Query_highlight: ishtml " << (ishtml) << "\n" );
LOGDEB0("Query_highlight: ishtml " << ishtml << "\n");
if (self->query == 0 ||
the_queries.find(self->query) == the_queries.end()) {
@ -1287,7 +1306,7 @@ PyDoc_STRVAR(doc_Query_makedocabstract,
static PyObject *
Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
{
LOGDEB0("Query_makeDocAbstract\n" );
LOGDEB0("Query_makeDocAbstract\n");
static const char *kwlist[] = {"doc", "methods", NULL};
recoll_DocObject *pydoc = 0;
PyObject *hlmethods = 0;
@ -1299,12 +1318,12 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs)
}
if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
LOGERR("Query_makeDocAbstract: doc not found " << (pydoc->doc) << "\n" );
LOGERR("Query_makeDocAbstract: doc not found " << pydoc->doc << "\n");
PyErr_SetString(PyExc_AttributeError, "doc");
return 0;
}
if (the_queries.find(self->query) == the_queries.end()) {
LOGERR("Query_makeDocAbstract: query not found " << (self->query) << "\n" );
LOGERR("Query_makeDocAbstract: query not found " << self->query << "\n");
PyErr_SetString(PyExc_AttributeError, "query");
return 0;
}
@ -1357,7 +1376,7 @@ PyDoc_STRVAR(doc_Query_getxquery,
static PyObject *
Query_getxquery(recoll_QueryObject* self, PyObject *, PyObject *)
{
LOGDEB0("Query_getxquery self->query " << (self->query) << "\n" );
LOGDEB0("Query_getxquery self->query " << self->query << "\n");
if (self->query == 0 ||
the_queries.find(self->query) == the_queries.end()) {
@ -1385,7 +1404,7 @@ PyDoc_STRVAR(doc_Query_getgroups,
static PyObject *
Query_getgroups(recoll_QueryObject* self, PyObject *, PyObject *)
{
LOGDEB0("Query_getgroups\n" );
LOGDEB0("Query_getgroups\n");
if (self->query == 0 ||
the_queries.find(self->query) == the_queries.end()) {
@ -1530,7 +1549,7 @@ typedef struct recoll_DbObject {
static PyObject *
Db_close(recoll_DbObject *self)
{
LOGDEB("Db_close. self " << (self) << "\n" );
LOGDEB("Db_close. self " << self << "\n");
if (self->db) {
the_dbs.erase(self->db);
delete self->db;
@ -1542,7 +1561,7 @@ Db_close(recoll_DbObject *self)
static void
Db_dealloc(recoll_DbObject *self)
{
LOGDEB("Db_dealloc\n" );
LOGDEB("Db_dealloc\n");
PyObject *ret = Db_close(self);
Py_DECREF(ret);
Py_TYPE(self)->tp_free((PyObject*)self);
@ -1551,7 +1570,7 @@ Db_dealloc(recoll_DbObject *self)
static PyObject *
Db_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
LOGDEB2("Db_new\n" );
LOGDEB2("Db_new\n");
recoll_DbObject *self;
self = (recoll_DbObject *)type->tp_alloc(type, 0);
@ -1583,7 +1602,7 @@ Db_init(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
} else {
rclconfig = recollinit(0, 0, reason, 0);
}
LOGDEB("Db_init\n" );
LOGDEB("Db_init\n");
if (rclconfig == 0) {
PyErr_SetString(PyExc_EnvironmentError, reason.c_str());
@ -1599,7 +1618,7 @@ Db_init(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
delete self->db;
self->db = new Rcl::Db(rclconfig);
if (!self->db->open(writable ? Rcl::Db::DbUpd : Rcl::Db::DbRO)) {
LOGERR("Db_init: db open error\n" );
LOGERR("Db_init: db open error\n");
PyErr_SetString(PyExc_EnvironmentError, "Can't open index");
return -1;
}
@ -1642,9 +1661,9 @@ Db_init(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
static PyObject *
Db_query(recoll_DbObject* self)
{
LOGDEB("Db_query\n" );
LOGDEB("Db_query\n");
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_query: db not found " << (self->db) << "\n" );
LOGERR("Db_query: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
return 0;
}
@ -1663,18 +1682,19 @@ Db_query(recoll_DbObject* self)
static PyObject *
Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Db_setAbstractParams\n" );
LOGDEB0("Db_setAbstractParams\n");
static const char *kwlist[] = {"maxchars", "contextwords", NULL};
int ctxwords = -1, maxchars = -1;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", (char**)kwlist,
&maxchars, &ctxwords))
return 0;
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_query: db not found " << (self->db) << "\n" );
LOGERR("Db_query: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db id not found");
return 0;
}
LOGDEB0("Db_setAbstractParams: mxchrs " << (maxchars) << ", ctxwrds " << (ctxwords) << "\n" );
LOGDEB0("Db_setAbstractParams: mxchrs " << maxchars << ", ctxwrds " <<
ctxwords << "\n");
self->db->setAbstractParams(-1, maxchars, ctxwords);
Py_RETURN_NONE;
}
@ -1682,7 +1702,7 @@ Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs)
static PyObject *
Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
{
LOGDEB0("Db_makeDocAbstract\n" );
LOGDEB0("Db_makeDocAbstract\n");
recoll_DocObject *pydoc = 0;
recoll_QueryObject *pyquery = 0;
if (!PyArg_ParseTuple(args, "O!O!:Db_makeDocAbstract",
@ -1691,18 +1711,18 @@ Db_makeDocAbstract(recoll_DbObject* self, PyObject *args)
return 0;
}
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_makeDocAbstract: db not found " << (self->db) << "\n" );
LOGERR("Db_makeDocAbstract: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
return 0;
}
if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
LOGERR("Db_makeDocAbstract: doc not found " << (pydoc->doc) << "\n" );
LOGERR("Db_makeDocAbstract: doc not found " << pydoc->doc << "\n");
PyErr_SetString(PyExc_AttributeError, "doc");
return 0;
}
if (pyquery->query == 0 ||
the_queries.find(pyquery->query) == the_queries.end()) {
LOGERR("Db_makeDocAbstract: query not found " << (pyquery->query) << "\n" );
LOGERR("Db_makeDocAbstract: query not found " << pyquery->query << "\n");
PyErr_SetString(PyExc_AttributeError, "query");
return 0;
}
@ -1727,7 +1747,7 @@ PyDoc_STRVAR(doc_Db_termMatch,
static PyObject *
Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs)
{
LOGDEB0("Db_termMatch\n" );
LOGDEB0("Db_termMatch\n");
static const char *kwlist[] = {"type", "expr", "field", "maxlen",
"casesens", "diacsens", "lang", NULL};
char *tp = 0;
@ -1750,7 +1770,7 @@ Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs)
return 0;
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_termMatch: db not found " << (self->db) << "\n" );
LOGERR("Db_termMatch: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
goto out;
}
@ -1775,7 +1795,7 @@ Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs)
if (!self->db->termMatch(typ_sens, lang ? lang : "english",
expr, result, maxlen, field ? field : "")) {
LOGERR("Db_termMatch: db termMatch error\n" );
LOGERR("Db_termMatch: db termMatch error\n");
PyErr_SetString(PyExc_AttributeError, "rcldb termMatch error");
goto out;
}
@ -1796,7 +1816,7 @@ out:
static PyObject *
Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds)
{
LOGDEB0("Db_needUpdate\n" );
LOGDEB0("Db_needUpdate\n");
char *udi = 0; // needs freeing
char *sig = 0; // needs freeing
if (!PyArg_ParseTuple(args, "eses:Db_needUpdate",
@ -1804,7 +1824,7 @@ Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds)
return 0;
}
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_needUpdate: db not found " << (self->db) << "\n" );
LOGERR("Db_needUpdate: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
PyMem_Free(udi);
PyMem_Free(sig);
@ -1819,13 +1839,13 @@ Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds)
static PyObject *
Db_delete(recoll_DbObject* self, PyObject *args, PyObject *kwds)
{
LOGDEB0("Db_delete\n" );
LOGDEB0("Db_delete\n");
char *udi = 0; // needs freeing
if (!PyArg_ParseTuple(args, "es:Db_delete", "utf-8", &udi)) {
return 0;
}
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_delete: db not found " << (self->db) << "\n" );
LOGERR("Db_delete: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
PyMem_Free(udi);
return 0;
@ -1838,9 +1858,9 @@ Db_delete(recoll_DbObject* self, PyObject *args, PyObject *kwds)
static PyObject *
Db_purge(recoll_DbObject* self)
{
LOGDEB0("Db_purge\n" );
LOGDEB0("Db_purge\n");
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_purge: db not found " << (self->db) << "\n" );
LOGERR("Db_purge: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
return 0;
}
@ -1851,7 +1871,7 @@ Db_purge(recoll_DbObject* self)
static PyObject *
Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *)
{
LOGDEB0("Db_addOrUpdate\n" );
LOGDEB0("Db_addOrUpdate\n");
char *sudi = 0; // needs freeing
char *sparent_udi = 0; // needs freeing
recoll_DocObject *pydoc;
@ -1867,17 +1887,17 @@ Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *)
PyMem_Free(sparent_udi);
if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) {
LOGERR("Db_addOrUpdate: db not found " << (self->db) << "\n" );
LOGERR("Db_addOrUpdate: db not found " << self->db << "\n");
PyErr_SetString(PyExc_AttributeError, "db");
return 0;
}
if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) {
LOGERR("Db_addOrUpdate: doc not found " << (pydoc->doc) << "\n" );
LOGERR("Db_addOrUpdate: doc not found " << pydoc->doc << "\n");
PyErr_SetString(PyExc_AttributeError, "doc");
return 0;
}
if (!self->db->addOrUpdate(udi, parent_udi, *pydoc->doc)) {
LOGERR("Db_addOrUpdate: rcldb error\n" );
LOGERR("Db_addOrUpdate: rcldb error\n");
PyErr_SetString(PyExc_AttributeError, "rcldb error");
return 0;
}
@ -1992,7 +2012,7 @@ static PyTypeObject recoll_DbType = {
static PyObject *
recoll_connect(PyObject *self, PyObject *args, PyObject *kwargs)
{
LOGDEB2("recoll_connect\n" );
LOGDEB2("recoll_connect\n");
recoll_DbObject *db = (recoll_DbObject *)
PyObject_Call((PyObject *)&recoll_DbType, args, kwargs);
return (PyObject *)db;

View File

@ -329,41 +329,10 @@ int Query::Native::abstractFromText(
)
{
Xapian::Database& xrdb(ndb->xrdb);
Xapian::Document xdoc;
string reason;
XAPTRY(xdoc = xrdb.get_document(docid), xrdb, reason);
if (!reason.empty()) {
LOGERR("abstractFromText: could not get doc: " << reason << endl);
return ABSRES_ERROR;
}
string rawtext, data;
#ifdef RAWTEXT_IN_DATA
XAPTRY(data = xdoc.get_data(), xrdb, reason);
if (!reason.empty()) {
LOGERR("abstractFromText: could not get data: " << reason << endl);
return ABSRES_ERROR;
}
Doc doc;
if (ndb->dbDataToRclDoc(docid, data, doc)) {
rawtext = doc.meta["RAWTEXT"];
}
#endif
#ifdef RAWTEXT_IN_METADATA
XAPTRY(rawtext = ndb->xrdb.get_metadata(ndb->rawtextMetaKey(docid)),
ndb->xrdb, reason);
if (!reason.empty()) {
LOGERR("abstractFromText: could not get value: " << reason << endl);
return ABSRES_ERROR;
}
ZLibUtBuf cbuf;
inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf);
rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
#endif
if (rawtext.empty()) {
LOGDEB0("abstractFromText: no text\n");
string rawtext;
if (!ndb->getRawText(docid, rawtext)) {
LOGDEB0("abstractFromText: can't fetch text\n");
return ABSRES_ERROR;
}

View File

@ -202,12 +202,8 @@ void *DbUpdWorker(void* vdbp)
switch (tsk->op) {
case DbUpdTask::AddOrUpdate:
LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm,
tsk->doc, tsk->txtlen
#ifdef RAWTEXT_IN_METADATA
, tsk->rawztext
#endif
);
status = ndbp->addOrUpdateWrite(
tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext);
break;
case DbUpdTask::Delete:
LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
@ -267,9 +263,12 @@ void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
// to force using Chert. No sense in doing this if we are
// storing the text anyway.
#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
// New Xapian with Chert support. Use Chert and the old
// abstract generation method, except if told otherwise by the
// configuration.
// Xapian with Glass and Chert support. If storedoctext is
// specified in the configuration, use the default backend
// (Glass), else force Chert. There might be reasons why
// someone would want to use Chert and store text anyway, but
// it's an exotic case, and things are complicated enough
// already.
if (o_index_storedoctext) {
xwdb = Xapian::WritableDatabase(dir, action);
m_storetext = true;
@ -286,15 +285,13 @@ void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
xwdb = Xapian::WritableDatabase(stub, action);
m_storetext = false;
}
#elif ! XAPIAN_AT_LEAST(1,3,0)
// Old Xapian. Use the default index format and let the user
// decide of the abstract generation method.
#elif (! XAPIAN_AT_LEAST(1,3,0)) || XAPIAN_AT_LEAST(1,5,0)
// Old Xapian (chert only) or newer (no chert). Use the
// default index backend and let the user decide of the
// abstract generation method. The configured default is to
// store the text.
xwdb = Xapian::WritableDatabase(dir, action);
m_storetext = o_index_storedoctext;
#else
// Newer Xapian with no Chert support. Store the text.
xwdb = Xapian::WritableDatabase(dir, action);
m_storetext = true;
#endif
// Set the storetext value inside the index descriptor (new
// with recoll 1.24, maybe we'll have other stuff to store in
@ -533,7 +530,7 @@ Xapian::docid Db::Native::getDoc(const string& udi, int idxi,
// Turn data record from db into document fields
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
Doc &doc)
Doc &doc, bool fetchtext)
{
LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
ConfSimple parms(data);
@ -593,6 +590,9 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
}
doc.meta[Doc::keyurl] = doc.url;
doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
if (fetchtext) {
getRawText(docid, doc.text);
}
return true;
}
@ -672,16 +672,33 @@ int Db::Native::getPageNumberForPosition(const vector<int>& pbreaks, int pos)
return int(it - pbreaks.begin() + 1);
}
// Retrieve the stored (raw, uncompressed) document text for docid.
// The text is kept zlib-compressed in a per-document Xapian metadata
// entry (keyed by rawtextMetaKey(docid)); we fetch it and inflate it
// into rawtext. Returns false if text storage is disabled for this
// index or if the Xapian fetch fails; an absent/empty entry is not an
// error (rawtext is left empty and we return true).
bool Db::Native::getRawText(Xapian::docid docid, string& rawtext)
{
// This index was built without storedoctext: nothing to fetch.
if (!m_storetext) {
LOGDEB("Db::Native::getRawText: document text not stored in index\n");
return false;
}
string reason;
// XAPTRY wraps the Xapian call, catching exceptions and leaving the
// explanation in 'reason'.
XAPTRY(rawtext = xrdb.get_metadata(rawtextMetaKey(docid)), xrdb, reason);
if (!reason.empty()) {
LOGERR("Rcl::Db::getRawText: could not get value: " << reason << endl);
return false;
}
// No stored text for this doc: success, empty result.
if (rawtext.empty()) {
return true;
}
// Decompress in place: the metadata entry holds deflated text.
ZLibUtBuf cbuf;
inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf);
rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
return true;
}
// Note: we're passed a Xapian::Document* because Xapian
// reference-counting is not mt-safe. We take ownership and need
// to delete it before returning.
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document *newdocument_ptr,
size_t textlen
#ifdef RAWTEXT_IN_METADATA
, const string& rawztext
#endif
)
bool Db::Native::addOrUpdateWrite(
const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr,
size_t textlen, const string& rawztext)
{
#ifdef IDX_THREADS
Chrono chron;
@ -738,7 +755,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
}
}
#ifdef RAWTEXT_IN_METADATA
XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
xwdb, m_rcldb->m_reason);
if (!m_rcldb->m_reason.empty()) {
@ -746,7 +762,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
m_rcldb->m_reason << "\n");
// This only affects snippets, so let's say not fatal
}
#endif
// Test if we're over the flush threshold (limit memory usage):
bool ret = m_rcldb->maybeflush(textlen);
@ -1436,9 +1451,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
// Udi unique term: this is used for file existence/uptodate
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(udi);
#if defined(RAWTEXT_IN_METADATA)
string rawztext; // Doc compressed text
#endif
string rawztext; // Doc compressed text
if (doc.onlyxattr) {
// Only updating an existing doc with new extended attributes
@ -1553,13 +1566,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
if (!splitter.text_to_words(doc.text)) {
LOGDEB("Db::addOrUpdate: split failed for main text\n");
} else {
#if defined(RAWTEXT_IN_METADATA)
if (m_ndb->m_storetext) {
ZLibUtBuf buf;
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
rawztext.assign(buf.getBuf(), buf.getCnt());
}
#endif
}
#ifdef TEXTSPLIT_STATS
@ -1771,23 +1782,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
}
#ifdef RAWTEXT_IN_DATA
if (m_ndb->m_storetext) {
RECORD_APPEND(record, string("RAWTEXT"),
neutchars(doc.text, cstr_nc));
}
#endif
LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
newdocument.set_data(record);
}
#ifdef IDX_THREADS
if (m_ndb->m_havewriteq) {
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
newdocument_ptr, doc.text.length()
#ifdef RAWTEXT_IN_METADATA
, rawztext
#endif
);
DbUpdTask *tp = new DbUpdTask(
DbUpdTask::AddOrUpdate, udi, uniterm, newdocument_ptr,
doc.text.length(), rawztext);
if (!m_ndb->m_wqueue.put(tp)) {
LOGERR("Db::addOrUpdate:Cant queue task\n");
delete newdocument_ptr;
@ -1799,11 +1801,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
#endif
return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
doc.text.length()
#ifdef RAWTEXT_IN_METADATA
, rawztext
#endif
);
doc.text.length(), rawztext);
}
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
@ -2230,11 +2228,7 @@ bool Db::purgeFile(const string &udi, bool *existed)
if (m_ndb->m_havewriteq) {
string rztxt;
DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
0, (size_t)-1,
#if defined(RAWTEXT_IN_METADATA)
rztxt
#endif
);
0, (size_t)-1, rztxt);
if (!m_ndb->m_wqueue.put(tp)) {
LOGERR("Db::purgeFile:Cant queue task\n");
return false;
@ -2262,11 +2256,7 @@ bool Db::purgeOrphans(const string &udi)
if (m_ndb->m_havewriteq) {
string rztxt;
DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
0, (size_t)-1,
#ifdef RAWTEXT_IN_METADATA
rztxt
#endif
);
0, (size_t)-1, rztxt);
if (!m_ndb->m_wqueue.put(tp)) {
LOGERR("Db::purgeFile:Cant queue task\n");
return false;

View File

@ -63,8 +63,8 @@ namespace Rcl {
// is incompatible anyway.
enum value_slot {
// Omega-compatible values:
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
VALUE_SIZE = 2, // sortable_serialise(<file size in bytes>)
////////// Recoll only:
@ -80,24 +80,16 @@ class Query;
class TermMatchEntry {
public:
TermMatchEntry()
: wcf(0)
{
}
: wcf(0) {}
TermMatchEntry(const string& t, int f, int d)
: term(t), wcf(f), docs(d)
{
}
: term(t), wcf(f), docs(d) {}
TermMatchEntry(const string& t)
: term(t), wcf(0)
{
: term(t), wcf(0) {}
bool operator==(const TermMatchEntry &o) const {
return term == o.term;
}
bool operator==(const TermMatchEntry &o) const
{
return term == o.term;
}
bool operator<(const TermMatchEntry &o) const
{
return term < o.term;
bool operator<(const TermMatchEntry &o) const {
return term < o.term;
}
string term;
@ -108,13 +100,11 @@ public:
/** Term match result list header: statistics and global info */
class TermMatchResult {
public:
TermMatchResult()
{
clear();
TermMatchResult() {
clear();
}
void clear()
{
entries.clear();
void clear() {
entries.clear();
}
// Term expansion
vector<TermMatchEntry> entries;
@ -125,7 +115,7 @@ public:
class DbStats {
public:
DbStats()
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) { }
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {}
// Index-wide stats
unsigned int dbdoccount;
double dbavgdoclen;
@ -137,27 +127,27 @@ public:
inline bool has_prefix(const string& trm)
{
if (o_index_stripchars) {
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
} else {
return !trm.empty() && trm[0] == ':';
return !trm.empty() && trm[0] == ':';
}
}
inline string strip_prefix(const string& trm)
{
if (trm.empty())
return trm;
return trm;
string::size_type st = 0;
if (o_index_stripchars) {
st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
if (st == string::npos)
return string();
st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
if (st == string::npos)
return string();
} else {
if (has_prefix(trm)) {
st = trm.find_last_of(":") + 1;
} else {
return trm;
}
if (has_prefix(trm)) {
st = trm.find_last_of(":") + 1;
} else {
return trm;
}
}
return trm.substr(st);
}
@ -165,9 +155,9 @@ inline string strip_prefix(const string& trm)
inline string wrap_prefix(const string& pfx)
{
if (o_index_stripchars) {
return pfx;
return pfx;
} else {
return cstr_colon + pfx + cstr_colon;
return cstr_colon + pfx + cstr_colon;
}
}
@ -175,7 +165,7 @@ inline string wrap_prefix(const string& pfx)
* Wrapper class for the native database.
*/
class Db {
public:
public:
// A place for things we don't want visible here.
class Native;
friend class Native;
@ -203,13 +193,11 @@ class Db {
* special chars...
* @param with_aspell test for use with aspell, else for xapian speller
*/
static bool isSpellingCandidate(const string& term, bool with_aspell=true)
{
if (term.empty() || term.length() > 50)
return false;
if (has_prefix(term))
return false;
Utf8Iter u8i(term);
static bool isSpellingCandidate(const string& term, bool with_aspell=true) {
if (term.empty() || term.length() > 50 || has_prefix(term))
return false;
Utf8Iter u8i(term);
if (with_aspell) {
// If spelling with aspell, neither katakana nor other cjk
// scripts are candidates
@ -232,10 +220,10 @@ class Db {
return false;
#endif
}
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
!= string::npos)
return false;
return true;
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
!= string::npos)
return false;
return true;
}
/** Return spelling suggestion */
@ -283,7 +271,7 @@ class Db {
void setExistingFlags(const string& udi, unsigned int docid);
/** Indicate if we are doing a systematic reindex. This complements
needUpdate() return */
needUpdate() return */
bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
/** Add or update document identified by unique identifier.
@ -305,8 +293,8 @@ class Db {
* much as possible depending on the document type.
* ** doc will be modified in a destructive way **
*/
bool addOrUpdate(const string &udi,
const string &parent_udi, Doc &doc);
bool addOrUpdate(const string &udi, const string &parent_udi, Doc &doc);
#ifdef IDX_THREADS
void waitUpdIdle();
#endif
@ -314,8 +302,8 @@ class Db {
/** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &udi, bool *existed = 0);
/** Delete subdocs with an out of date sig. We do this to purge
obsolete subdocs during a partial update where no general purge
will be done */
obsolete subdocs during a partial update where no general purge
will be done */
bool purgeOrphans(const string &udi);
/** Remove documents that no longer exist in the file system. This
@ -377,20 +365,19 @@ class Db {
* in the TermMatchResult header
*/
enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64};
int matchTypeTp(int tp)
{
return tp & 7;
ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64};
int matchTypeTp(int tp) {
return tp & 7;
}
bool termMatch(int typ_sens, const string &lang, const string &term,
TermMatchResult& result, int max = -1,
const string& field = "", vector<string> *multiwords = 0);
TermMatchResult& result, int max = -1,
const string& field = "", vector<string> *multiwords = 0);
bool dbStats(DbStats& stats, bool listFailed);
/** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear);
/** Return all mime types in index. This can be different from the
ones defined in the config because of 'file' command
usage. Inserts the types at the end of the parameter */
ones defined in the config because of 'file' command
usage. Inserts the types at the end of the parameter */
bool getAllDbMimeTypes(std::vector<std::string>&);
/** Wildcard expansion specific to file names. Internal/sdata use only */
@ -398,13 +385,11 @@ class Db {
/** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
int getAbsCtxLen() const
{
return m_synthAbsWordCtxLen;
int getAbsCtxLen() const {
return m_synthAbsWordCtxLen;
}
int getAbsLen() const
{
return m_synthAbsLen;
int getAbsLen() const {
return m_synthAbsLen;
}
/** Get document for given udi
*
@ -453,28 +438,26 @@ class Db {
bool termExists(const string& term);
/** Test if terms stem to different roots. */
bool stemDiffers(const string& lang, const string& term,
const string& base);
const string& base);
const RclConfig *getConf() {return m_config;}
/**
Activate the "in place reset" mode where all documents are
considered as needing update. This is a global/per-process
option, and can't be reset. It should be set at the start of
the indexing pass. 2012-10: no idea why this is done this way...
Activate the "in place reset" mode where all documents are
considered as needing update. This is a global/per-process
option, and can't be reset. It should be set at the start of
the indexing pass. 2012-10: no idea why this is done this way...
*/
static void setInPlaceReset() {o_inPlaceReset = true;}
/** Flush interval get/set. This is used by the first indexing
pass to override the config value and flush more rapidly
initially so that the user can quickly play with queries */
int getFlushMb()
{
return m_flushMb;
pass to override the config value and flush more rapidly
initially so that the user can quickly play with queries */
int getFlushMb() {
return m_flushMb;
}
void setFlushMb(int mb)
{
m_flushMb = mb;
void setFlushMb(int mb) {
m_flushMb = mb;
}
bool doFlush();
@ -556,8 +539,8 @@ private:
// Reinitialize when adding/removing additional dbs
bool adjustdbs();
bool idxTermMatch(int typ_sens, const string &lang, const string &term,
TermMatchResult& result, int max = -1,
const string& field = cstr_null);
TermMatchResult& result, int max = -1,
const string& field = cstr_null);
// Flush when idxflushmb is reached
bool maybeflush(int64_t moretext);

View File

@ -30,9 +30,6 @@
#endif // IDX_THREADS
#include "xmacros.h"
// Store raw doc text in data record or metadata ?
#undef RAWTEXT_IN_DATA
#define RAWTEXT_IN_METADATA
namespace Rcl {
@ -55,15 +52,10 @@ public:
// available on the caller site.
// Take some care to avoid sharing string data (if string impl is cow)
DbUpdTask(Op _op, const string& ud, const string& un,
Xapian::Document *d, size_t tl
#ifdef RAWTEXT_IN_METADATA
, string& rztxt
#endif
Xapian::Document *d, size_t tl, string& rztxt
) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()),
doc(d), txtlen(tl) {
#ifdef RAWTEXT_IN_METADATA
rawztext.swap(rztxt);
#endif
}
// Udi and uniterm equivalently designate the doc
Op op;
@ -74,9 +66,7 @@ public:
// purge because we actually don't know it, and the code fakes a
// text length based on the term count.
size_t txtlen;
#ifdef RAWTEXT_IN_METADATA
string rawztext; // Compressed doc text
#endif
};
#endif // IDX_THREADS
@ -119,10 +109,7 @@ class Db::Native {
// Final steps of doc update, part which need to be single-threaded
bool addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document *doc, size_t txtlen
#ifdef RAWTEXT_IN_METADATA
, const string& rawztext
#endif
);
, const string& rawztext);
/** Delete all documents which are contained in the input document,
* which must be a file-level one.
@ -141,7 +128,8 @@ class Db::Native {
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
int getPageNumberForPosition(const vector<int>& pbreaks, int pos);
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc,
bool fetchtext = false);
size_t whatDbIdx(Xapian::docid id);
@ -193,7 +181,6 @@ class Db::Native {
/** Check if a page position list is defined */
bool hasPages(Xapian::docid id);
#ifdef RAWTEXT_IN_METADATA
std::string rawtextMetaKey(Xapian::docid did) {
// Xapian's Olly Betts avises to use a key which will
// sort the same as the docid (which we do), and to
@ -205,10 +192,10 @@ class Db::Native {
sprintf(buf, "%010d", did);
return buf;
}
#endif
bool getRawText(Xapian::docid docid, string& rawtext);
void deleteDocument(Xapian::docid docid) {
#ifdef RAWTEXT_IN_METADATA
string metareason;
XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), string()),
xwdb, metareason);
@ -217,7 +204,6 @@ class Db::Native {
metareason << "\n");
// not fatal
}
#endif
xwdb.delete_document(docid);
}
};

View File

@ -64,14 +64,14 @@ static const string& docfToDatf(const string& df)
// custom field data will have to be processed before insertion to
// achieve equivalent results.
#if XAPIAN_MAJOR_VERSION == 1 && XAPIAN_MINOR_VERSION < 2
class QSorter : public Xapian::Sorter {
class QSorter : public Xapian::Sorter
#else
class QSorter : public Xapian::KeyMaker {
class QSorter : public Xapian::KeyMaker
#endif
{
public:
QSorter(const string& f)
: m_fld(docfToDatf(f) + "=")
{
: m_fld(docfToDatf(f) + "=") {
m_ismtime = !m_fld.compare("dmtime=");
if (m_ismtime)
m_issize = false;
@ -80,8 +80,7 @@ public:
!m_fld.compare("pcbytes=");
}
virtual std::string operator()(const Xapian::Document& xdoc) const
{
virtual std::string operator()(const Xapian::Document& xdoc) const {
string data = xdoc.get_data();
// It would be simpler to do the record->Rcl::Doc thing, but
// hand-doing this will be faster. It makes more assumptions
@ -372,7 +371,7 @@ int Query::getResCnt()
// Note that as stated by a Xapian developer, Enquire searches from
// scratch each time get_mset() is called. So the better performance
// on subsequent calls is probably only due to disk caching.
bool Query::getDoc(int xapi, Doc &doc)
bool Query::getDoc(int xapi, Doc &doc, bool fetchtext)
{
LOGDEB1("Query::getDoc: xapian enquire index " << xapi << "\n");
if (ISNULL(m_nq) || !m_nq->xenquire) {
@ -451,7 +450,7 @@ bool Query::getDoc(int xapi, Doc &doc)
}
// Parse xapian document's data and populate doc fields
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc);
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, fetchtext);
}
vector<string> Query::expand(const Doc &doc)

View File

@ -66,25 +66,21 @@ class Query {
~Query();
/** Get explanation about last error */
std::string getReason() const
{
std::string getReason() const {
return m_reason;
}
/** Choose sort order. Must be called before setQuery */
void setSortBy(const std::string& fld, bool ascending = true);
const std::string& getSortBy() const
{
const std::string& getSortBy() const {
return m_sortField;
}
bool getSortAscending() const
{
bool getSortAscending() const {
return m_sortAscending;
}
/** Return or filter results with identical content checksum */
void setCollapseDuplicates(bool on)
{
void setCollapseDuplicates(bool on) {
m_collapseDuplicates = on;
}
@ -98,7 +94,7 @@ class Query {
int getResCnt();
/** Get document at rank i in current query results. */
bool getDoc(int i, Doc &doc);
bool getDoc(int i, Doc &doc, bool fetchtext = false);
/** Get possibly expanded list of query terms */
bool getQueryTerms(std::vector<std::string>& terms);
@ -117,8 +113,7 @@ class Query {
int getFirstMatchPage(const Doc &doc, std::string& term);
/** Retrieve a reference to the searchData we are using */
std::shared_ptr<SearchData> getSD()
{
std::shared_ptr<SearchData> getSD() {
return m_sd;
}
@ -126,8 +121,7 @@ class Query {
std::vector<std::string> expand(const Doc &doc);
/** Return the Db we're set for */
Db *whatDb() const
{
Db *whatDb() const {
return m_db;
}