diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index 0b8a1716..ff7404d2 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -63,7 +63,8 @@ typedef struct { static void SearchData_dealloc(recoll_SearchDataObject *self) { - LOGDEB("SearchData_dealloc. Releasing. Count before: " << (self->sd.use_count()) << "\n" ); + LOGDEB("SearchData_dealloc. Releasing. Count before: " << + self->sd.use_count() << "\n"); self->sd.reset(); Py_TYPE(self)->tp_free((PyObject*)self); } @@ -71,7 +72,7 @@ SearchData_dealloc(recoll_SearchDataObject *self) static PyObject * SearchData_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - LOGDEB("SearchData_new\n" ); + LOGDEB("SearchData_new\n"); recoll_SearchDataObject *self; self = (recoll_SearchDataObject *)type->tp_alloc(type, 0); @@ -90,7 +91,7 @@ PyDoc_STRVAR(doc_SearchDataObject, static int SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs) { - LOGDEB("SearchData_init\n" ); + LOGDEB("SearchData_init\n"); static const char* kwlist[] = {"type", "stemlang", NULL}; char *stp = 0; char *steml = 0; @@ -180,9 +181,9 @@ static PyObject * SearchData_addclause(recoll_SearchDataObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("SearchData_addclause\n" ); + LOGDEB0("SearchData_addclause\n"); if (!self->sd) { - LOGERR("SearchData_addclause: not init??\n" ); + LOGERR("SearchData_addclause: not init??\n"); PyErr_SetString(PyExc_AttributeError, "sd"); return 0; } @@ -294,7 +295,7 @@ SearchData_addclause(recoll_SearchDataObject* self, PyObject *args, static void Doc_dealloc(recoll_DocObject *self) { - LOGDEB("Doc_dealloc\n" ); + LOGDEB("Doc_dealloc\n"); if (self->doc) the_docs.erase(self->doc); deleteZ(self->doc); @@ -304,7 +305,7 @@ Doc_dealloc(recoll_DocObject *self) static PyObject * Doc_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - LOGDEB("Doc_new\n" ); + LOGDEB("Doc_new\n"); recoll_DocObject *self; self = (recoll_DocObject *)type->tp_alloc(type, 0); @@ -318,7 +319,7 @@ Doc_new(PyTypeObject *type, PyObject *args, PyObject *kwds) static int Doc_init(recoll_DocObject *self, PyObject *, PyObject *) { - LOGDEB("Doc_init\n" ); + LOGDEB("Doc_init\n"); if (self->doc) the_docs.erase(self->doc); delete self->doc; @@ -340,7 +341,7 @@ PyDoc_STRVAR(doc_Doc_getbinurl, static PyObject * Doc_getbinurl(recoll_DocObject *self) { - LOGDEB0("Doc_getbinurl\n" ); + LOGDEB0("Doc_getbinurl\n"); if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { PyErr_SetString(PyExc_AttributeError, "doc"); @@ -359,7 +360,7 @@ PyDoc_STRVAR(doc_Doc_setbinurl, static PyObject * Doc_setbinurl(recoll_DocObject *self, PyObject *value) { - LOGDEB0("Doc_setbinurl\n" ); + LOGDEB0("Doc_setbinurl\n"); if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { PyErr_SetString(PyExc_AttributeError, "doc??"); @@ -381,7 +382,7 @@ PyDoc_STRVAR(doc_Doc_keys, static PyObject * Doc_keys(recoll_DocObject *self) { - LOGDEB0("Doc_keys\n" ); + LOGDEB0("Doc_keys\n"); if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { PyErr_SetString(PyExc_AttributeError, "doc"); @@ -406,7 +407,7 @@ PyDoc_STRVAR(doc_Doc_items, static PyObject * Doc_items(recoll_DocObject *self) { - LOGDEB0("Doc_items\n" ); + LOGDEB0("Doc_items\n"); if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { PyErr_SetString(PyExc_AttributeError, "doc"); @@ -516,7 +517,7 @@ PyDoc_STRVAR(doc_Doc_get, static PyObject * Doc_get(recoll_DocObject *self, PyObject *args) { - LOGDEB1("Doc_get\n" ); + LOGDEB1("Doc_get\n"); if (self->doc == 0 || the_docs.find(self->doc) == the_docs.end()) { PyErr_SetString(PyExc_AttributeError, "doc??"); return 0; @@ -567,7 +568,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj) if (PyUnicode_Check(nameobj)) { PyObject* utf8o = PyUnicode_AsUTF8String(nameobj); if (utf8o == 0) { - LOGERR("Doc_getattro: encoding name to utf8 failed\n" ); + LOGERR("Doc_getattro: encoding name to utf8 failed\n"); PyErr_SetString(PyExc_AttributeError, "name??"); Py_RETURN_NONE; } @@ -583,7 +584,7 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj) string key = rclconfig->fieldQCanon(string(name)); string value; if (idocget(self, key, value)) { - LOGDEB1("Doc_getattro: [" << key << "] -> [" << value << "]\n"); + LOGDEB1("Doc_getattro: [" << key << "] -> [" << value << "]\n"); // Return a python unicode object return PyUnicode_Decode(value.c_str(), value.size(), "utf-8","replace"); } @@ -621,7 +622,7 @@ Doc_setattr(recoll_DocObject *self, char *name, PyObject *value) PyObject* putf8 = PyUnicode_AsUTF8String(value); if (putf8 == 0) { - LOGERR("Doc_setmeta: encoding to utf8 failed\n" ); + LOGERR("Doc_setmeta: encoding to utf8 failed\n"); PyErr_SetString(PyExc_AttributeError, "value??"); return -1; } @@ -709,7 +710,7 @@ Doc_subscript(recoll_DocObject *self, PyObject *key) if (PyUnicode_Check(key)) { PyObject* utf8o = PyUnicode_AsUTF8String(key); if (utf8o == 0) { - LOGERR("Doc_getitemo: encoding name to utf8 failed\n" ); + LOGERR("Doc_getitemo: encoding name to utf8 failed\n"); PyErr_SetString(PyExc_AttributeError, "name??"); Py_RETURN_NONE; } @@ -830,6 +831,7 @@ typedef struct { int ascending; int arraysize; // Default size for fetchmany recoll_DbObject* connection; + bool fetchtext; } recoll_QueryObject; PyDoc_STRVAR(doc_Query_close, @@ -838,7 +840,7 @@ PyDoc_STRVAR(doc_Query_close, static PyObject * Query_close(recoll_QueryObject *self) { - LOGDEB("Query_close\n" ); + LOGDEB("Query_close\n"); if (self->query) { the_queries.erase(self->query); deleteZ(self->query); @@ -854,7 +856,7 @@ Query_close(recoll_QueryObject *self) static void Query_dealloc(recoll_QueryObject *self) { - LOGDEB("Query_dealloc\n" ); + LOGDEB("Query_dealloc\n"); PyObject *ret = Query_close(self); Py_DECREF(ret); Py_TYPE(self)->tp_free((PyObject*)self); @@ -863,7 +865,7 @@ Query_dealloc(recoll_QueryObject *self) static PyObject * Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) { - LOGDEB("Query_new\n" ); + LOGDEB("Query_new\n"); recoll_QueryObject *self; self = (recoll_QueryObject *)type->tp_alloc(type, 0); @@ -876,6 +878,7 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->ascending = 1; self->arraysize = 1; self->connection = 0; + self->fetchtext = false; return (PyObject *)self; } @@ -885,7 +888,7 @@ Query_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) static int Query_init(recoll_QueryObject *self, PyObject *, PyObject *) { - LOGDEB("Query_init\n" ); + LOGDEB("Query_init\n"); if (self->query) the_queries.erase(self->query); @@ -913,7 +916,7 @@ PyDoc_STRVAR(doc_Query_sortby, static PyObject * Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Query_sortby\n" ); + LOGDEB0("Query_sortby\n"); static const char *kwlist[] = {"field", "ascending", NULL}; char *sfield = 0; PyObject *ascobj = 0; @@ -936,7 +939,8 @@ Query_sortby(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) } PyDoc_STRVAR(doc_Query_execute, -"execute(query_string, stemming=1|0, stemlang=\"stemming language\")\n" +"execute(query_string, stemming=1|0, stemlang=\"stemming language\", " + "fetchtext=False)\n" "\n" "Starts a search for query_string, a Recoll search language string\n" "(mostly Xesam-compatible).\n" @@ -947,20 +951,28 @@ PyDoc_STRVAR(doc_Query_execute, static PyObject * Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Query_execute\n" ); - static const char *kwlist[] = {"query_string", "stemming", "stemlang", NULL}; + LOGDEB0("Query_execute\n"); + static const char *kwlist[] = {"query_string", "stemming", "stemlang", + "fetchtext", NULL}; char *sutf8 = 0; // needs freeing char *sstemlang = 0; - int dostem = 1; PyObject *dostemobj = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|Oes:Query_execute", + PyObject *fetchtextobj = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "es|OesO:Query_execute", (char**)kwlist, "utf-8", &sutf8, &dostemobj, - "utf-8", &sstemlang)) { + "utf-8", &sstemlang, &fetchtextobj)) { return 0; } + + bool dostem{true}; if (dostemobj != 0 && !PyObject_IsTrue(dostemobj)) - dostem = 0; + dostem = false; + if (fetchtextobj != 0 && PyObject_IsTrue(fetchtextobj)) { + self->fetchtext = true; + } else { + self->fetchtext = false; + } string utf8(sutf8); PyMem_Free(sutf8); @@ -970,7 +982,8 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) PyMem_Free(sstemlang); } - LOGDEB0("Query_execute: [" << (utf8) << "] dostem " << (dostem) << " stemlang [" << (stemlang) << "]\n" ); + LOGDEB0("Query_execute: [" << utf8 << "] dostem " << dostem << + " stemlang [" << stemlang << "]\n"); if (self->query == 0 || the_queries.find(self->query) == the_queries.end()) { @@ -999,7 +1012,7 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) } PyDoc_STRVAR(doc_Query_executesd, -"executesd(SearchData)\n" +"executesd(SearchData, fetchtext=False)\n" "\n" "Starts a search for the query defined by the SearchData object.\n" ); @@ -1007,12 +1020,13 @@ PyDoc_STRVAR(doc_Query_executesd, static PyObject * Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Query_executeSD\n" ); - static const char *kwlist[] = {"searchdata", NULL}; + LOGDEB0("Query_executeSD\n"); + static const char *kwlist[] = {"searchdata", "fetchtext", NULL}; recoll_SearchDataObject *pysd = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:Query_execute", - (char **)kwlist, - &recoll_SearchDataType, &pysd)) { + PyObject *fetchtextobj = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|O:Query_execute", + (char **)kwlist, &recoll_SearchDataType, + &pysd, &fetchtextobj)) { return 0; } if (pysd == 0 || self->query == 0 || @@ -1020,6 +1034,11 @@ Query_executesd(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_AttributeError, "query"); return 0; } + if (fetchtextobj != 0 && PyObject_IsTrue(fetchtextobj)) { + self->fetchtext = true; + } else { + self->fetchtext = false; + } self->query->setSortBy(*self->sortfield, self->ascending); self->query->setQuery(pysd->sd); int cnt = self->query->getResCnt(); @@ -1049,7 +1068,7 @@ PyDoc_STRVAR(doc_Query_fetchone, static PyObject * Query_fetchone(PyObject *_self) { - LOGDEB0("Query_fetchone/next\n" ); + LOGDEB0("Query_fetchone/next\n"); recoll_QueryObject* self = (recoll_QueryObject*)_self; if (self->query == 0 || @@ -1072,7 +1091,7 @@ Query_fetchone(PyObject *_self) // We used to check against rowcount here, but this was wrong: // xapian result count estimate are sometimes wrong, we must go on // fetching until we fail - if (!self->query->getDoc(self->next, *result->doc)) { + if (!self->query->getDoc(self->next, *result->doc, self->fetchtext)) { PyErr_SetNone(PyExc_StopIteration); return 0; } @@ -1090,7 +1109,7 @@ PyDoc_STRVAR(doc_Query_fetchmany, static PyObject * Query_fetchmany(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Query_fetchmany\n" ); + LOGDEB0("Query_fetchmany\n"); static const char *kwlist[] = {"size", NULL}; int size = 0; @@ -1120,7 +1139,7 @@ Query_fetchmany(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_EnvironmentError, "doc create failed"); return 0; } - if (!self->query->getDoc(self->next, *docobj->doc)) { + if (!self->query->getDoc(self->next, *docobj->doc, self->fetchtext)) { PyErr_SetNone(PyExc_StopIteration); break; } @@ -1140,7 +1159,7 @@ PyDoc_STRVAR(doc_Query_scroll, static PyObject * Query_scroll(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Query_scroll\n" ); + LOGDEB0("Query_scroll\n"); static const char *kwlist[] = {"position", "mode", NULL}; int pos = 0; char *smode = 0; @@ -1227,7 +1246,7 @@ public: static PyObject * Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Query_highlight\n" ); + LOGDEB0("Query_highlight\n"); static const char *kwlist[] = {"text", "ishtml", "eolbr", "methods", NULL}; char *sutf8 = 0; // needs freeing int ishtml = 0; @@ -1249,7 +1268,7 @@ Query_highlight(recoll_QueryObject* self, PyObject *args, PyObject *kwargs) ishtml = 1; if (eolbrobj && !PyObject_IsTrue(eolbrobj)) eolbr = 0; - LOGDEB0("Query_highlight: ishtml " << (ishtml) << "\n" ); + LOGDEB0("Query_highlight: ishtml " << ishtml << "\n"); if (self->query == 0 || the_queries.find(self->query) == the_queries.end()) { @@ -1287,7 +1306,7 @@ PyDoc_STRVAR(doc_Query_makedocabstract, static PyObject * Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs) { - LOGDEB0("Query_makeDocAbstract\n" ); + LOGDEB0("Query_makeDocAbstract\n"); static const char *kwlist[] = {"doc", "methods", NULL}; recoll_DocObject *pydoc = 0; PyObject *hlmethods = 0; @@ -1299,12 +1318,12 @@ Query_makedocabstract(recoll_QueryObject* self, PyObject *args,PyObject *kwargs) } if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) { - LOGERR("Query_makeDocAbstract: doc not found " << (pydoc->doc) << "\n" ); + LOGERR("Query_makeDocAbstract: doc not found " << pydoc->doc << "\n"); PyErr_SetString(PyExc_AttributeError, "doc"); return 0; } if (the_queries.find(self->query) == the_queries.end()) { - LOGERR("Query_makeDocAbstract: query not found " << (self->query) << "\n" ); + LOGERR("Query_makeDocAbstract: query not found " << self->query << "\n"); PyErr_SetString(PyExc_AttributeError, "query"); return 0; } @@ -1357,7 +1376,7 @@ PyDoc_STRVAR(doc_Query_getxquery, static PyObject * Query_getxquery(recoll_QueryObject* self, PyObject *, PyObject *) { - LOGDEB0("Query_getxquery self->query " << (self->query) << "\n" ); + LOGDEB0("Query_getxquery self->query " << self->query << "\n"); if (self->query == 0 || the_queries.find(self->query) == the_queries.end()) { @@ -1385,7 +1404,7 @@ PyDoc_STRVAR(doc_Query_getgroups, static PyObject * Query_getgroups(recoll_QueryObject* self, PyObject *, PyObject *) { - LOGDEB0("Query_getgroups\n" ); + LOGDEB0("Query_getgroups\n"); if (self->query == 0 || the_queries.find(self->query) == the_queries.end()) { @@ -1530,7 +1549,7 @@ typedef struct recoll_DbObject { static PyObject * Db_close(recoll_DbObject *self) { - LOGDEB("Db_close. self " << (self) << "\n" ); + LOGDEB("Db_close. self " << self << "\n"); if (self->db) { the_dbs.erase(self->db); delete self->db; @@ -1542,7 +1561,7 @@ Db_close(recoll_DbObject *self) static void Db_dealloc(recoll_DbObject *self) { - LOGDEB("Db_dealloc\n" ); + LOGDEB("Db_dealloc\n"); PyObject *ret = Db_close(self); Py_DECREF(ret); Py_TYPE(self)->tp_free((PyObject*)self); @@ -1551,7 +1570,7 @@ Db_dealloc(recoll_DbObject *self) static PyObject * Db_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - LOGDEB2("Db_new\n" ); + LOGDEB2("Db_new\n"); recoll_DbObject *self; self = (recoll_DbObject *)type->tp_alloc(type, 0); @@ -1583,7 +1602,7 @@ Db_init(recoll_DbObject *self, PyObject *args, PyObject *kwargs) } else { rclconfig = recollinit(0, 0, reason, 0); } - LOGDEB("Db_init\n" ); + LOGDEB("Db_init\n"); if (rclconfig == 0) { PyErr_SetString(PyExc_EnvironmentError, reason.c_str()); @@ -1599,7 +1618,7 @@ Db_init(recoll_DbObject *self, PyObject *args, PyObject *kwargs) delete self->db; self->db = new Rcl::Db(rclconfig); if (!self->db->open(writable ? Rcl::Db::DbUpd : Rcl::Db::DbRO)) { - LOGERR("Db_init: db open error\n" ); + LOGERR("Db_init: db open error\n"); PyErr_SetString(PyExc_EnvironmentError, "Can't open index"); return -1; } @@ -1642,9 +1661,9 @@ Db_init(recoll_DbObject *self, PyObject *args, PyObject *kwargs) static PyObject * Db_query(recoll_DbObject* self) { - LOGDEB("Db_query\n" ); + LOGDEB("Db_query\n"); if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_query: db not found " << (self->db) << "\n" ); + LOGERR("Db_query: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); return 0; } @@ -1663,18 +1682,19 @@ Db_query(recoll_DbObject* self) static PyObject * Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Db_setAbstractParams\n" ); + LOGDEB0("Db_setAbstractParams\n"); static const char *kwlist[] = {"maxchars", "contextwords", NULL}; int ctxwords = -1, maxchars = -1; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", (char**)kwlist, &maxchars, &ctxwords)) return 0; if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_query: db not found " << (self->db) << "\n" ); + LOGERR("Db_query: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db id not found"); return 0; } - LOGDEB0("Db_setAbstractParams: mxchrs " << (maxchars) << ", ctxwrds " << (ctxwords) << "\n" ); + LOGDEB0("Db_setAbstractParams: mxchrs " << maxchars << ", ctxwrds " << + ctxwords << "\n"); self->db->setAbstractParams(-1, maxchars, ctxwords); Py_RETURN_NONE; } @@ -1682,7 +1702,7 @@ Db_setAbstractParams(recoll_DbObject *self, PyObject *args, PyObject *kwargs) static PyObject * Db_makeDocAbstract(recoll_DbObject* self, PyObject *args) { - LOGDEB0("Db_makeDocAbstract\n" ); + LOGDEB0("Db_makeDocAbstract\n"); recoll_DocObject *pydoc = 0; recoll_QueryObject *pyquery = 0; if (!PyArg_ParseTuple(args, "O!O!:Db_makeDocAbstract", @@ -1691,18 +1711,18 @@ Db_makeDocAbstract(recoll_DbObject* self, PyObject *args) return 0; } if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_makeDocAbstract: db not found " << (self->db) << "\n" ); + LOGERR("Db_makeDocAbstract: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); return 0; } if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) { - LOGERR("Db_makeDocAbstract: doc not found " << (pydoc->doc) << "\n" ); + LOGERR("Db_makeDocAbstract: doc not found " << pydoc->doc << "\n"); PyErr_SetString(PyExc_AttributeError, "doc"); return 0; } if (pyquery->query == 0 || the_queries.find(pyquery->query) == the_queries.end()) { - LOGERR("Db_makeDocAbstract: query not found " << (pyquery->query) << "\n" ); + LOGERR("Db_makeDocAbstract: query not found " << pyquery->query << "\n"); PyErr_SetString(PyExc_AttributeError, "query"); return 0; } @@ -1727,7 +1747,7 @@ PyDoc_STRVAR(doc_Db_termMatch, static PyObject * Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs) { - LOGDEB0("Db_termMatch\n" ); + LOGDEB0("Db_termMatch\n"); static const char *kwlist[] = {"type", "expr", "field", "maxlen", "casesens", "diacsens", "lang", NULL}; char *tp = 0; @@ -1750,7 +1770,7 @@ Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs) return 0; if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_termMatch: db not found " << (self->db) << "\n" ); + LOGERR("Db_termMatch: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); goto out; } @@ -1775,7 +1795,7 @@ Db_termMatch(recoll_DbObject* self, PyObject *args, PyObject *kwargs) if (!self->db->termMatch(typ_sens, lang ? lang : "english", expr, result, maxlen, field ? field : "")) { - LOGERR("Db_termMatch: db termMatch error\n" ); + LOGERR("Db_termMatch: db termMatch error\n"); PyErr_SetString(PyExc_AttributeError, "rcldb termMatch error"); goto out; } @@ -1796,7 +1816,7 @@ out: static PyObject * Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds) { - LOGDEB0("Db_needUpdate\n" ); + LOGDEB0("Db_needUpdate\n"); char *udi = 0; // needs freeing char *sig = 0; // needs freeing if (!PyArg_ParseTuple(args, "eses:Db_needUpdate", @@ -1804,7 +1824,7 @@ Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds) return 0; } if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_needUpdate: db not found " << (self->db) << "\n" ); + LOGERR("Db_needUpdate: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); PyMem_Free(udi); PyMem_Free(sig); @@ -1819,13 +1839,13 @@ Db_needUpdate(recoll_DbObject* self, PyObject *args, PyObject *kwds) static PyObject * Db_delete(recoll_DbObject* self, PyObject *args, PyObject *kwds) { - LOGDEB0("Db_delete\n" ); + LOGDEB0("Db_delete\n"); char *udi = 0; // needs freeing if (!PyArg_ParseTuple(args, "es:Db_delete", "utf-8", &udi)) { return 0; } if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_delete: db not found " << (self->db) << "\n" ); + LOGERR("Db_delete: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); PyMem_Free(udi); return 0; @@ -1838,9 +1858,9 @@ Db_delete(recoll_DbObject* self, PyObject *args, PyObject *kwds) static PyObject * Db_purge(recoll_DbObject* self) { - LOGDEB0("Db_purge\n" ); + LOGDEB0("Db_purge\n"); if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_purge: db not found " << (self->db) << "\n" ); + LOGERR("Db_purge: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); return 0; } @@ -1851,7 +1871,7 @@ Db_purge(recoll_DbObject* self) static PyObject * Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *) { - LOGDEB0("Db_addOrUpdate\n" ); + LOGDEB0("Db_addOrUpdate\n"); char *sudi = 0; // needs freeing char *sparent_udi = 0; // needs freeing recoll_DocObject *pydoc; @@ -1867,17 +1887,17 @@ Db_addOrUpdate(recoll_DbObject* self, PyObject *args, PyObject *) PyMem_Free(sparent_udi); if (self->db == 0 || the_dbs.find(self->db) == the_dbs.end()) { - LOGERR("Db_addOrUpdate: db not found " << (self->db) << "\n" ); + LOGERR("Db_addOrUpdate: db not found " << self->db << "\n"); PyErr_SetString(PyExc_AttributeError, "db"); return 0; } if (pydoc->doc == 0 || the_docs.find(pydoc->doc) == the_docs.end()) { - LOGERR("Db_addOrUpdate: doc not found " << (pydoc->doc) << "\n" ); + LOGERR("Db_addOrUpdate: doc not found " << pydoc->doc << "\n"); PyErr_SetString(PyExc_AttributeError, "doc"); return 0; } if (!self->db->addOrUpdate(udi, parent_udi, *pydoc->doc)) { - LOGERR("Db_addOrUpdate: rcldb error\n" ); + LOGERR("Db_addOrUpdate: rcldb error\n"); PyErr_SetString(PyExc_AttributeError, "rcldb error"); return 0; } @@ -1992,7 +2012,7 @@ static PyTypeObject recoll_DbType = { static PyObject * recoll_connect(PyObject *self, PyObject *args, PyObject *kwargs) { - LOGDEB2("recoll_connect\n" ); + LOGDEB2("recoll_connect\n"); recoll_DbObject *db = (recoll_DbObject *) PyObject_Call((PyObject *)&recoll_DbType, args, kwargs); return (PyObject *)db; diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp index 164a31f0..d9500d09 100644 --- a/src/rcldb/rclabsfromtext.cpp +++ b/src/rcldb/rclabsfromtext.cpp @@ -329,41 +329,10 @@ int Query::Native::abstractFromText( ) { Xapian::Database& xrdb(ndb->xrdb); - Xapian::Document xdoc; - string reason; - XAPTRY(xdoc = xrdb.get_document(docid), xrdb, reason); - if (!reason.empty()) { - LOGERR("abstractFromText: could not get doc: " << reason << endl); - return ABSRES_ERROR; - } - - string rawtext, data; -#ifdef RAWTEXT_IN_DATA - XAPTRY(data = xdoc.get_data(), xrdb, reason); - if (!reason.empty()) { - LOGERR("abstractFromText: could not get data: " << reason << endl); - return ABSRES_ERROR; - } - Doc doc; - if (ndb->dbDataToRclDoc(docid, data, doc)) { - rawtext = doc.meta["RAWTEXT"]; - } -#endif -#ifdef RAWTEXT_IN_METADATA - XAPTRY(rawtext = ndb->xrdb.get_metadata(ndb->rawtextMetaKey(docid)), - ndb->xrdb, reason); - if (!reason.empty()) { - LOGERR("abstractFromText: could not get value: " << reason << endl); - return ABSRES_ERROR; - } - ZLibUtBuf cbuf; - inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf); - rawtext.assign(cbuf.getBuf(), cbuf.getCnt()); -#endif - - if (rawtext.empty()) { - LOGDEB0("abstractFromText: no text\n"); + string rawtext; + if (!ndb->getRawText(docid, rawtext)) { + LOGDEB0("abstractFromText: can't fetch text\n"); return ABSRES_ERROR; } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index a2f4b8ee..c7cb5093 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -202,12 +202,8 @@ void *DbUpdWorker(void* vdbp) switch (tsk->op) { case DbUpdTask::AddOrUpdate: LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n"); - status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm, - tsk->doc, tsk->txtlen -#ifdef RAWTEXT_IN_METADATA - , tsk->rawztext -#endif - ); + status = ndbp->addOrUpdateWrite( + tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext); break; case DbUpdTask::Delete: LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n"); @@ -267,9 +263,12 @@ void Db::Native::openWrite(const string& dir, Db::OpenMode mode) // to force using Chert. No sense in doing this if we are // storing the text anyway. #if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND - // New Xapian with Chert support. Use Chert and the old - // abstract generation method, except if told otherwise by the - // configuration. + // Xapian with Glass and Chert support. If storedoctext is + // specified in the configuration, use the default backend + // (Glass), else force Chert. There might be reasons why + // someone would want to use Chert and store text anyway, but + // it's an exotic case, and things are complicated enough + // already. if (o_index_storedoctext) { xwdb = Xapian::WritableDatabase(dir, action); m_storetext = true; @@ -286,15 +285,13 @@ void Db::Native::openWrite(const string& dir, Db::OpenMode mode) xwdb = Xapian::WritableDatabase(stub, action); m_storetext = false; } -#elif ! XAPIAN_AT_LEAST(1,3,0) - // Old Xapian. Use the default index format and let the user - // decide of the abstract generation method. +#elif (! XAPIAN_AT_LEAST(1,3,0)) || XAPIAN_AT_LEAST(1,5,0) + // Old Xapian (chert only) or newer (no chert). Use the + // default index backend and let the user decide of the + // abstract generation method. The configured default is to + // store the text. xwdb = Xapian::WritableDatabase(dir, action); m_storetext = o_index_storedoctext; -#else - // Newer Xapian with no Chert support. Store the text. - xwdb = Xapian::WritableDatabase(dir, action); - m_storetext = true; #endif // Set the storetext value inside the index descriptor (new // with recoll 1.24, maybe we'll have other stuff to store in @@ -533,7 +530,7 @@ Xapian::docid Db::Native::getDoc(const string& udi, int idxi, // Turn data record from db into document fields bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, - Doc &doc) + Doc &doc, bool fetchtext) { LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n"); ConfSimple parms(data); @@ -593,6 +590,9 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, } doc.meta[Doc::keyurl] = doc.url; doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime; + if (fetchtext) { + getRawText(docid, doc.text); + } return true; } @@ -672,16 +672,33 @@ int Db::Native::getPageNumberForPosition(const vector& pbreaks, int pos) return int(it - pbreaks.begin() + 1); } +bool Db::Native::getRawText(Xapian::docid docid, string& rawtext) +{ + if (!m_storetext) { + LOGDEB("Db::Native::getRawText: document text not stored in index\n"); + return false; + } + string reason; + XAPTRY(rawtext = xrdb.get_metadata(rawtextMetaKey(docid)), xrdb, reason); + if (!reason.empty()) { + LOGERR("Rcl::Db::getRawText: could not get value: " << reason << endl); + return false; + } + if (rawtext.empty()) { + return true; + } + ZLibUtBuf cbuf; + inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf); + rawtext.assign(cbuf.getBuf(), cbuf.getCnt()); + return true; +} + // Note: we're passed a Xapian::Document* because Xapian // reference-counting is not mt-safe. We take ownership and need // to delete it before returning. -bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, - Xapian::Document *newdocument_ptr, - size_t textlen -#ifdef RAWTEXT_IN_METADATA - , const string& rawztext -#endif - ) +bool Db::Native::addOrUpdateWrite( + const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr, + size_t textlen, const string& rawztext) { #ifdef IDX_THREADS Chrono chron; @@ -738,7 +755,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, } } -#ifdef RAWTEXT_IN_METADATA XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext), xwdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { @@ -746,7 +762,6 @@ bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, m_rcldb->m_reason << "\n"); // This only affects snippets, so let's say not fatal } -#endif // Test if we're over the flush threshold (limit memory usage): bool ret = m_rcldb->maybeflush(textlen); @@ -1436,9 +1451,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) // Udi unique term: this is used for file existence/uptodate // checks, and unique id for the replace_document() call. string uniterm = make_uniterm(udi); -#if defined(RAWTEXT_IN_METADATA) - string rawztext; // Doc compressed text -#endif + string rawztext; // Doc compressed text if (doc.onlyxattr) { // Only updating an existing doc with new extended attributes @@ -1553,13 +1566,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) if (!splitter.text_to_words(doc.text)) { LOGDEB("Db::addOrUpdate: split failed for main text\n"); } else { -#if defined(RAWTEXT_IN_METADATA) if (m_ndb->m_storetext) { ZLibUtBuf buf; deflateToBuf(doc.text.c_str(), doc.text.size(), buf); rawztext.assign(buf.getBuf(), buf.getCnt()); } -#endif } #ifdef TEXTSPLIT_STATS @@ -1771,23 +1782,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) newdocument.add_boolean_term(wrap_prefix("XM") + *md5); } -#ifdef RAWTEXT_IN_DATA - if (m_ndb->m_storetext) { - RECORD_APPEND(record, string("RAWTEXT"), - neutchars(doc.text, cstr_nc)); - } -#endif LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n"); newdocument.set_data(record); } #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { - DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, - newdocument_ptr, doc.text.length() -#ifdef RAWTEXT_IN_METADATA - , rawztext -#endif - ); + DbUpdTask *tp = new DbUpdTask( + DbUpdTask::AddOrUpdate, udi, uniterm, newdocument_ptr, + doc.text.length(), rawztext); if (!m_ndb->m_wqueue.put(tp)) { LOGERR("Db::addOrUpdate:Cant queue task\n"); delete newdocument_ptr; @@ -1799,11 +1801,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) #endif return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr, - doc.text.length() -#ifdef RAWTEXT_IN_METADATA - , rawztext -#endif - ); + doc.text.length(), rawztext); } bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, @@ -2230,11 +2228,7 @@ bool Db::purgeFile(const string &udi, bool *existed) if (m_ndb->m_havewriteq) { string rztxt; DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, - 0, (size_t)-1, -#if defined(RAWTEXT_IN_METADATA) - rztxt -#endif - ); + 0, (size_t)-1, rztxt); if (!m_ndb->m_wqueue.put(tp)) { LOGERR("Db::purgeFile:Cant queue task\n"); return false; @@ -2262,11 +2256,7 @@ bool Db::purgeOrphans(const string &udi) if (m_ndb->m_havewriteq) { string rztxt; DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, - 0, (size_t)-1, -#ifdef RAWTEXT_IN_METADATA - rztxt -#endif - ); + 0, (size_t)-1, rztxt); if (!m_ndb->m_wqueue.put(tp)) { LOGERR("Db::purgeFile:Cant queue task\n"); return false; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index cc127c12..3e249fc8 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -63,8 +63,8 @@ namespace Rcl { // is incompatible anyway. enum value_slot { // Omega-compatible values: - VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970. - VALUE_MD5 = 1, // 16 byte MD5 checksum of original document. + VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970. + VALUE_MD5 = 1, // 16 byte MD5 checksum of original document. VALUE_SIZE = 2, // sortable_serialise() ////////// Recoll only: @@ -80,24 +80,16 @@ class Query; class TermMatchEntry { public: TermMatchEntry() - : wcf(0) - { - } + : wcf(0) {} TermMatchEntry(const string& t, int f, int d) - : term(t), wcf(f), docs(d) - { - } + : term(t), wcf(f), docs(d) {} TermMatchEntry(const string& t) - : term(t), wcf(0) - { + : term(t), wcf(0) {} + bool operator==(const TermMatchEntry &o) const { + return term == o.term; } - bool operator==(const TermMatchEntry &o) const - { - return term == o.term; - } - bool operator<(const TermMatchEntry &o) const - { - return term < o.term; + bool operator<(const TermMatchEntry &o) const { + return term < o.term; } string term; @@ -108,13 +100,11 @@ public: /** Term match result list header: statistics and global info */ class TermMatchResult { public: - TermMatchResult() - { - clear(); + TermMatchResult() { + clear(); } - void clear() - { - entries.clear(); + void clear() { + entries.clear(); } // Term expansion vector entries; @@ -125,7 +115,7 @@ public: class DbStats { public: DbStats() - :dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) { } + :dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0) {} // Index-wide stats unsigned int dbdoccount; double dbavgdoclen; @@ -137,27 +127,27 @@ public: inline bool has_prefix(const string& trm) { if (o_index_stripchars) { - return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z'; + return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z'; } else { - return !trm.empty() && trm[0] == ':'; + return !trm.empty() && trm[0] == ':'; } } inline string strip_prefix(const string& trm) { if (trm.empty()) - return trm; + return trm; string::size_type st = 0; if (o_index_stripchars) { - st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ"); - if (st == string::npos) - return string(); + st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ"); + if (st == string::npos) + return string(); } else { - if (has_prefix(trm)) { - st = trm.find_last_of(":") + 1; - } else { - return trm; - } + if (has_prefix(trm)) { + st = trm.find_last_of(":") + 1; + } else { + return trm; + } } return trm.substr(st); } @@ -165,9 +155,9 @@ inline string strip_prefix(const string& trm) inline string wrap_prefix(const string& pfx) { if (o_index_stripchars) { - return pfx; + return pfx; } else { - return cstr_colon + pfx + cstr_colon; + return cstr_colon + pfx + cstr_colon; } } @@ -175,7 +165,7 @@ inline string wrap_prefix(const string& pfx) * Wrapper class for the native database. */ class Db { - public: +public: // A place for things we don't want visible here. class Native; friend class Native; @@ -203,13 +193,11 @@ class Db { * special chars... * @param with_aspell test for use with aspell, else for xapian speller */ - static bool isSpellingCandidate(const string& term, bool with_aspell=true) - { - if (term.empty() || term.length() > 50) - return false; - if (has_prefix(term)) - return false; - Utf8Iter u8i(term); + static bool isSpellingCandidate(const string& term, bool with_aspell=true) { + if (term.empty() || term.length() > 50 || has_prefix(term)) + return false; + + Utf8Iter u8i(term); if (with_aspell) { // If spelling with aspell, neither katakana nor other cjk // scripts are candidates @@ -232,10 +220,10 @@ class Db { return false; #endif } - if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") - != string::npos) - return false; - return true; + if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") + != string::npos) + return false; + return true; } /** Return spelling suggestion */ @@ -283,7 +271,7 @@ class Db { void setExistingFlags(const string& udi, unsigned int docid); /** Indicate if we are doing a systematic reindex. This complements - needUpdate() return */ + needUpdate() return */ bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;} /** Add or update document identified by unique identifier. @@ -305,8 +293,8 @@ class Db { * much as possible depending on the document type. * ** doc will be modified in a destructive way ** */ - bool addOrUpdate(const string &udi, - const string &parent_udi, Doc &doc); + bool addOrUpdate(const string &udi, const string &parent_udi, Doc &doc); + #ifdef IDX_THREADS void waitUpdIdle(); #endif @@ -314,8 +302,8 @@ class Db { /** Delete document(s) for given UDI, including subdocs */ bool purgeFile(const string &udi, bool *existed = 0); /** Delete subdocs with an out of date sig. We do this to purge - obsolete subdocs during a partial update where no general purge - will be done */ + obsolete subdocs during a partial update where no general purge + will be done */ bool purgeOrphans(const string &udi); /** Remove documents that no longer exist in the file system. This @@ -377,20 +365,19 @@ class Db { * in the TermMatchResult header */ enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3, - ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64}; - int matchTypeTp(int tp) - { - return tp & 7; + ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64}; + int matchTypeTp(int tp) { + return tp & 7; } bool termMatch(int typ_sens, const string &lang, const string &term, - TermMatchResult& result, int max = -1, - const string& field = "", vector *multiwords = 0); + TermMatchResult& result, int max = -1, + const string& field = "", vector *multiwords = 0); bool dbStats(DbStats& stats, bool listFailed); /** Return min and max years for doc mod times in db */ bool maxYearSpan(int *minyear, int *maxyear); /** Return all mime types in index. This can be different from the - ones defined in the config because of 'file' command - usage. Inserts the types at the end of the parameter */ + ones defined in the config because of 'file' command + usage. Inserts the types at the end of the parameter */ bool getAllDbMimeTypes(std::vector&); /** Wildcard expansion specific to file names. Internal/sdata use only */ @@ -398,13 +385,11 @@ class Db { /** Set parameters for synthetic abstract generation */ void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); - int getAbsCtxLen() const - { - return m_synthAbsWordCtxLen; + int getAbsCtxLen() const { + return m_synthAbsWordCtxLen; } - int getAbsLen() const - { - return m_synthAbsLen; + int getAbsLen() const { + return m_synthAbsLen; } /** Get document for given udi * @@ -453,28 +438,26 @@ class Db { bool termExists(const string& term); /** Test if terms stem to different roots. */ bool stemDiffers(const string& lang, const string& term, - const string& base); + const string& base); const RclConfig *getConf() {return m_config;} /** - Activate the "in place reset" mode where all documents are - considered as needing update. This is a global/per-process - option, and can't be reset. It should be set at the start of - the indexing pass. 2012-10: no idea why this is done this way... + Activate the "in place reset" mode where all documents are + considered as needing update. This is a global/per-process + option, and can't be reset. It should be set at the start of + the indexing pass. 2012-10: no idea why this is done this way... */ static void setInPlaceReset() {o_inPlaceReset = true;} /** Flush interval get/set. This is used by the first indexing - pass to override the config value and flush more rapidly - initially so that the user can quickly play with queries */ - int getFlushMb() - { - return m_flushMb; + pass to override the config value and flush more rapidly + initially so that the user can quickly play with queries */ + int getFlushMb() { + return m_flushMb; } - void setFlushMb(int mb) - { - m_flushMb = mb; + void setFlushMb(int mb) { + m_flushMb = mb; } bool doFlush(); @@ -556,8 +539,8 @@ private: // Reinitialize when adding/removing additional dbs bool adjustdbs(); bool idxTermMatch(int typ_sens, const string &lang, const string &term, - TermMatchResult& result, int max = -1, - const string& field = cstr_null); + TermMatchResult& result, int max = -1, + const string& field = cstr_null); // Flush when idxflushmb is reached bool maybeflush(int64_t moretext); diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index 1db35780..d9c3c8d5 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -30,9 +30,6 @@ #endif // IDX_THREADS #include "xmacros.h" -// Store raw doc text in data record or metadata ? -#undef RAWTEXT_IN_DATA -#define RAWTEXT_IN_METADATA namespace Rcl { @@ -55,15 +52,10 @@ public: // available on the caller site. // Take some care to avoid sharing string data (if string impl is cow) DbUpdTask(Op _op, const string& ud, const string& un, - Xapian::Document *d, size_t tl -#ifdef RAWTEXT_IN_METADATA - , string& rztxt -#endif + Xapian::Document *d, size_t tl, string& rztxt ) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()), doc(d), txtlen(tl) { -#ifdef RAWTEXT_IN_METADATA rawztext.swap(rztxt); -#endif } // Udi and uniterm equivalently designate the doc Op op; @@ -74,9 +66,7 @@ public: // purge because we actually don't know it, and the code fakes a // text length based on the term count. size_t txtlen; -#ifdef RAWTEXT_IN_METADATA string rawztext; // Compressed doc text -#endif }; #endif // IDX_THREADS @@ -119,10 +109,7 @@ class Db::Native { // Final steps of doc update, part which need to be single-threaded bool addOrUpdateWrite(const string& udi, const string& uniterm, Xapian::Document *doc, size_t txtlen -#ifdef RAWTEXT_IN_METADATA - , const string& rawztext -#endif - ); + , const string& rawztext); /** Delete all documents which are contained in the input document, * which must be a file-level one. @@ -141,7 +128,8 @@ class Db::Native { bool getPagePositions(Xapian::docid docid, vector& vpos); int getPageNumberForPosition(const vector& pbreaks, int pos); - bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc); + bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc, + bool fetchtext = false); size_t whatDbIdx(Xapian::docid id); @@ -193,7 +181,6 @@ class Db::Native { /** Check if a page position list is defined */ bool hasPages(Xapian::docid id); -#ifdef RAWTEXT_IN_METADATA std::string rawtextMetaKey(Xapian::docid did) { // Xapian's Olly Betts avises to use a key which will // sort the same as the docid (which we do), and to @@ -205,10 +192,10 @@ class Db::Native { sprintf(buf, "%010d", did); return buf; } -#endif + + bool getRawText(Xapian::docid docid, string& rawtext); void deleteDocument(Xapian::docid docid) { -#ifdef RAWTEXT_IN_METADATA string metareason; XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), string()), xwdb, metareason); @@ -217,7 +204,6 @@ class Db::Native { metareason << "\n"); // not fatal } -#endif xwdb.delete_document(docid); } }; diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 31af8288..a92ac9fd 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -64,14 +64,14 @@ static const string& docfToDatf(const string& df) // custom field data will have to be processed before insertion to // achieve equivalent results. #if XAPIAN_MAJOR_VERSION == 1 && XAPIAN_MINOR_VERSION < 2 -class QSorter : public Xapian::Sorter { +class QSorter : public Xapian::Sorter #else -class QSorter : public Xapian::KeyMaker { +class QSorter : public Xapian::KeyMaker #endif +{ public: QSorter(const string& f) - : m_fld(docfToDatf(f) + "=") - { + : m_fld(docfToDatf(f) + "=") { m_ismtime = !m_fld.compare("dmtime="); if (m_ismtime) m_issize = false; @@ -80,8 +80,7 @@ public: !m_fld.compare("pcbytes="); } - virtual std::string operator()(const Xapian::Document& xdoc) const - { + virtual std::string operator()(const Xapian::Document& xdoc) const { string data = xdoc.get_data(); // It would be simpler to do the record->Rcl::Doc thing, but // hand-doing this will be faster. It makes more assumptions @@ -372,7 +371,7 @@ int Query::getResCnt() // Note that as stated by a Xapian developer, Enquire searches from // scratch each time get_mset() is called. So the better performance // on subsequent calls is probably only due to disk caching. -bool Query::getDoc(int xapi, Doc &doc) +bool Query::getDoc(int xapi, Doc &doc, bool fetchtext) { LOGDEB1("Query::getDoc: xapian enquire index " << xapi << "\n"); if (ISNULL(m_nq) || !m_nq->xenquire) { @@ -451,7 +450,7 @@ bool Query::getDoc(int xapi, Doc &doc) } // Parse xapian document's data and populate doc fields - return m_db->m_ndb->dbDataToRclDoc(docid, data, doc); + return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, fetchtext); } vector Query::expand(const Doc &doc) diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index 2bb6c43a..9746cb89 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -66,25 +66,21 @@ class Query { ~Query(); /** Get explanation about last error */ - std::string getReason() const - { + std::string getReason() const { return m_reason; } /** Choose sort order. Must be called before setQuery */ void setSortBy(const std::string& fld, bool ascending = true); - const std::string& getSortBy() const - { + const std::string& getSortBy() const { return m_sortField; } - bool getSortAscending() const - { + bool getSortAscending() const { return m_sortAscending; } /** Return or filter results with identical content checksum */ - void setCollapseDuplicates(bool on) - { + void setCollapseDuplicates(bool on) { m_collapseDuplicates = on; } @@ -98,7 +94,7 @@ class Query { int getResCnt(); /** Get document at rank i in current query results. */ - bool getDoc(int i, Doc &doc); + bool getDoc(int i, Doc &doc, bool fetchtext = false); /** Get possibly expanded list of query terms */ bool getQueryTerms(std::vector& terms); @@ -117,8 +113,7 @@ class Query { int getFirstMatchPage(const Doc &doc, std::string& term); /** Retrieve a reference to the searchData we are using */ - std::shared_ptr getSD() - { + std::shared_ptr getSD() { return m_sd; } @@ -126,8 +121,7 @@ class Query { std::vector expand(const Doc &doc); /** Return the Db we're set for */ - Db *whatDb() const - { + Db *whatDb() const { return m_db; }