From 5c2f62ae00865121dcdc2429d7c5ddbaa1a4a106 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 15 Dec 2020 17:34:47 +0100 Subject: [PATCH] test prog --- src/testmains/Makefile.am | 6 +- src/testmains/rclqdocmem.cpp | 634 +++++++++++++++++++++++++++++++++++ 2 files changed, 639 insertions(+), 1 deletion(-) create mode 100644 src/testmains/rclqdocmem.cpp diff --git a/src/testmains/Makefile.am b/src/testmains/Makefile.am index aeefa5d9..7e54f4a2 100644 --- a/src/testmains/Makefile.am +++ b/src/testmains/Makefile.am @@ -20,6 +20,7 @@ COMMONCPPFLAGS = -I. \ -I$(top_srcdir)/common \ -I$(top_srcdir)/index \ -I$(top_srcdir)/internfile \ + -I$(top_srcdir)/query \ -I$(top_srcdir)/rcldb \ -I$(top_srcdir)/unac \ -I$(top_srcdir)/utils \ @@ -38,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \ $(DEFS) noinst_PROGRAMS = textsplit utf8iter fstreewalk rclconfig hldata unac mbox \ - circache wipedir mimetype pathut fileudi x11mon + circache wipedir mimetype pathut fileudi x11mon rclqdocmem circache_SOURCES = trcircache.cpp circache_LDADD = ../librecoll.la @@ -78,3 +79,6 @@ wipedir_LDADD = ../librecoll.la x11mon_SOURCES = trx11mon.cpp x11mon_LDADD = ../utils/x11mon.o ../librecoll.la -lX11 + +rclqdocmem_SOURCES = rclqdocmem.cpp +rclqdocmem_LDADD = ../librecoll.la diff --git a/src/testmains/rclqdocmem.cpp b/src/testmains/rclqdocmem.cpp new file mode 100644 index 00000000..516eeb40 --- /dev/null +++ b/src/testmains/rclqdocmem.cpp @@ -0,0 +1,634 @@ +/* Copyright (C) 2017-2019 J.F.Dockes + * + * License: GPL 2.1 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+// obstack.h expects these two macros to name the chunk allocator and
+// deallocator used by obstack_init().
+#define obstack_chunk_alloc malloc
+#define obstack_chunk_free free
+#include <obstack.h>
+// Copy the std::string S, including its terminating zero, into obstack STK
+// and yield a pointer to the copy.
+#define OBSTACK_STRINGCOPY(STK, S) \
+    ((char *)obstack_copy(&(STK), (S).c_str(), (S).size()+1))
+
+#include "rclinit.h"
+#include "rclconfig.h"
+#include "rcldb.h"
+#include "searchdata.h"
+#include "rclquery.h"
+#include "pathut.h"
+#include "rclutil.h"
+#include "wasatorcl.h"
+#include "log.h"
+#include "pathut.h"
+#include "plaintorich.h"
+#include "hldata.h"
+#include "smallut.h"
+
+
+// Recoll configuration directory passed to recollinit() by main(). Hard-coded:
+// edit and rebuild to measure another index.
+//const std::string confdir{"/home/dockes/.recoll-prod"};
+const std::string confdir{"/var/cache/upmpdcli/uprcl"};
+// Divisor used by meminfo() to print byte counts as megabytes.
+const int MB = 1024 *1024;
+
+// Storage strategy selection. main() compiles the #if/#elif branch matching
+// the one symbol left #defined here; the others stay #undef'd.
+// Docs as docs
+#undef STORE_DOCS
+// Docs as map
+#undef STORE_MAPS_RAW
+// Docs as map, shared key storage
+#undef STORE_MAPS_SHAREDKEYS
+// Docs as map shared key storage+obstack for data
+#undef STORE_MAPS_SHAREDKEYS_OBSTACK
+// Docs as vector shared key-to-idx storage + pointer vector
+#define STORE_ARRAYS
+// Minimal, not really usable, needs linear search inside record to find field
+#undef STORE_ALLOBSTACK
+
+// Print "<msg> : <n> MB" on stderr, where n is the uordblks value reported
+// by mallinfo() divided by MB (integer division).
+// NOTE(review): mallinfo() fields are plain int, so the figure presumably
+// wraps for heaps above 2 GB -- confirm before using on very large indexes.
+static void meminfo(const char *msg)
+{
+    struct mallinfo minfo = mallinfo();
+    std::cerr << msg << " : " << minfo.uordblks / MB << " MB\n";
+}
+
+// Tell whether a metadata (key, value) pair should be stored.
+// entry.first is the field name, entry.second its value.
+// Returns true when the value is non-empty and the field name is not one of
+// the excluded names listed below. Building with ALLENTRIES defined (i.e.
+// removing the #undef) accepts every entry unconditionally.
+static inline bool testentry(const std::pair<const std::string, std::string>& entry)
+{
+#undef ALLENTRIES
+#ifdef ALLENTRIES
+    return true;
+#else
+    return !entry.second.empty() &&
+        entry.first != "author" && entry.first != "ipath" &&
+        entry.first != "rcludi" &&
+        entry.first != "relevancyrating" && entry.first != "sig" &&
+        entry.first != "abstract" && entry.first != "caption" &&
+        entry.first != "filename" && entry.first != "origcharset";
+#endif
+}
+
+
+int
main(int argc, char *argv[]) +{ + std::string reason; + RclConfig *rclconfig = recollinit(0, 0, 0, reason, &confdir); + if (!rclconfig || !rclconfig->ok()) { + std::cerr << "Recoll init failed: " << reason << "\n"; + return 1; + } + meminfo("Before db open"); + Rcl::Db rcldb(rclconfig); + if (!rcldb.open(Rcl::Db::DbRO)) { + std::cerr << "db open error\n"; + return 1; + } + + Rcl::Query query(&rcldb); + std::shared_ptr sd{wasaStringToRcl( + rclconfig, "english", "mime:*", reason)}; + if (!sd) { + std::cerr << "wasStringToRcl: " << reason << "\n"; + return 1; + } + + meminfo("Before setquery"); + + query.setQuery(sd); + + int cnt = query.getResCnt(); + + meminfo("After getResCnt"); + + std::cerr << "Got " << cnt << " estimated results\n"; + +#if !defined(STORE_ARRAYS) + // Store_arrays needs 2 walks anyway, to find the field names + int i = 0; + for (;;) { + Rcl::Doc doc; + if (!query.getDoc(i++, doc, false)) { + break; + } + } + int imax = i; + meminfo("After getDocs"); + std::cerr << "Got " << imax << " docs\n"; +#endif // !STORE_ARRAYS + +#if defined(STORE_DOCS) + // + // 25 kdocs (music tags) + // Each result stored as Rcl::Doc: 85 MB + // + std::vector docs; + docs.resize(imax); + meminfo("After resize"); + for (i=0;i 58 MB + // + std::vector> docs; + docs.resize(imax); + meminfo("After resize"); + for (i=0;i" << entry.second <<"\n"; + vdoc.insert(entry); + } + } + //std::cerr << "\n"; + } + +#elif defined(STORE_MAPS_SHAREDKEYS) + // + // Each result stored as map with shared keys: + // Memory: audio: 56 MB, main: 221 + // + std::set keys; + std::vector> docs; + docs.resize(imax); + meminfo("After resize"); + std::string cstr_url{"url"}; + std::string cstr_mt{"mimetype"}; + std::string cstr_fmt{"fmtime"}; + std::string cstr_dmt{"dmtime"}; + std::string cstr_fb{"fbytes"}; + std::string cstr_db{"dbytes"}; + for (i=0;i" << entry.second <<"\n"; + vdoc.insert({&*it, entry.second}); + } + } + //std::cerr << "\n"; + } + +#elif defined(STORE_MAPS_SHAREDKEYS_OBSTACK) + 
// + // Each result stored as map with shared keys and + // obstack string storage: + // Memory: audio: 54 MB main: 213 MB + // + std::set keys; + std::vector> docs; + struct obstack obst; + obstack_init(&obst); + obstack_chunk_size(&obst) = 1024*1024; + docs.resize(imax); + meminfo("After resize"); + std::string cstr_url{"url"}; + std::string cstr_mt{"mimetype"}; + std::string cstr_fmt{"fmtime"}; + std::string cstr_dmt{"dmtime"}; + std::string cstr_fb{"fbytes"}; + std::string cstr_db{"dbytes"}; + for (i=0;i" << cp <<"\n"; + vdoc.insert({&*it, cp}); + } + } + //std::cerr << "\n"; + } + +#elif defined(STORE_ARRAYS) + + // + // Each result stored as a vector with a shared + // key->intidx map to store the key name to index mapping, and and + // obstack string storage for the values. + // Memory: audio: 29 MB. Main index: 95 MB + // + // On the audio test index, the total string size computed below + // is 8 MB for 25169 docs, including terminating zeroes for + // non-present fields (see the counting in the last version below). + // + // Residual mystery: + // After the obstack_free, the residual memory usage (audio) is 13 MB. + // So 29-13 = 16 MB for the obstack storage. Why?? + // the Vector size should be around 25kdocs * 59keys * 8bytes = 12 MB + // + // On the main db: 151350 docs 16 keys 95 MB. + // After free 38 MB. Array is 151k * 16 * 8 = 19 MB + // 19 MB missing ?? + // + // The missing bytes ? probably reflect the immense amount of + // allocations performed by the program while building the + // array. Valgrind: + // HEAP SUMMARY: + // in use at exit: 652,298 bytes in 3,033 blocks + // total heap usage: 8,613,584 allocs, 8,610,551 frees, + // 4,421,628,817 bytes allocated + // + // Possible optimizations: the record arrays don't really need to + // be all pointers, and could use a single char * followed by byte + // displacements which could be 32 (or even 16?) bits. 
This would + // gain at least an additional 9 MB, getting the storage size to + // around 20 MB for the audio 25k index. This is really much + // better than the initial 85 (or even 58 for maps), with a read + // performance impact which should be quite modest. + // ** This supposes that we don't use obstack though, as obstack + // placement is unpredictable. + std::map keyidx { + {"url",0}, + {"mimetype", 1}, + {"fmtime", 2}, + {"dmtime", 3}, + {"fbytes", 4}, + {"dbytes", 5}, + }; + + int ndocs = 0; + for (;;ndocs++) { + Rcl::Doc doc; + if (!query.getDoc(ndocs, doc, false)) { + break; + } + for (const auto& entry : doc.meta) { + if (testentry(entry)) { + auto it = keyidx.find(entry.first); + if (it == keyidx.end()) { + int idx = keyidx.size(); + keyidx.insert({entry.first, idx}); + }; + } + } + } + // 49 keys ! + std::cerr << "Found " << keyidx.size() << " different keys\n"; + + std::vector> docs; + docs.resize(ndocs); + meminfo("After resize"); + + for (int i = 0; i < ndocs; i++) { + Rcl::Doc doc; + if (!query.getDoc(i, doc, false)) { + break; + } + auto& vdoc = docs[i]; + vdoc.resize(keyidx.size()); + int nbytes = + doc.url.size() + 1 + + doc.mimetype.size() + 1 + + doc.fmtime.size() + 1 + + doc.dmtime.size() + 1 + + doc.fbytes.size() + 1 + + doc.dbytes.size() + 1; + for (const auto& entry : doc.meta) { + if (testentry(entry)) { + if (keyidx.find(entry.first) == keyidx.end()) { + std::cerr << "Unknown key: " << entry.first << "\n"; + abort(); + } + nbytes += entry.second.size() + 1; + } + } + + char *cp = (char*)malloc(nbytes); + if (nullptr == cp) { + abort(); + } + if (i < 2) { + std::cerr << "malloc returned " << (void*)cp << "\n"; + } + +#define STRINGCPCOPY(CHARP, S) do { \ + memcpy(CHARP, S.c_str(), S.size()+1); \ + CHARP += S.size()+1; \ + } while (false); + + vdoc[0] = cp; STRINGCPCOPY(cp, doc.url); + vdoc[1] = cp; STRINGCPCOPY(cp, doc.mimetype); + vdoc[2] = cp; STRINGCPCOPY(cp, doc.fmtime); + vdoc[3] = cp; STRINGCPCOPY(cp, doc.dmtime); + vdoc[4] = cp; 
STRINGCPCOPY(cp, doc.fbytes); + vdoc[5] = cp; STRINGCPCOPY(cp, doc.dbytes); + for (const auto& entry : doc.meta) { + if (testentry(entry)) { + auto it = keyidx.find(entry.first); + if (it == keyidx.end()) { + std::cerr << "Unknown key: " << entry.first << "\n"; + abort(); + } + if (it->second <= 5) { + // Already done ! + continue; + } + vdoc[it->second] = cp; STRINGCPCOPY(cp, entry.second); + } + } + if (i < 2) { + std::cerr << "vdoc[0] " << (void*)vdoc[0] << "\n"; + } + } + + meminfo("After storing"); + for (auto& vdoc : docs) { + if (!vdoc.empty()) { + //std::cerr << "Freeing " << (void*)(vdoc[0]) << "\n"; + free(vdoc[0]); + } + } + meminfo("After free"); + +#elif defined(STORE_ALLOBSTACK) + + // + // Each result stored as a single array of chars with concatenated + // 0-terminated strings and shared key->intidx map. Slow access, + // but should be almost minimal. + // Memory: audio: 14 MB, maindb: 55 MB + // + // This is not really practical because any field access will need + // a linear search inside a single record to find its beginning by + // crossing/counting ending null bytes. + // + // Note: the obstack storage for record strings may have to be + // replaced by continuous storage if it proves too complicated to + // walk. This should not change the volume significantly. + // + // On the audio test index, the total string size computed below + // is 8 MB for 25169 docs, including terminating zeroes for + // non-present fields. + // + // The program uses 3 MB of memory prior to storing the strings, + // meaning that the character data itself uses 11 MB. + // I don't understand where the 3MB of overhead comes from. + // The vector itself is 8 * 25k = 200 KB. + // + // With a chunk size of 1MB, the obstack overhead should be + // negligible. Varying the chunk size seems to indicate it is. 
+ // + // Retested on the main index (150k docs, 30MB of strings) yields + // 55 MB of total storage including 15 MB of prestorage usage, so + // 10MB of unexplained overhead. Strange is that pre-key compute + // is 1 instead of 3 for the audio db?? And why 15 pre-storage + // instead of 3?? + // + std::vector docs; + struct obstack obst; + obstack_init(&obst); + obstack_chunk_size(&obst) = 1024*1024; + obstack_alignment_mask(&obst) = 0; + docs.resize(imax); + meminfo("After resize"); + std::map keyidx { + {"url",0}, + {"mimetype", 1}, + {"fmtime", 2}, + {"dmtime", 3}, + {"fbytes", 4}, + {"dbytes", 5}, + }; + + for (i=0;i keyidx { + {"url",0}, + {"mimetype", 1}, + {"fmtime", 2}, + {"dmtime", 3}, + {"fbytes", 4}, + {"dbytes", 5}, + }; + + int imax = 0; + for (;;imax++) { + Rcl::Doc doc; + if (!query.getDoc(imax, doc, false)) { + break; + } + for (const auto& entry : doc.meta) { + if (testentry(entry)) { + auto it = keyidx.find(entry.first); + if (it == keyidx.end()) { + int idx = keyidx.size(); + keyidx.insert({entry.first, idx}); + }; + } + } + } + // 49 keys ! 
+ std::cerr << "Found " << keyidx.size() << " different keys\n"; + std::vector> docs; + docs.resize(imax); + meminfo("After resize"); + + for (int i = 0; i < imax; i++) { + Rcl::Doc doc; + if (!query.getDoc(i, doc, false)) { + break; + } + auto& vdoc = docs[i]; + vdoc.resize(keyidx.size()); + vdoc[0] = OBSTACK_STRINGCOPY(obst, doc.url); + vdoc[1] = OBSTACK_STRINGCOPY(obst, doc.mimetype); + vdoc[2] = OBSTACK_STRINGCOPY(obst, doc.fmtime); + vdoc[3] = OBSTACK_STRINGCOPY(obst, doc.dmtime); + vdoc[4] = OBSTACK_STRINGCOPY(obst, doc.fbytes); + vdoc[5] = OBSTACK_STRINGCOPY(obst, doc.dbytes); + for (const auto& entry : doc.meta) { + if (testentry(entry)) { + auto it = keyidx.find(entry.first); + if (it == keyidx.end()) { + std::cerr << "Unknown key: " << entry.first << "\n"; + abort(); + } + if (it->second <= 5) { + continue; + } + const char *cp = OBSTACK_STRINGCOPY(obst, entry.second); + vdoc[it->second] = cp; + } + } + } + meminfo("After storing"); + obstack_free(&obst, nullptr); + meminfo("After obstack_free"); +#endif