/* Copyright (C) 2017-2019 J.F.Dockes * * License: GPL 2.1 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include #include #include #include #define obstack_chunk_alloc malloc #define obstack_chunk_free free #include #define OBSTACK_STRINGCOPY(STK, S) \ (char *)obstack_copy(&STK, S.c_str(), S.size()+1) #include "rclinit.h" #include "rclconfig.h" #include "rcldb.h" #include "searchdata.h" #include "rclquery.h" #include "pathut.h" #include "rclutil.h" #include "wasatorcl.h" #include "log.h" #include "pathut.h" #include "plaintorich.h" #include "hldata.h" #include "smallut.h" #include "qresultstore.h" //const std::string confdir{"/home/dockes/.recoll-prod"}; const std::string confdir{"/var/cache/upmpdcli/uprcl"}; const int MB = 1024 *1024; // Docs as docs #undef STORE_DOCS // Docs as map #undef STORE_MAPS_RAW // Docs as map, shared key storage #undef STORE_MAPS_SHAREDKEYS // Docs as map shared key storage+obstack for data #undef STORE_MAPS_SHAREDKEYS_OBSTACK // Docs as vector shared key-to-idx storage + pointer vector #define STORE_ARRAYS // Minimal, not really usable, needs linear search inside record to find field #undef STORE_ALLOBSTACK static void meminfo(const char *msg) { struct mallinfo minfo = mallinfo(); std::cerr << msg << " : " << minfo.uordblks / MB << " MB\n"; } static inline bool testentry(const std::pair& entry) { #undef ALLENTRIES #ifdef ALLENTRIES return true; #else return !entry.second.empty() && entry.first != "author" && entry.first != "ipath" && entry.first != "rcludi" && entry.first != "relevancyrating" && entry.first != "sig" && entry.first != "abstract" && entry.first != "caption" && entry.first != "filename" && entry.first != "origcharset" && entry.first != "sig"; #endif } int main(int argc, char *argv[]) { std::string reason; RclConfig *rclconfig = recollinit(0, 0, 0, reason, &confdir); if (!rclconfig || !rclconfig->ok()) { std::cerr << "Recoll init failed: " << reason << "\n"; return 1; } meminfo("Before db open"); Rcl::Db rcldb(rclconfig); if (!rcldb.open(Rcl::Db::DbRO)) { std::cerr << "db open error\n"; return 1; } Rcl::Query query(&rcldb); std::shared_ptr sd{wasaStringToRcl( rclconfig, "english", "mime:*", reason)}; if (!sd) { std::cerr << "wasStringToRcl: " << reason << "\n"; return 1; } meminfo("Before setquery"); query.setQuery(sd); int cnt = query.getResCnt(); meminfo("After getResCnt"); std::cerr << "Got " << cnt << " estimated results\n"; #if !defined(STORE_ARRAYS) // Store_arrays needs 2 walks anyway, to find the field names int i = 0; for (;;) { Rcl::Doc doc; if (!query.getDoc(i++, doc, false)) { break; } } int imax = i; meminfo("After getDocs"); std::cerr << "Got " << imax << " docs\n"; #endif // !STORE_ARRAYS #if defined(STORE_DOCS) // // 25 kdocs (music tags) // Each result stored as Rcl::Doc: 85 MB // std::vector docs; docs.resize(imax); meminfo("After resize"); for (i=0;i 58 MB // std::vector> docs; docs.resize(imax); meminfo("After resize"); for (i=0;i" << entry.second <<"\n"; vdoc.insert(entry); } } //std::cerr << "\n"; } #elif defined(STORE_MAPS_SHAREDKEYS) // // Each result stored as map with shared keys: // Memory: audio: 56 MB, main: 221 // std::set keys; std::vector> docs; docs.resize(imax); meminfo("After resize"); std::string cstr_url{"url"}; std::string cstr_mt{"mimetype"}; std::string cstr_fmt{"fmtime"}; std::string cstr_dmt{"dmtime"}; std::string cstr_fb{"fbytes"}; std::string cstr_db{"dbytes"}; for (i=0;i" << entry.second <<"\n"; vdoc.insert({&*it, entry.second}); } } //std::cerr << "\n"; } #elif defined(STORE_MAPS_SHAREDKEYS_OBSTACK) // // Each result stored as map with shared keys and // obstack string storage: // Memory: audio: 54 MB main: 213 MB // std::set keys; std::vector> docs; struct obstack obst; obstack_init(&obst); obstack_chunk_size(&obst) = 1024*1024; docs.resize(imax); meminfo("After resize"); std::string cstr_url{"url"}; std::string cstr_mt{"mimetype"}; std::string cstr_fmt{"fmtime"}; std::string cstr_dmt{"dmtime"}; std::string cstr_fb{"fbytes"}; std::string cstr_db{"dbytes"}; for (i=0;i" << cp <<"\n"; vdoc.insert({&*it, cp}); } } //std::cerr << "\n"; } #elif defined(STORE_ARRAYS) // // Each result stored as a vector with a shared // key->intidx map to store the key name to index mapping, and and // obstack string storage for the values. // Memory: audio: 29 MB. Main index: 95 MB // // On the audio test index, the total string size computed below // is 8 MB for 25169 docs, including terminating zeroes for // non-present fields (see the counting in the last version below). // // Residual mystery: // After the obstack_free, the residual memory usage (audio) is 13 MB. // So 29-13 = 16 MB for the obstack storage. Why?? // the Vector size should be around 25kdocs * 59keys * 8bytes = 12 MB // // On the main db: 151350 docs 16 keys 95 MB. // After free 38 MB. Array is 151k * 16 * 8 = 19 MB // 19 MB missing ?? // // The missing bytes ? probably reflect the immense amount of // allocations performed by the program while building the // array. Valgrind: // HEAP SUMMARY: // in use at exit: 652,298 bytes in 3,033 blocks // total heap usage: 8,613,584 allocs, 8,610,551 frees, // 4,421,628,817 bytes allocated // // Possible optimizations: the record arrays don't really need to // be all pointers, and could use a single char * followed by byte // displacements which could be 32 (or even 16?) bits. This would // gain at least an additional 9 MB, getting the storage size to // around 20 MB for the audio 25k index. This is really much // better than the initial 85 (or even 58 for maps), with a read // performance impact which should be quite modest. // ** This supposes that we don't use obstack though, as obstack // placement is unpredictable. // // This the solution now implemented: no obstack, use struct with offsets // This uses 19 MB of storage for the audio index, and 72 MB for // the main one (less keys->less gain) { Rcl::QResultStore store; bool result = store.storeQuery( query, {"author", "ipath", "rcludi", "relevancyrating", "sig","abstract", "caption", "filename", "origcharset", "sig"}); if (!result) { std::cerr << "storeQuery failed\n"; return 1; } meminfo("After storing"); std::cerr << "url 20 " << store.fieldValue(20, "url") << "\n"; } #elif defined(STORE_ALLOBSTACK) // // Each result stored as a single array of chars with concatenated // 0-terminated strings and shared key->intidx map. Slow access, // but should be almost minimal. // Memory: audio: 14 MB, maindb: 55 MB // // This is not really practical because any field access will need // a linear search inside a single record to find its beginning by // crossing/counting ending null bytes. // // Note: the obstack storage for record strings may have to be // replaced by continuous storage if it proves too complicated to // walk. This should not change the volume significantly. // // On the audio test index, the total string size computed below // is 8 MB for 25169 docs, including terminating zeroes for // non-present fields. // // The program uses 3 MB of memory prior to storing the strings, // meaning that the character data itself uses 11 MB. // I don't understand where the 3MB of overhead comes from. // The vector itself is 8 * 25k = 200 KB. // // With a chunk size of 1MB, the obstack overhead should be // negligible. Varying the chunk size seems to indicate it is. // // Retested on the main index (150k docs, 30MB of strings) yields // 55 MB of total storage including 15 MB of prestorage usage, so // 10MB of unexplained overhead. Strange is that pre-key compute // is 1 instead of 3 for the audio db?? And why 15 pre-storage // instead of 3?? // std::vector docs; struct obstack obst; obstack_init(&obst); obstack_chunk_size(&obst) = 1024*1024; obstack_alignment_mask(&obst) = 0; docs.resize(imax); meminfo("After resize"); std::map keyidx { {"url",0}, {"mimetype", 1}, {"fmtime", 2}, {"dmtime", 3}, {"fbytes", 4}, {"dbytes", 5}, }; for (i=0;i keyidx { {"url",0}, {"mimetype", 1}, {"fmtime", 2}, {"dmtime", 3}, {"fbytes", 4}, {"dbytes", 5}, }; int imax = 0; for (;;imax++) { Rcl::Doc doc; if (!query.getDoc(imax, doc, false)) { break; } for (const auto& entry : doc.meta) { if (testentry(entry)) { auto it = keyidx.find(entry.first); if (it == keyidx.end()) { int idx = keyidx.size(); keyidx.insert({entry.first, idx}); }; } } } // 49 keys ! std::cerr << "Found " << keyidx.size() << " different keys\n"; std::vector> docs; docs.resize(imax); meminfo("After resize"); for (int i = 0; i < imax; i++) { Rcl::Doc doc; if (!query.getDoc(i, doc, false)) { break; } auto& vdoc = docs[i]; vdoc.resize(keyidx.size()); vdoc[0] = OBSTACK_STRINGCOPY(obst, doc.url); vdoc[1] = OBSTACK_STRINGCOPY(obst, doc.mimetype); vdoc[2] = OBSTACK_STRINGCOPY(obst, doc.fmtime); vdoc[3] = OBSTACK_STRINGCOPY(obst, doc.dmtime); vdoc[4] = OBSTACK_STRINGCOPY(obst, doc.fbytes); vdoc[5] = OBSTACK_STRINGCOPY(obst, doc.dbytes); for (const auto& entry : doc.meta) { if (testentry(entry)) { auto it = keyidx.find(entry.first); if (it == keyidx.end()) { std::cerr << "Unknown key: " << entry.first << "\n"; abort(); } if (it->second <= 5) { continue; } const char *cp = OBSTACK_STRINGCOPY(obst, entry.second); vdoc[it->second] = cp; } } } meminfo("After storing"); obstack_free(&obst, nullptr); meminfo("After obstack_free"); #endif