diff --git a/src/common/rclinit.cpp b/src/common/rclinit.cpp index 3ea89d1a..c2ec8728 100644 --- a/src/common/rclinit.cpp +++ b/src/common/rclinit.cpp @@ -14,12 +14,16 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include "autoconfig.h" #include #include #include #include #include +#if !defined(PUTENV_ARG_CONST) +#include +#endif #include "debuglog.h" #include "rclconfig.h" @@ -96,6 +100,18 @@ RclConfig *recollinit(RclInitFlags flags, // threads don't try to do it at once). config->getDefCharset(); + int flushmb; + if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) { + LOGDEB(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n", + flushmb)); + static const char *cp = "XAPIAN_FLUSH_THRESHOLD=1000000"; +#ifdef PUTENV_ARG_CONST + ::putenv(cp); +#else + ::putenv(strdup(cp)); +#endif + } + return config; } diff --git a/src/doc/man/recoll.conf.5 b/src/doc/man/recoll.conf.5 index 872b1f9a..f0be5743 100644 --- a/src/doc/man/recoll.conf.5 +++ b/src/doc/man/recoll.conf.5 @@ -124,8 +124,8 @@ Threshold (megabytes of new text data) where we flush from memory to disk index. Setting this can help control memory usage. A value of 0 means no explicit flushing, letting Xapian use its own default, which is -flushing every 10000 documents (memory usage depends on -average document size). The default value is 10. +flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that +memory usage depends on average document size. The default value is 10. .TP .BI "filtersdir = " directory A directory to search for the external filter scripts used to index some diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 865c4c78..4ee90d01 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -3568,12 +3568,13 @@ skippedPaths = ~/somedir/∗.txt idxflushmb - Threshold (megabytes of new text data) - where we flush from memory to disk index. 
Setting this can - help control memory usage. A value of 0 means no explicit - flushing, letting Xapian use its own default, which is - flushing every 10000 documents (memory usage depends on - average document size). The default value is 10. + Threshold (megabytes of new text data) where we + flush from memory to disk index. Setting this can help control + memory usage. A value of 0 means no explicit flushing, letting + Xapian use its own default, which is flushing every 10000 (or + XAPIAN_FLUSH_THRESHOLD) documents, which gives little memory + usage control, as memory usage depends on average document + size. The default value is 10. diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 49ecd6f7..5bd8fb16 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1216,11 +1216,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, } // Test if we're over the flush threshold (limit memory usage): - m_curtxtsz += doc.text.length(); + maybeflush(doc.text.length()); + return true; +} + +// Flush when idxflushmbs is reached +bool Db::maybeflush(off_t moretext) +{ if (m_flushMb > 0) { + m_curtxtsz += moretext; if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) { - ermsg.erase(); - LOGDEB(("Db::add: text size >= %d Mb, flushing\n", m_flushMb)); + LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n", + m_flushMb)); + string ermsg; try { m_ndb->xwdb.flush(); } XCATCHERROR(ermsg); @@ -1231,7 +1239,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, m_flushtxtsz = m_curtxtsz; } } - return true; } @@ -1386,7 +1393,18 @@ bool Db::purge() break; } } + try { + if (m_flushMb > 0) { + // We use an average term length of 5 for + // estimating the doc sizes which is probably not + // accurate but gives rough consistency with what + // we do for add/update. I should fetch the doc + // size from the data record, but this would be + // bad for performance. 
+ Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid); + maybeflush(trms * 5); + } m_ndb->xwdb.delete_document(docid); LOGDEB(("Db::purge: deleted document #%d\n", docid)); } catch (const Xapian::DocNotFoundError &) { @@ -1426,6 +1444,10 @@ bool Db::purgeFile(const string &udi, bool *existed) } *existed = true; LOGDEB(("purgeFile: delete docid %d\n", *docid)); + if (m_flushMb > 0) { + Xapian::termcount trms = m_ndb->xwdb.get_doclength(*docid); + maybeflush(trms * 5); + } db.delete_document(*docid); vector docids; m_ndb->subDocs(udi, docids); @@ -1433,6 +1455,10 @@ bool Db::purgeFile(const string &udi, bool *existed) for (vector::iterator it = docids.begin(); it != docids.end(); it++) { LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it)); + if (m_flushMb > 0) { + Xapian::termcount trms = m_ndb->xwdb.get_doclength(*it); + maybeflush(trms * 5); + } db.delete_document(*it); } return true; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 046e5092..d8f35d4d 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -274,6 +274,9 @@ private: bool stemExpand(const string &lang, const string &s, TermMatchResult& result, int max = -1); + // Flush when idxflushmb is reached + bool maybeflush(off_t moretext); + /* Copyconst and assignemt private and forbidden */ Db(const Db &) {} Db& operator=(const Db &) {return *this;}; diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index cbaeef26..1876fa30 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -70,10 +70,13 @@ dbdir = xapiandb maxfsoccuppc = 0 # Threshold (megabytes of new data) where we flush from memory to disk -# index. Setting this (ie to 10) can help control memory usage. The default -# value of 0 means no explicit flushing, which lets Xapian perform its own -# thing, which is flushing every 10000 documents (memory usage depends on -# average document size). +# index. Setting this (ie to 10) can help control memory usage. 
+# +# A value of 0 means no explicit flushing, which lets Xapian apply its own +# policy: flushing every XAPIAN_FLUSH_THRESHOLD (default 10000) documents +# created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an environment +# variable. As memory usage depends on average document size, not only on +# document count, such a threshold gives little control over memory usage. idxflushmb = 10 # Place to search for executable filters. If RECOLL_FILTERSDIR is set in