Control memory usage when deleting documents: use idxflushmb as when adding/updating

This commit is contained in:
Jean-Francois Dockes 2011-09-07 19:11:11 +02:00
parent e3532b0941
commit c5ff0cdf52
6 changed files with 65 additions and 16 deletions

View File

@ -14,12 +14,16 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ */
#include "autoconfig.h"
#include <stdio.h> #include <stdio.h>
#include <signal.h> #include <signal.h>
#include <locale.h> #include <locale.h>
#include <pthread.h> #include <pthread.h>
#include <cstdlib> #include <cstdlib>
#if !defined(PUTENV_ARG_CONST)
#include <string.h>
#endif
#include "debuglog.h" #include "debuglog.h"
#include "rclconfig.h" #include "rclconfig.h"
@ -96,6 +100,18 @@ RclConfig *recollinit(RclInitFlags flags,
// threads don't try to do it at once). // threads don't try to do it at once).
config->getDefCharset(); config->getDefCharset();
int flushmb;
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
LOGDEB(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
flushmb));
static const char *cp = "XAPIAN_FLUSH_THRESHOLD=1000000";
#ifdef PUTENV_ARG_CONST
::putenv(cp);
#else
::putenv(strdup(cp));
#endif
}
return config; return config;
} }

View File

@ -124,8 +124,8 @@ Threshold (megabytes of new text data)
where we flush from memory to disk index. Setting this can where we flush from memory to disk index. Setting this can
help control memory usage. A value of 0 means no explicit help control memory usage. A value of 0 means no explicit
flushing, letting Xapian use its own default, which is flushing, letting Xapian use its own default, which is
flushing every 10000 documents (memory usage depends on flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that
average document size). The default value is 10. memory usage depends on average document size. The default value is 10.
.TP .TP
.BI "filtersdir = " directory .BI "filtersdir = " directory
A directory to search for the external filter scripts used to index some A directory to search for the external filter scripts used to index some

View File

@ -3568,12 +3568,13 @@ skippedPaths = ~/somedir/&lowast;.txt
<varlistentry><term><literal>idxflushmb</literal></term> <varlistentry><term><literal>idxflushmb</literal></term>
<listitem><para>Threshold (megabytes of new text data) <listitem><para>Threshold (megabytes of new text data) where we
where we flush from memory to disk index. Setting this can flush from memory to disk index. Setting this can help control
help control memory usage. A value of 0 means no explicit memory usage. A value of 0 means no explicit flushing, letting
flushing, letting Xapian use its own default, which is Xapian use its own default, which is flushing every 10000 (or
flushing every 10000 documents (memory usage depends on XAPIAN_FLUSH_THRESHOLD) documents, which gives little memory
average document size). The default value is 10.</para> usage control, as memory usage depends on average document
size. The default value is 10.</para>
</listitem> </listitem>
</varlistentry> </varlistentry>

View File

@ -1216,11 +1216,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
} }
// Test if we're over the flush threshold (limit memory usage): // Test if we're over the flush threshold (limit memory usage):
m_curtxtsz += doc.text.length(); maybeflush(doc.text.length());
return true;
}
// Flush when idxflushmb is reached
bool Db::maybeflush(off_t moretext)
{
if (m_flushMb > 0) { if (m_flushMb > 0) {
m_curtxtsz += moretext;
if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) { if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) {
ermsg.erase(); LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n",
LOGDEB(("Db::add: text size >= %d Mb, flushing\n", m_flushMb)); m_flushMb));
string ermsg;
try { try {
m_ndb->xwdb.flush(); m_ndb->xwdb.flush();
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
@ -1231,7 +1239,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
m_flushtxtsz = m_curtxtsz; m_flushtxtsz = m_curtxtsz;
} }
} }
return true; return true;
} }
@ -1386,7 +1393,18 @@ bool Db::purge()
break; break;
} }
} }
try { try {
if (m_flushMb > 0) {
// We use an average term length of 5 for
// estimating the doc sizes which is probably not
// accurate but gives rough consistency with what
// we do for add/update. I should fetch the doc
// size from the data record, but this would be
// bad for performance.
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
maybeflush(trms * 5);
}
m_ndb->xwdb.delete_document(docid); m_ndb->xwdb.delete_document(docid);
LOGDEB(("Db::purge: deleted document #%d\n", docid)); LOGDEB(("Db::purge: deleted document #%d\n", docid));
} catch (const Xapian::DocNotFoundError &) { } catch (const Xapian::DocNotFoundError &) {
@ -1426,6 +1444,10 @@ bool Db::purgeFile(const string &udi, bool *existed)
} }
*existed = true; *existed = true;
LOGDEB(("purgeFile: delete docid %d\n", *docid)); LOGDEB(("purgeFile: delete docid %d\n", *docid));
if (m_flushMb > 0) {
Xapian::termcount trms = m_ndb->xwdb.get_doclength(*docid);
maybeflush(trms * 5);
}
db.delete_document(*docid); db.delete_document(*docid);
vector<Xapian::docid> docids; vector<Xapian::docid> docids;
m_ndb->subDocs(udi, docids); m_ndb->subDocs(udi, docids);
@ -1433,6 +1455,10 @@ bool Db::purgeFile(const string &udi, bool *existed)
for (vector<Xapian::docid>::iterator it = docids.begin(); for (vector<Xapian::docid>::iterator it = docids.begin();
it != docids.end(); it++) { it != docids.end(); it++) {
LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it)); LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it));
if (m_flushMb > 0) {
Xapian::termcount trms = m_ndb->xwdb.get_doclength(*it);
maybeflush(trms * 5);
}
db.delete_document(*it); db.delete_document(*it);
} }
return true; return true;

View File

@ -274,6 +274,9 @@ private:
bool stemExpand(const string &lang, const string &s, bool stemExpand(const string &lang, const string &s,
TermMatchResult& result, int max = -1); TermMatchResult& result, int max = -1);
// Flush when idxflushmb is reached
bool maybeflush(off_t moretext);
/* Copy constructor and assignment private and forbidden */ /* Copy constructor and assignment private and forbidden */
Db(const Db &) {} Db(const Db &) {}
Db& operator=(const Db &) {return *this;}; Db& operator=(const Db &) {return *this;};

View File

@ -70,10 +70,13 @@ dbdir = xapiandb
maxfsoccuppc = 0 maxfsoccuppc = 0
# Threshold (megabytes of new data) where we flush from memory to disk # Threshold (megabytes of new data) where we flush from memory to disk
# index. Setting this (i.e. to 10) can help control memory usage. The default # index. Setting this (i.e. to 10) can help control memory usage.
# value of 0 means no explicit flushing, which lets Xapian perform its own #
# thing, which is flushing every 10000 documents (memory usage depends on # A value of 0 means no explicit flushing, which lets Xapian perform its
# average document size). # own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD documents
# created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an environment
# variable. As memory usage depends on average document size, not only
# document count, this is not very useful.
idxflushmb = 10 idxflushmb = 10
# Place to search for executable filters. If RECOLL_FILTERSDIR is set in # Place to search for executable filters. If RECOLL_FILTERSDIR is set in