Control memory usage when deleting documents: use idxflushmb as when adding/updating

This commit is contained in:
Jean-Francois Dockes 2011-09-07 19:11:11 +02:00
parent e3532b0941
commit c5ff0cdf52
6 changed files with 65 additions and 16 deletions

View File

@ -14,12 +14,16 @@
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <signal.h>
#include <locale.h>
#include <pthread.h>
#include <cstdlib>
#if !defined(PUTENV_ARG_CONST)
#include <string.h>
#endif
#include "debuglog.h"
#include "rclconfig.h"
@ -96,6 +100,18 @@ RclConfig *recollinit(RclInitFlags flags,
// threads don't try to do it at once).
config->getDefCharset();
int flushmb;
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
LOGDEB(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
flushmb));
static const char *cp = "XAPIAN_FLUSH_THRESHOLD=1000000";
#ifdef PUTENV_ARG_CONST
::putenv(cp);
#else
::putenv(strdup(cp));
#endif
}
return config;
}

View File

@ -124,8 +124,8 @@ Threshold (megabytes of new text data)
where we flush from memory to disk index. Setting this can
help control memory usage. A value of 0 means no explicit
flushing, letting Xapian use its own default, which is
flushing every 10000 documents (memory usage depends on
average document size). The default value is 10.
flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that
memory usage depends on average document size. The default value is 10.
.TP
.BI "filtersdir = " directory
A directory to search for the external filter scripts used to index some

View File

@ -3568,12 +3568,13 @@ skippedPaths = ~/somedir/&lowast;.txt
<varlistentry><term><literal>idxflushmb</literal></term>
<listitem><para>Threshold (megabytes of new text data)
where we flush from memory to disk index. Setting this can
help control memory usage. A value of 0 means no explicit
flushing, letting Xapian use its own default, which is
flushing every 10000 documents (memory usage depends on
average document size). The default value is 10.</para>
<listitem><para>Threshold (megabytes of new text data) where we
flush from memory to disk index. Setting this can help control
memory usage. A value of 0 means no explicit flushing, letting
Xapian use its own default, which is flushing every 10000 (or
XAPIAN_FLUSH_THRESHOLD) documents, which gives little memory
usage control, as memory usage depends on average document
size. The default value is 10.</para>
</listitem>
</varlistentry>

View File

@ -1216,11 +1216,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
}
// Test if we're over the flush threshold (limit memory usage):
m_curtxtsz += doc.text.length();
maybeflush(doc.text.length());
return true;
}
// Flush when idxflushmb is reached
bool Db::maybeflush(off_t moretext)
{
if (m_flushMb > 0) {
m_curtxtsz += moretext;
if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) {
ermsg.erase();
LOGDEB(("Db::add: text size >= %d Mb, flushing\n", m_flushMb));
LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n",
m_flushMb));
string ermsg;
try {
m_ndb->xwdb.flush();
} XCATCHERROR(ermsg);
@ -1231,7 +1239,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
m_flushtxtsz = m_curtxtsz;
}
}
return true;
}
@ -1386,7 +1393,18 @@ bool Db::purge()
break;
}
}
try {
if (m_flushMb > 0) {
// We use an average term length of 5 for
// estimating the doc sizes which is probably not
// accurate but gives rough consistency with what
// we do for add/update. I should fetch the doc
// size from the data record, but this would be
// bad for performance.
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
maybeflush(trms * 5);
}
m_ndb->xwdb.delete_document(docid);
LOGDEB(("Db::purge: deleted document #%d\n", docid));
} catch (const Xapian::DocNotFoundError &) {
@ -1426,6 +1444,10 @@ bool Db::purgeFile(const string &udi, bool *existed)
}
*existed = true;
LOGDEB(("purgeFile: delete docid %d\n", *docid));
if (m_flushMb > 0) {
Xapian::termcount trms = m_ndb->xwdb.get_doclength(*docid);
maybeflush(trms * 5);
}
db.delete_document(*docid);
vector<Xapian::docid> docids;
m_ndb->subDocs(udi, docids);
@ -1433,6 +1455,10 @@ bool Db::purgeFile(const string &udi, bool *existed)
for (vector<Xapian::docid>::iterator it = docids.begin();
it != docids.end(); it++) {
LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it));
if (m_flushMb > 0) {
Xapian::termcount trms = m_ndb->xwdb.get_doclength(*it);
maybeflush(trms * 5);
}
db.delete_document(*it);
}
return true;

View File

@ -274,6 +274,9 @@ private:
bool stemExpand(const string &lang, const string &s,
TermMatchResult& result, int max = -1);
// Flush when idxflushmb is reached
bool maybeflush(off_t moretext);
/* Copy constructor and assignment are private and forbidden */
Db(const Db &) {}
Db& operator=(const Db &) {return *this;};

View File

@ -70,10 +70,13 @@ dbdir = xapiandb
maxfsoccuppc = 0
# Threshold (megabytes of new data) where we flush from memory to disk
# index. Setting this (ie to 10) can help control memory usage. The default
# value of 0 means no explicit flushing, which lets Xapian perform its own
# thing, which is flushing every 10000 documents (memory usage depends on
# average document size).
# index. Setting this (e.g. to 10) can help control memory usage.
#
# A value of 0 means no explicit flushing, which lets Xapian perform its
# own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD documents
# created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an environment
# variable. As memory usage depends on average document size, not only
# document count, this is not very useful.
idxflushmb = 10
# Place to search for executable filters. If RECOLL_FILTERSDIR is set in