Control memory usage when deleting documents: use idxflushmb as when adding/updating
This commit is contained in:
parent
e3532b0941
commit
c5ff0cdf52
@ -14,12 +14,16 @@
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <signal.h>
|
||||
#include <locale.h>
|
||||
#include <pthread.h>
|
||||
#include <cstdlib>
|
||||
#if !defined(PUTENV_ARG_CONST)
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "rclconfig.h"
|
||||
@ -96,6 +100,18 @@ RclConfig *recollinit(RclInitFlags flags,
|
||||
// threads don't try to do it at once).
|
||||
config->getDefCharset();
|
||||
|
||||
int flushmb;
|
||||
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
|
||||
LOGDEB(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
|
||||
flushmb));
|
||||
static const char *cp = "XAPIAN_FLUSH_THRESHOLD=1000000";
|
||||
#ifdef PUTENV_ARG_CONST
|
||||
::putenv(cp);
|
||||
#else
|
||||
::putenv(strdup(cp));
|
||||
#endif
|
||||
}
|
||||
|
||||
return config;
|
||||
}
|
||||
|
||||
|
||||
@ -124,8 +124,8 @@ Threshold (megabytes of new text data)
|
||||
where we flush from memory to disk index. Setting this can
|
||||
help control memory usage. A value of 0 means no explicit
|
||||
flushing, letting Xapian use its own default, which is
|
||||
flushing every 10000 documents (memory usage depends on
|
||||
average document size). The default value is 10.
|
||||
flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that
|
||||
memory usage depends on average document size. The default value is 10.
|
||||
.TP
|
||||
.BI "filtersdir = " directory
|
||||
A directory to search for the external filter scripts used to index some
|
||||
|
||||
@ -3568,12 +3568,13 @@ skippedPaths = ~/somedir/∗.txt
|
||||
|
||||
|
||||
<varlistentry><term><literal>idxflushmb</literal></term>
|
||||
<listitem><para>Threshold (megabytes of new text data)
|
||||
where we flush from memory to disk index. Setting this can
|
||||
help control memory usage. A value of 0 means no explicit
|
||||
flushing, letting Xapian use its own default, which is
|
||||
flushing every 10000 documents (memory usage depends on
|
||||
average document size). The default value is 10.</para>
|
||||
<listitem><para>Threshold (megabytes of new text data) where we
|
||||
flush from memory to disk index. Setting this can help control
|
||||
memory usage. A value of 0 means no explicit flushing, letting
|
||||
Xapian use its own default, which is flushing every 10000 (or
|
||||
XAPIAN_FLUSH_THRESHOLD) documents, which gives little memory
|
||||
usage control, as memory usage depends on average document
|
||||
size. The default value is 10.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
@ -1216,11 +1216,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
}
|
||||
|
||||
// Test if we're over the flush threshold (limit memory usage):
|
||||
m_curtxtsz += doc.text.length();
|
||||
maybeflush(doc.text.length());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Flush when idxflushmb is reached
|
||||
bool Db::maybeflush(off_t moretext)
|
||||
{
|
||||
if (m_flushMb > 0) {
|
||||
m_curtxtsz += moretext;
|
||||
if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) {
|
||||
ermsg.erase();
|
||||
LOGDEB(("Db::add: text size >= %d Mb, flushing\n", m_flushMb));
|
||||
LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n",
|
||||
m_flushMb));
|
||||
string ermsg;
|
||||
try {
|
||||
m_ndb->xwdb.flush();
|
||||
} XCATCHERROR(ermsg);
|
||||
@ -1231,7 +1239,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
m_flushtxtsz = m_curtxtsz;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1386,7 +1393,18 @@ bool Db::purge()
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
if (m_flushMb > 0) {
|
||||
// We use an average term length of 5 for
|
||||
// estimating the doc sizes which is probably not
|
||||
// accurate but gives rough consistency with what
|
||||
// we do for add/update. I should fetch the doc
|
||||
// size from the data record, but this would be
|
||||
// bad for performance.
|
||||
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
|
||||
maybeflush(trms * 5);
|
||||
}
|
||||
m_ndb->xwdb.delete_document(docid);
|
||||
LOGDEB(("Db::purge: deleted document #%d\n", docid));
|
||||
} catch (const Xapian::DocNotFoundError &) {
|
||||
@ -1426,6 +1444,10 @@ bool Db::purgeFile(const string &udi, bool *existed)
|
||||
}
|
||||
*existed = true;
|
||||
LOGDEB(("purgeFile: delete docid %d\n", *docid));
|
||||
if (m_flushMb > 0) {
|
||||
Xapian::termcount trms = m_ndb->xwdb.get_doclength(*docid);
|
||||
maybeflush(trms * 5);
|
||||
}
|
||||
db.delete_document(*docid);
|
||||
vector<Xapian::docid> docids;
|
||||
m_ndb->subDocs(udi, docids);
|
||||
@ -1433,6 +1455,10 @@ bool Db::purgeFile(const string &udi, bool *existed)
|
||||
for (vector<Xapian::docid>::iterator it = docids.begin();
|
||||
it != docids.end(); it++) {
|
||||
LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it));
|
||||
if (m_flushMb > 0) {
|
||||
Xapian::termcount trms = m_ndb->xwdb.get_doclength(*it);
|
||||
maybeflush(trms * 5);
|
||||
}
|
||||
db.delete_document(*it);
|
||||
}
|
||||
return true;
|
||||
|
||||
@ -274,6 +274,9 @@ private:
|
||||
bool stemExpand(const string &lang, const string &s,
|
||||
TermMatchResult& result, int max = -1);
|
||||
|
||||
// Flush when idxflushmb is reached
|
||||
bool maybeflush(off_t moretext);
|
||||
|
||||
/* Copy constructor and assignment are private and forbidden */
|
||||
Db(const Db &) {}
|
||||
Db& operator=(const Db &) {return *this;};
|
||||
|
||||
@ -70,10 +70,13 @@ dbdir = xapiandb
|
||||
maxfsoccuppc = 0
|
||||
|
||||
# Threshold (megabytes of new data) where we flush from memory to disk
|
||||
# index. Setting this (ie to 10) can help control memory usage. The default
|
||||
# value of 0 means no explicit flushing, which lets Xapian perform its own
|
||||
# thing, which is flushing every 10000 documents (memory usage depends on
|
||||
# average document size).
|
||||
# index. Setting this (e.g. to 10) can help control memory usage.
|
||||
#
|
||||
# A value of 0 means no explicit flushing, which lets Xapian perform its
|
||||
# own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD documents
|
||||
# created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an environment
|
||||
# variable. As memory usage depends on average document size, not only
|
||||
# document count, this is not very useful.
|
||||
idxflushmb = 10
|
||||
|
||||
# Place to search for executable filters. If RECOLL_FILTERSDIR is set in
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user