implemented maxmemberkb limit for multidoc (e.g. archive) members

This commit is contained in:
Jean-Francois Dockes 2012-10-06 09:05:35 +02:00
parent 1329265b7b
commit 29fe1e4927
9 changed files with 45 additions and 18 deletions

View File

@ -17,6 +17,11 @@ class RclExecM:
def __init__(self): def __init__(self):
self.myname = os.path.basename(sys.argv[0]) self.myname = os.path.basename(sys.argv[0])
self.mimetype = "" self.mimetype = ""
# Per-member size limit, in bytes. The indexer passes the limit in KB
# through the RECOLL_FILTER_MAXMEMBERKB environment variable. Fall back
# to 50 MB when the variable is unset, empty, not a number, or 0 (for
# example when the filter is run by hand) instead of failing in int().
try:
    self.maxmembersize = int(os.environ.get("RECOLL_FILTER_MAXMEMBERKB", "0"))
except ValueError:
    self.maxmembersize = 0
if self.maxmembersize:
    self.maxmembersize = self.maxmembersize * 1024
else:
    self.maxmembersize = 50 * 1024 * 1024
def rclog(self, s, doexit = 0, exitvalue = 1): def rclog(self, s, doexit = 0, exitvalue = 1):
print >> sys.stderr, "RCLMFILT:", self.myname, ":", s print >> sys.stderr, "RCLMFILT:", self.myname, ":", s

View File

@ -49,7 +49,7 @@ class RarExtractor:
if not isdir: if not isdir:
try: try:
if rarinfo.file_size > 50 * 1024 * 1024: if rarinfo.file_size > self.em.maxmembersize:
self.em.rclog("extractone: entry %s size %d too big" % self.em.rclog("extractone: entry %s size %d too big" %
(ipath, rarinfo.file_size)) (ipath, rarinfo.file_size))
docdata = "" docdata = ""

View File

@ -24,7 +24,7 @@ class TarExtractor:
docdata = "" docdata = ""
try: try:
info = self.tar.getmember(ipath) info = self.tar.getmember(ipath)
if info.size > 50 * 1024 * 1024: if info.size > self.em.maxmembersize:
# skip # skip
docdata = "" docdata = ""
self.em.rclog("extractone: entry %s size %d too big" % self.em.rclog("extractone: entry %s size %d too big" %

View File

@ -47,7 +47,7 @@ class ZipExtractor:
try: try:
info = self.zip.getinfo(ipath) info = self.zip.getinfo(ipath)
# There could be a 4GB Iso in the zip. We have to set a limit # There could be a 4GB Iso in the zip. We have to set a limit
if info.file_size > 50 * 1024*1024: if info.file_size > self.em.maxmembersize:
self.em.rclog("extractone: entry %s size %d too big" % self.em.rclog("extractone: entry %s size %d too big" %
(ipath, info.file_size)) (ipath, info.file_size))
docdata = "" docdata = ""

View File

@ -73,6 +73,10 @@ bool MimeHandlerExec::next_document()
LOGDEB(("MimeHandlerExec::next_document(): helper known missing\n")); LOGDEB(("MimeHandlerExec::next_document(): helper known missing\n"));
return false; return false;
} }
int filtermaxseconds = 900;
m_config->getConfParam("filtermaxseconds", &filtermaxseconds);
if (params.empty()) { if (params.empty()) {
// Hu ho // Hu ho
LOGERR(("MimeHandlerExec::mkDoc: empty params\n")); LOGERR(("MimeHandlerExec::mkDoc: empty params\n"));
@ -80,15 +84,11 @@ bool MimeHandlerExec::next_document()
return false; return false;
} }
int filtermaxseconds = 900;
m_config->getConfParam("filtermaxseconds", &filtermaxseconds);
// Command name // Command name
string cmd = params.front(); string cmd = params.front();
// Build parameter vector: delete cmd name and add the file name // Build parameter vector: delete cmd name and add the file name
vector<string>::iterator it = params.begin(); vector<string>myparams(params.begin() + 1, params.end());
vector<string>myparams(++it, params.end());
myparams.push_back(m_fn); myparams.push_back(m_fn);
if (!m_ipath.empty()) if (!m_ipath.empty())
myparams.push_back(m_ipath); myparams.push_back(m_ipath);
@ -147,13 +147,18 @@ void MimeHandlerExec::finaldetails()
{ {
m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
// cfgFilterOutputCharset comes from the mimeconf filter definition line // cfgFilterOutputCharset comes from the mimeconf filter
// definition line. If the value is "default", we use the charset
// value defined in recoll.conf (which may vary depending on
// directory).
string& charset = m_metaData[cstr_dj_keycharset]; string& charset = m_metaData[cstr_dj_keycharset];
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset; charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) { if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset; charset = m_dfltInputCharset;
} }
// The output mime type is html except if defined otherwise in the filter
// definition.
string& mt = m_metaData[cstr_dj_keymt]; string& mt = m_metaData[cstr_dj_keymt];
mt = cfgFilterOutputMtype.empty() ? "text/html" : mt = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype; cfgFilterOutputMtype;

View File

@ -50,13 +50,18 @@ bool MimeHandlerExecMultiple::startCmd()
// Command name // Command name
string cmd = params.front(); string cmd = params.front();
// Build parameter list: delete cmd name m_maxmemberkb = 50000;
vector<string>::iterator it = params.begin(); m_config->getConfParam("maxmemberkb", &m_maxmemberkb);
vector<string>myparams(++it, params.end()); ostringstream oss;
oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb;
m_cmd.putenv(oss.str());
// Start filter
m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" : m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
"RECOLL_FILTER_FORPREVIEW=no"); "RECOLL_FILTER_FORPREVIEW=no");
// Build parameter list: delete cmd name
vector<string>myparams(params.begin() + 1, params.end());
if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) { if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd; m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
missingHelper = true; missingHelper = true;
@ -116,6 +121,10 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
return false; return false;
} }
LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len)); LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
if (len / 1024 > m_maxmemberkb) {
LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
return false;
}
// Hack: check for 'Document:' and read directly the document data // Hack: check for 'Document:' and read directly the document data
// to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky // to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky
@ -297,7 +306,6 @@ bool MimeHandlerExecMultiple::next_document()
(void)txtdcode("mh_execm"); (void)txtdcode("mh_execm");
} }
if (eofnext_received) if (eofnext_received)
m_havedoc = false; m_havedoc = false;

View File

@ -107,6 +107,7 @@ private:
bool startCmd(); bool startCmd();
bool readDataElement(string& name, string& data); bool readDataElement(string& name, string& data);
bool m_filefirst; bool m_filefirst;
int m_maxmemberkb;
}; };
#endif /* _MH_EXECM_H_INCLUDED_ */ #endif /* _MH_EXECM_H_INCLUDED_ */

View File

@ -590,8 +590,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
return; return;
} }
// We are dealing with an inline part of text/plain or text/html type // We are dealing with an inline part of text/plain or text/html
// type. There may be several such parts, which is why we don't
// just return a text or html subdoc and let the filter stack
// work: we want to concatenate them in place instead.
LOGDEB2(("walkmime: final: body start offset %d, length %d\n", LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
doc->getBodyStartOffset(), doc->getBodyLength())); doc->getBodyStartOffset(), doc->getBodyLength()));

View File

@ -159,6 +159,12 @@ indexallfilenames = 1
# #
# indexedmimetypes = text/html application/pdf # indexedmimetypes = text/html application/pdf
#
# Size limit for archive members, in kilobytes. This is passed to the
# filters in the environment as RECOLL_FILTER_MAXMEMBERKB.
#
maxmemberkb = 50000
# Size limit for compressed files. We need to decompress these in a # Size limit for compressed files. We need to decompress these in a
# temporary directory for identification, which can be wasteful in some # temporary directory for identification, which can be wasteful in some
# cases. Limit the waste. Negative means no limit. 0 results in no # cases. Limit the waste. Negative means no limit. 0 results in no