diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py
index 821dcff0..810fd10a 100644
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@@ -17,6 +17,13 @@ class RclExecM:
     def __init__(self):
         self.myname = os.path.basename(sys.argv[0])
         self.mimetype = ""
+        # Archive member size limit, set by the indexer in KB. Fall back to
+        # 50 MB when the variable is unset, empty, zero or not a number.
+        try:
+            maxkb = int(os.environ.get("RECOLL_FILTER_MAXMEMBERKB", 0))
+        except ValueError:
+            maxkb = 0
+        self.maxmembersize = maxkb * 1024 if maxkb else 50 * 1024 * 1024
 
     def rclog(self, s, doexit = 0, exitvalue = 1):
         print >> sys.stderr, "RCLMFILT:", self.myname, ":", s
diff --git a/src/filters/rclrar b/src/filters/rclrar
index 970442c1..aa10d602 100755
--- a/src/filters/rclrar
+++ b/src/filters/rclrar
@@ -49,7 +49,7 @@ class RarExtractor:
 
         if not isdir:
             try:
-                if rarinfo.file_size > 50 * 1024 * 1024:
+                if rarinfo.file_size > self.em.maxmembersize:
                     self.em.rclog("extractone: entry %s size %d too big" %
                                   (ipath, rarinfo.file_size))
                     docdata = ""
diff --git a/src/filters/rcltar b/src/filters/rcltar
index cb2dec27..7e285a0f 100755
--- a/src/filters/rcltar
+++ b/src/filters/rcltar
@@ -24,7 +24,7 @@ class TarExtractor:
         docdata = ""
         try:
             info = self.tar.getmember(ipath)
-            if info.size > 50 * 1024 * 1024:
+            if info.size > self.em.maxmembersize:
                 # skip
                 docdata = ""
                 self.em.rclog("extractone: entry %s size %d too big" %
diff --git a/src/filters/rclzip b/src/filters/rclzip
index 0e8311dd..d9ea904f 100755
--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@@ -40,14 +40,14 @@ class ZipExtractor:
     def __init__(self, em):
         self.currentindex = 0
         self.em = em
-    
+
     def extractone(self, ipath):
         #self.em.rclog("extractone: [%s]" % ipath)
         docdata = ""
         try:
             info = self.zip.getinfo(ipath)
             # There could be a 4GB Iso in the zip. We have to set a limit
-            if info.file_size > 50 * 1024*1024:
+            if info.file_size > self.em.maxmembersize:
                 self.em.rclog("extractone: entry %s size %d too big" %
                               (ipath, info.file_size))
                 docdata = ""
diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp
index 7ce52f5d..55d5644b 100644
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@@ -73,6 +73,10 @@ bool MimeHandlerExec::next_document()
         LOGDEB(("MimeHandlerExec::next_document(): helper known missing\n"));
         return false;
     }
+
+    int filtermaxseconds = 900;
+    m_config->getConfParam("filtermaxseconds", &filtermaxseconds);
+
     if (params.empty()) {
         // Hu ho
         LOGERR(("MimeHandlerExec::mkDoc: empty params\n"));
@@ -80,15 +84,11 @@ bool MimeHandlerExec::next_document()
         return false;
     }
 
-    int filtermaxseconds = 900;
-    m_config->getConfParam("filtermaxseconds", &filtermaxseconds);
-
     // Command name
     string cmd = params.front();
 
     // Build parameter vector: delete cmd name and add the file name
-    vector<string>::iterator it = params.begin();
-    vector<string>myparams(++it, params.end());
+    vector<string>myparams(params.begin() + 1, params.end());
     myparams.push_back(m_fn);
     if (!m_ipath.empty())
         myparams.push_back(m_ipath);
@@ -147,13 +147,18 @@ void MimeHandlerExec::finaldetails()
 {
     m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
 
-    // cfgFilterOutputCharset comes from the mimeconf filter definition line
+    // cfgFilterOutputCharset comes from the mimeconf filter
+    // definition line. If the value is "default", we use the charset
+    // value defined in recoll.conf (which may vary depending on
+    // directory)
     string& charset = m_metaData[cstr_dj_keycharset];
     charset = cfgFilterOutputCharset.empty() ? "UTF-8" :
         cfgFilterOutputCharset;
     if (!stringlowercmp("default", charset)) {
         charset = m_dfltInputCharset;
     }
+    // The output mime type is html except if defined otherwise in the filter
+    // definition.
     string& mt = m_metaData[cstr_dj_keymt];
     mt = cfgFilterOutputMtype.empty() ? "text/html" :
         cfgFilterOutputMtype;
diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp
index 29439e83..de0badf9 100644
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@@ -50,13 +50,18 @@ bool MimeHandlerExecMultiple::startCmd()
     // Command name
     string cmd = params.front();
 
-    // Build parameter list: delete cmd name
-    vector<string>::iterator it = params.begin();
-    vector<string>myparams(++it, params.end());
+    m_maxmemberkb = 50000;
+    m_config->getConfParam("maxmemberkb", &m_maxmemberkb);
+    ostringstream oss;
+    oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb;
+    m_cmd.putenv(oss.str());
 
-    // Start filter
     m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
                  "RECOLL_FILTER_FORPREVIEW=no");
+
+    // Build parameter list: delete cmd name
+    vector<string>myparams(params.begin() + 1, params.end());
+
     if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
         m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
         missingHelper = true;
@@ -116,7 +121,11 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
         return false;
     }
     LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
-    
+    if (len / 1024 > m_maxmemberkb) {
+        LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
+        return false;
+    }
+
     // Hack: check for 'Document:' and read directly the document data
     // to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky
     // piece
@@ -297,7 +306,6 @@ bool MimeHandlerExecMultiple::next_document()
         (void)txtdcode("mh_execm");
     }
 
-
     if (eofnext_received)
         m_havedoc = false;
 
diff --git a/src/internfile/mh_execm.h b/src/internfile/mh_execm.h
index 779613d7..eec824b8 100644
--- a/src/internfile/mh_execm.h
+++ b/src/internfile/mh_execm.h
@@ -107,6 +107,7 @@ private:
     bool startCmd();
     bool readDataElement(string& name, string& data);
     bool m_filefirst;
+    int m_maxmemberkb;
 };
 
 #endif /* _MH_EXECM_H_INCLUDED_ */
diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp
index fd1a1872..1ac0890a 100644
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@@ -590,8 +590,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
         return;
     }
 
-    // We are dealing with an inline part of text/plain or text/html type
-
+    // We are dealing with an inline part of text/plain or text/html
+    // type. There may be several such parts, which is why we don't
+    // just return a text or html subdoc and let the filter stack
+    // work: we want to concatenate them in place instead
     LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
              doc->getBodyStartOffset(), doc->getBodyLength()));
 
diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in
index f9e5961e..231a9249 100644
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@@ -159,6 +159,12 @@ indexallfilenames = 1
 #
 # indexedmimetypes = text/html application/pdf
 
+#
+# Size limit for archive members. This is passed to the filters in the
+# environment as RECOLL_FILTER_MAXMEMBERKB
+#
+maxmemberkb = 50000
+
 # Size limit for compressed files. We need to decompress these in a
 # temporary directory for identification, which can be wasteful in some
 # cases. Limit the waste. Negative means no limit. 0 results in no