implemented maxmemberkb limit for multidoc (e.g. archive) members

2012-10-06 09:05:35 +02:00 · 2012-10-06 09:05:35 +02:00 · 29fe1e4927
commit 29fe1e4927
parent 1329265b7b
9 changed files with 45 additions and 18 deletions
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@ -17,6 +17,11 @@ class RclExecM:
    def __init__(self):
        self.myname = os.path.basename(sys.argv[0])
        self.mimetype = ""
        self.maxmembersize = int(os.environ.get("RECOLL_FILTER_MAXMEMBERKB"))
        if self.maxmembersize:
            self.maxmembersize = self.maxmembersize * 1024
        else:
            self.maxmembersize = 50 * 1024 * 1024
    def rclog(self, s, doexit = 0, exitvalue = 1):
        print >> sys.stderr, "RCLMFILT:", self.myname, ":", s
--- a/src/filters/rclrar
+++ b/src/filters/rclrar
@ -49,7 +49,7 @@ class RarExtractor:
        if not isdir:
            try:
-                if rarinfo.file_size > 50 * 1024 * 1024:
+                if rarinfo.file_size > self.em.maxmembersize:
                    self.em.rclog("extractone: entry %s size %d too big" %
                              (ipath, rarinfo.file_size))
                    docdata = ""
--- a/src/filters/rcltar
+++ b/src/filters/rcltar
@ -24,7 +24,7 @@ class TarExtractor:
        docdata = ""
        try:
            info = self.tar.getmember(ipath)
-            if info.size > 50 * 1024 * 1024:
+            if info.size > self.em.maxmembersize:
                # skip
                docdata = ""
                self.em.rclog("extractone: entry %s size %d too big" %
--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@ -47,7 +47,7 @@ class ZipExtractor:
        try:
            info = self.zip.getinfo(ipath)
            # There could be a 4GB Iso in the zip. We have to set a limit
-            if info.file_size > 50 * 1024*1024:
+            if info.file_size > self.em.maxmembersize:
                self.em.rclog("extractone: entry %s size %d too big" %
                              (ipath, info.file_size))
                docdata = ""
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -73,6 +73,10 @@ bool MimeHandlerExec::next_document()
 	LOGDEB(("MimeHandlerExec::next_document(): helper known missing\n"));
 	return false;
    }
    int filtermaxseconds = 900;
    m_config->getConfParam("filtermaxseconds", &filtermaxseconds);
    if (params.empty()) {
 	// Hu ho
 	LOGERR(("MimeHandlerExec::mkDoc: empty params\n"));
@ -80,15 +84,11 @@ bool MimeHandlerExec::next_document()
 	return false;
    }
    int filtermaxseconds = 900;
    m_config->getConfParam("filtermaxseconds", &filtermaxseconds);
    // Command name
    string cmd = params.front();
    // Build parameter vector: delete cmd name and add the file name
-    vector<string>::iterator it = params.begin();
+    vector<string>myparams(params.begin() + 1, params.end());
    vector<string>myparams(++it, params.end());
    myparams.push_back(m_fn);
    if (!m_ipath.empty())
 	myparams.push_back(m_ipath);
@ -147,13 +147,18 @@ void MimeHandlerExec::finaldetails()
 {
    m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
-    // cfgFilterOutputCharset comes from the mimeconf filter definition line
+    // cfgFilterOutputCharset comes from the mimeconf filter
    // definition line. If the value is "default", we use the charset
    // value defined in recoll.conf (which may vary depending on
    // directory)
    string& charset = m_metaData[cstr_dj_keycharset];
    charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
    if (!stringlowercmp("default", charset)) {
 	charset = m_dfltInputCharset;
    }
    // The output mime type is html except if defined otherwise in the filter
    // definition.
    string& mt = m_metaData[cstr_dj_keymt];
    mt = cfgFilterOutputMtype.empty() ? "text/html" : 
 	cfgFilterOutputMtype;
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@ -50,13 +50,18 @@ bool MimeHandlerExecMultiple::startCmd()
    // Command name
    string cmd = params.front();
-    // Build parameter list: delete cmd name
+    m_maxmemberkb = 50000;
-    vector<string>::iterator it = params.begin();
+    m_config->getConfParam("maxmemberkb", &m_maxmemberkb);
-    vector<string>myparams(++it, params.end());
+    ostringstream oss;
    oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb;
    m_cmd.putenv(oss.str());
    // Start filter
    m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
 		"RECOLL_FILTER_FORPREVIEW=no");
    // Build parameter list: delete cmd name
    vector<string>myparams(params.begin() + 1, params.end());
    if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
        m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
        missingHelper = true;
@ -116,6 +121,10 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
        return false;
    }
    LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
    if (len / 1024 > m_maxmemberkb) {
        LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
        return false;
    }
    // Hack: check for 'Document:' and read directly the document data
    // to m_metaData[cstr_dj_keycontent] to avoid an extra copy of the bulky
@ -297,7 +306,6 @@ bool MimeHandlerExecMultiple::next_document()
 	(void)txtdcode("mh_execm");
    }
    if (eofnext_received)
        m_havedoc = false;
--- a/src/internfile/mh_execm.h
+++ b/src/internfile/mh_execm.h
@ -107,6 +107,7 @@ private:
    bool startCmd();
    bool readDataElement(string& name, string& data);
    bool m_filefirst;
    int  m_maxmemberkb;
 };
 #endif /* _MH_EXECM_H_INCLUDED_ */
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -590,8 +590,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	return;
    }
-    // We are dealing with an inline part of text/plain or text/html type
+    // We are dealing with an inline part of text/plain or text/html
-
+    // type. There may be several such parts, which is why we don't
    // just return a text or html subdoc and let the filter stack
    // work: we want to concatenate them in place instead
    LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
 	     doc->getBodyStartOffset(), doc->getBodyLength()));
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@ -159,6 +159,12 @@ indexallfilenames = 1
 #
 # indexedmimetypes = text/html application/pdf
 #
 # Size limit for archive members, in kilobytes. This is passed to the
 # filters in the environment as RECOLL_FILTER_MAXMEMBERKB.
 # 
 maxmemberkb = 50000
 # Size limit for compressed files. We need to decompress these in a
 # temporary directory for identification, which can be wasteful in some
 # cases. Limit the waste. Negative means no limit. 0 results in no