diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 30708e11..534de783 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -34,14 +34,6 @@ using namespace std; -MimeHandlerExec::MimeHandlerExec(RclConfig *cnf, const std::string& id) - : RecollFilter(cnf, id), missingHelper(false), m_filtermaxseconds(900), - m_filtermaxmbytes(0) -{ - m_config->getConfParam("filtermaxseconds", &m_filtermaxseconds); - m_config->getConfParam("filtermaxmbytes", &m_filtermaxmbytes); -} - MEAdv::MEAdv(int maxsecs) : m_filtermaxseconds(maxsecs) { @@ -55,10 +47,11 @@ void MEAdv::reset() void MEAdv::newData(int n) { - LOGDEB2("MHExec:newData(" << (n) << ")\n" ); + LOGDEB2("MHExec:newData(" << n << ")\n"); if (m_filtermaxseconds > 0 && time(0L) - m_start > m_filtermaxseconds) { - LOGERR("MimeHandlerExec: filter timeout (" << (m_filtermaxseconds) << " S)\n" ); + LOGERR("MimeHandlerExec: filter timeout (" << m_filtermaxseconds << + " S)\n"); throw HandlerTimeout(); } // If a cancel request was set by the signal handler (or by us @@ -67,9 +60,65 @@ void MEAdv::newData(int n) CancelCheck::instance().checkCancel(); } + +MimeHandlerExec::MimeHandlerExec(RclConfig *cnf, const std::string& id) + : RecollFilter(cnf, id), missingHelper(false), m_filtermaxseconds(900), + m_filtermaxmbytes(0), m_handlernomd5(false), m_hnomd5init(false), + m_nomd5(false) +{ + m_config->getConfParam("filtermaxseconds", &m_filtermaxseconds); + m_config->getConfParam("filtermaxmbytes", &m_filtermaxmbytes); +} + +bool MimeHandlerExec::set_document_file_impl(const std::string& mt, + const std::string &file_path) +{ + // Can't do this in constructor as script name not set yet. Do it + // once on first call + unordered_set nomd5tps; + bool tpsread(false); + + if (false == m_hnomd5init) { + m_hnomd5init = true; + if (m_config->getConfParam("nomd5types", &nomd5tps)) { + tpsread = true; + if (!nomd5tps.empty()) { + if (params.size() && + nomd5tps.find(path_getsimple(params[0])) != + nomd5tps.end()) { + m_handlernomd5 = true; + } + // On windows the 1st param is often a script interp + // name (e.g. "python", and the script name is 2nd + if (params.size() > 1 && + nomd5tps.find(path_getsimple(params[1])) != + nomd5tps.end()) { + m_handlernomd5 = true; + } + } + } + } + + m_nomd5 = m_handlernomd5; + + if (!m_nomd5) { + // Check for MIME type based md5 suppression + if (!tpsread) { + m_config->getConfParam("nomd5types", &nomd5tps); + } + if (nomd5tps.find(mt) != nomd5tps.end()) { + m_nomd5 = true; + } + } + + m_fn = file_path; + m_havedoc = true; + return true; +} + bool MimeHandlerExec::skip_to_document(const string& ipath) { - LOGDEB("MimeHandlerExec:skip_to_document: [" << (ipath) << "]\n" ); + LOGDEB("MimeHandlerExec:skip_to_document: [" << ipath << "]\n"); m_ipath = ipath; return true; } @@ -82,13 +131,13 @@ bool MimeHandlerExec::next_document() return false; m_havedoc = false; if (missingHelper) { - LOGDEB("MimeHandlerExec::next_document(): helper known missing\n" ); + LOGDEB("MimeHandlerExec::next_document(): helper known missing\n"); return false; } if (params.empty()) { // Hu ho - LOGERR("MimeHandlerExec::mkDoc: empty params\n" ); + LOGERR("MimeHandlerExec::next_document: empty params\n"); m_reason = "RECFILTERROR BADCONFIG"; return false; } @@ -110,7 +159,7 @@ bool MimeHandlerExec::next_document() mexec.setAdvise(&adv); mexec.putenv("RECOLL_CONFDIR", m_config->getConfDir()); mexec.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" : - "RECOLL_FILTER_FORPREVIEW=no"); + "RECOLL_FILTER_FORPREVIEW=no"); mexec.setrlimit_as(m_filtermaxmbytes); int status; @@ -125,7 +174,8 @@ bool MimeHandlerExec::next_document() } if (status) { - LOGERR("MimeHandlerExec: command status 0x" << (status) << " for " << (cmd) << "\n" ); + LOGERR("MimeHandlerExec: command status 0x" << status << " for " << + cmd << "\n"); if (WIFEXITED(status) && WEXITSTATUS(status) == 127) { // That's how execmd signals a failed exec (most probably // a missing command). Let'hope no filter uses the same value as @@ -188,12 +238,13 @@ void MimeHandlerExec::finaldetails() m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : cfgFilterOutputMtype; - if (!m_forPreview) { + if (!m_forPreview && !m_nomd5) { string md5, xmd5, reason; if (MD5File(m_fn, md5, &reason)) { m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); } else { - LOGERR("MimeHandlerExec: cant compute md5 for [" << (m_fn) << "]: " << (reason) << "\n" ); + LOGERR("MimeHandlerExec: cant compute md5 for [" << m_fn << "]: " << + reason << "\n"); } } diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h index 15e359b4..fdc595c6 100644 --- a/src/internfile/mh_exec.h +++ b/src/internfile/mh_exec.h @@ -62,14 +62,6 @@ class MimeHandlerExec : public RecollFilter { MimeHandlerExec(RclConfig *cnf, const std::string& id); - virtual bool set_document_file(const std::string& mt, - const std::string &file_path) { - RecollFilter::set_document_file(mt, file_path); - m_fn = file_path; - m_havedoc = true; - return true; - } - virtual bool next_document(); virtual bool skip_to_document(const std::string& ipath); @@ -80,9 +72,17 @@ class MimeHandlerExec : public RecollFilter { } protected: + virtual bool set_document_file_impl(const std::string& mt, + const std::string& file_path); + std::string m_fn; std::string m_ipath; - + // md5 computation excluded by handler name: can't change after init + bool m_handlernomd5; + bool m_hnomd5init; + // If md5 not excluded by handler name, allow/forbid depending on mime + bool m_nomd5; + // Set up the character set metadata fields and possibly transcode // text/plain output. // @param charset when called from mh_execm, a possible explicit diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 2625d87e..7066e53a 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -178,7 +178,7 @@ bool MimeHandlerExecMultiple::next_document() ostringstream obuf; string file_md5; if (m_filefirst) { - if (!m_forPreview) { + if (!m_forPreview && !m_nomd5) { string md5, xmd5, reason; if (MD5File(m_fn, md5, &reason)) { file_md5 = MD5HexPrint(md5, xmd5); diff --git a/src/internfile/mh_execm.h b/src/internfile/mh_execm.h index e5f2bd8c..36b8a7d3 100644 --- a/src/internfile/mh_execm.h +++ b/src/internfile/mh_execm.h @@ -102,22 +102,27 @@ class MimeHandlerExecMultiple : public MimeHandlerExec { /////// End un-cleared stuff. public: - MimeHandlerExecMultiple(RclConfig *cnf, const string& id) - : MimeHandlerExec(cnf, id) - {} + MimeHandlerExecMultiple(RclConfig *cnf, const std::string& id) + : MimeHandlerExec(cnf, id) { + } // No resources to clean up, the ExecCmd destructor does it. virtual ~MimeHandlerExecMultiple() {} - virtual bool set_document_file(const string& mt, const string &file_path) { - m_filefirst = true; - return MimeHandlerExec::set_document_file(mt, file_path); - } + virtual bool next_document(); // skip_to and clear inherited from MimeHandlerExec +protected: + // This is the only 2nd-level derived handler class. Use call-super. + virtual bool set_document_file_impl(const std::string& mt, + const std::string &file_path) { + m_filefirst = true; + return MimeHandlerExec::set_document_file_impl(mt, file_path); + } + private: bool startCmd(); - bool readDataElement(string& name, string& data); + bool readDataElement(std::string& name, std::string& data); bool m_filefirst; int m_maxmemberkb; MEAdv m_adv; diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index bc0ec594..96a0f340 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -34,23 +34,21 @@ using namespace std; #endif /* NO_NAMESPACES */ -bool MimeHandlerHtml::set_document_file(const string& mt, const string &fn) +bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn) { - LOGDEB0("textHtmlToDoc: " << (fn) << "\n" ); - RecollFilter::set_document_file(mt, fn); + LOGDEB0("textHtmlToDoc: " << fn << "\n"); string otext; if (!file_to_string(fn, otext)) { - LOGINFO("textHtmlToDoc: cant read: " << (fn) << "\n" ); + LOGINFO("textHtmlToDoc: cant read: " << fn << "\n"); return false; } m_filename = fn; return set_document_string(mt, otext); } -bool MimeHandlerHtml::set_document_string(const string& mt, - const string& htext) +bool MimeHandlerHtml::set_document_string_impl(const string& mt, + const string& htext) { - RecollFilter::set_document_string(mt, htext); m_html = htext; m_havedoc = true; @@ -73,12 +71,14 @@ bool MimeHandlerHtml::next_document() m_filename.erase(); string charset = m_dfltInputCharset; - LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << (charset) << "]\n" ); + LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset + << "]\n"); // Override default input charset if someone took care to set one: map::const_iterator it = m_metaData.find(cstr_dj_keycharset); if (it != m_metaData.end() && !it->second.empty()) { charset = it->second; - LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << (charset) << "]\n" ); + LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << + charset << "]\n"); } // - We first try to convert from the supposed charset @@ -91,13 +91,15 @@ bool MimeHandlerHtml::next_document() MyHtmlParser result; for (int pass = 0; pass < 2; pass++) { string transcoded; - LOGDEB("Html::mkDoc: pass " << (pass) << "\n" ); + LOGDEB("Html::mkDoc: pass " << pass << "\n"); MyHtmlParser p; // Try transcoding. If it fails, use original text. int ecnt; if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { - LOGDEB("textHtmlToDoc: transcode failed from cs '" << (charset) << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << "]" ); + LOGDEB("textHtmlToDoc: transcode failed from cs '" << + charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << + "]"); transcoded = m_html; // We don't know the charset, at all p.reset_charsets(); @@ -105,9 +107,11 @@ bool MimeHandlerHtml::next_document() } else { if (ecnt) { if (pass == 0) { - LOGDEB("textHtmlToDoc: init transcode had " << (ecnt) << " errors for [" << (fn.empty()?"unknown":fn) << "]\n" ); + LOGDEB("textHtmlToDoc: init transcode had " << ecnt << + " errors for ["<<(fn.empty()?"unknown":fn)<< "]\n"); } else { - LOGERR("textHtmlToDoc: final transcode had " << (ecnt) << " errors for [" << (fn.empty()?"unknown":fn) << "]\n" ); + LOGERR("textHtmlToDoc: final transcode had " << ecnt << + " errors for ["<< (fn.empty()?"unknown":fn)<< "]\n"); } } // charset has the putative source charset, transcoded is now @@ -145,15 +149,16 @@ bool MimeHandlerHtml::next_document() break; } - LOGDEB("textHtmlToDoc: charset [" << (charset) << "] doc charset [" << (result.get_charset()) << "]\n" ); + LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<< + result.get_charset() << "]\n"); if (!result.get_charset().empty() && !samecharset(result.get_charset(), result.fromcharset)) { - LOGDEB("textHtmlToDoc: reparse for charsets\n" ); + LOGDEB("textHtmlToDoc: reparse for charsets\n"); // Set the origin charset as specified in document before // transcoding again charset = result.get_charset(); } else { - LOGERR("textHtmlToDoc:: error: non charset exception\n" ); + LOGERR("textHtmlToDoc:: error: non charset exception\n"); return false; } } diff --git a/src/internfile/mh_html.h b/src/internfile/mh_html.h index d6335413..a4f21776 100644 --- a/src/internfile/mh_html.h +++ b/src/internfile/mh_html.h @@ -26,22 +26,18 @@ */ class MimeHandlerHtml : public RecollFilter { public: - MimeHandlerHtml(RclConfig *cnf, const string& id) - : RecollFilter(cnf, id) - { + MimeHandlerHtml(RclConfig *cnf, const std::string& id) + : RecollFilter(cnf, id) { } - virtual ~MimeHandlerHtml() - { - } - virtual bool set_document_file(const string& mt, const string &file_path); - virtual bool set_document_string(const string& mt, const string &data); + virtual ~MimeHandlerHtml() {} + virtual bool is_data_input_ok(DataInput input) const { if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING) return true; return false; } virtual bool next_document(); - const string& get_html() + const std::string& get_html() { return m_html; } @@ -50,9 +46,15 @@ class MimeHandlerHtml : public RecollFilter { m_html.erase(); RecollFilter::clear(); } +protected: + virtual bool set_document_file_impl(const std::string& mt, + const std::string &file_path); + virtual bool set_document_string_impl(const std::string& mt, + const std::string &data); + private: - string m_filename; - string m_html; + std::string m_filename; + std::string m_html; }; #endif /* _HTML_H_INCLUDED_ */ diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index e54812fa..34df1057 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -87,10 +87,9 @@ void MimeHandlerMail::clear() RecollFilter::clear(); } -bool MimeHandlerMail::set_document_file(const string& mt, const string &fn) +bool MimeHandlerMail::set_document_file_impl(const string& mt, const string &fn) { - LOGDEB("MimeHandlerMail::set_document_file(" << (fn) << ")\n" ); - RecollFilter::set_document_file(mt, fn); + LOGDEB("MimeHandlerMail::set_document_file(" << fn << ")\n"); if (m_fd >= 0) { close(m_fd); m_fd = -1; @@ -103,12 +102,13 @@ bool MimeHandlerMail::set_document_file(const string& mt, const string &fn) if (MD5File(fn, md5, &reason)) { m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); } else { - LOGERR("MimeHandlerMail: cant md5 [" << (fn) << "]: " << (reason) << "\n" ); + LOGERR("MimeHandlerMail: md5 [" << fn << "]: " << reason << "\n"); } } m_fd = open(fn.c_str(), 0); if (m_fd < 0) { - LOGERR("MimeHandlerMail::set_document_file: open(" << (fn) << ") errno " << (errno) << "\n" ); + LOGERR("MimeHandlerMail::set_document_file: open(" << fn << + ") errno " << errno << "\n"); return false; } #if defined O_NOATIME && O_NOATIME != 0 @@ -120,19 +120,18 @@ bool MimeHandlerMail::set_document_file(const string& mt, const string &fn) m_bincdoc = new Binc::MimeDocument; m_bincdoc->parseFull(m_fd); if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) { - LOGERR("MimeHandlerMail::mkDoc: mime parse error for " << (fn) << "\n" ); + LOGERR("MimeHandlerMail::mkDoc: mime parse error for " << fn << "\n"); return false; } m_havedoc = true; return true; } -bool MimeHandlerMail::set_document_string(const string& mt, - const string &msgtxt) +bool MimeHandlerMail::set_document_string_impl(const string& mt, + const string& msgtxt) { - LOGDEB1("MimeHandlerMail::set_document_string\n" ); - LOGDEB2("Message text: [" << (msgtxt) << "]\n" ); - RecollFilter::set_document_string(mt, msgtxt); + LOGDEB1("MimeHandlerMail::set_document_string\n"); + LOGDEB2("Message text: [" << msgtxt << "]\n"); delete m_stream; if (!m_forPreview) { @@ -142,17 +141,19 @@ bool MimeHandlerMail::set_document_string(const string& mt, } if ((m_stream = new stringstream(msgtxt)) == 0 || !m_stream->good()) { - LOGERR("MimeHandlerMail::set_document_string: stream create error.msgtxt.size() " << (int(msgtxt.size())) << "\n" ); + LOGERR("MimeHandlerMail::set_document_string: stream create error." + "msgtxt.size() " << msgtxt.size() << "\n"); return false; } delete m_bincdoc; if ((m_bincdoc = new Binc::MimeDocument) == 0) { - LOGERR("MimeHandlerMail::set_doc._string: new Binc:Document failed. Out of memory?" ); + LOGERR("MimeHandlerMail::set_doc._string: new Binc:Document failed. " + "Out of memory?"); return false; } m_bincdoc->parseFull(*m_stream); if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) { - LOGERR("MimeHandlerMail::set_document_string: mime parse error\n" ); + LOGERR("MimeHandlerMail::set_document_string: mime parse error\n"); return false; } m_havedoc = true; @@ -161,14 +162,14 @@ bool MimeHandlerMail::set_document_string(const string& mt, bool MimeHandlerMail::skip_to_document(const string& ipath) { - LOGDEB("MimeHandlerMail::skip_to_document(" << (ipath) << ")\n" ); + LOGDEB("MimeHandlerMail::skip_to_document(" << ipath << ")\n"); if (m_idx == -1) { // No decoding done yet. If ipath is null need do nothing if (ipath.empty() || ipath == "-1") return true; // ipath points to attachment: need to decode message if (!next_document()) { - LOGERR("MimeHandlerMail::skip_to_doc: next_document failed\n" ); + LOGERR("MimeHandlerMail::skip_to_doc: next_document failed\n"); return false; } } @@ -178,7 +179,8 @@ bool MimeHandlerMail::skip_to_document(const string& ipath) bool MimeHandlerMail::next_document() { - LOGDEB("MimeHandlerMail::next_document m_idx " << (m_idx) << " m_havedoc " << (m_havedoc) << "\n" ); + LOGDEB("MimeHandlerMail::next_document m_idx " << m_idx << " m_havedoc " << + m_havedoc << "\n"); if (!m_havedoc) return false; bool res = false; @@ -186,7 +188,9 @@ bool MimeHandlerMail::next_document() if (m_idx == -1) { m_metaData[cstr_dj_keymt] = cstr_textplain; res = processMsg(m_bincdoc, 0); - LOGDEB1("MimeHandlerMail::next_document: mt " << (m_metaData[cstr_dj_keymt]) << ", att cnt " << (m_attachments.size()) << "\n" ); + LOGDEB1("MimeHandlerMail::next_document: mt " << + m_metaData[cstr_dj_keymt] << ", att cnt " << + m_attachments.size() << "\n"); const string& txt = m_metaData[cstr_dj_keycontent]; if (m_startoftext < txt.size()) m_metaData[cstr_dj_keyabstract] = @@ -221,16 +225,16 @@ static bool decodeBody(const string& cte, // Content transfer encoding if (!stringlowercmp("quoted-printable", cte)) { if (!qp_decode(body, decoded)) { - LOGERR("decodeBody: quoted-printable decoding failed !\n" ); - LOGDEB(" Body: \n" << (body) << "\n" ); + LOGERR("decodeBody: quoted-printable decoding failed !\n"); + LOGDEB(" Body: \n" << body << "\n"); return false; } *respp = &decoded; } else if (!stringlowercmp("base64", cte)) { if (!base64_decode(body, decoded)) { // base64 encoding errors are actually relatively common - LOGERR("decodeBody: base64 decoding failed !\n" ); - LOGDEB(" Body: \n" << (body) << "\n" ); + LOGERR("decodeBody: base64 decoding failed !\n"); + LOGDEB(" Body: \n" << body << "\n"); return false; } *respp = &decoded; @@ -240,7 +244,7 @@ static bool decodeBody(const string& cte, // Content transfer encoding bool MimeHandlerMail::processAttach() { - LOGDEB("MimeHandlerMail::processAttach() m_idx " << (m_idx) << "\n" ); + LOGDEB("MimeHandlerMail::processAttach() m_idx " << m_idx << "\n"); if (!m_havedoc) return false; if (m_idx >= (int)m_attachments.size()) { @@ -254,7 +258,8 @@ bool MimeHandlerMail::processAttach() m_metaData[cstr_dj_keycharset] = att->m_charset; m_metaData[cstr_dj_keyfn] = att->m_filename; m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")"; - LOGDEB1(" processAttach:ct [" << (att->m_contentType) << "] cs [" << (att->m_charset) << "] fn [" << (att->m_filename) << "]\n" ); + LOGDEB1(" processAttach:ct [" << att->m_contentType << "] cs [" << + att->m_charset << "] fn [" << att->m_filename << "]\n"); // Erase current content and replace m_metaData[cstr_dj_keycontent] = string(); @@ -305,10 +310,11 @@ bool MimeHandlerMail::processAttach() // text bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) { - LOGDEB2("MimeHandlerMail::processMsg: depth " << (depth) << "\n" ); + LOGDEB2("MimeHandlerMail::processMsg: depth " << depth << "\n"); if (depth++ >= maxdepth) { // Have to stop somewhere - LOGINFO("MimeHandlerMail::processMsg: maxdepth " << (maxdepth) << " exceeded\n" ); + LOGINFO("MimeHandlerMail::processMsg: maxdepth " << maxdepth << + " exceeded\n"); // Return true anyway, better to index partially than not at all return true; } @@ -360,7 +366,7 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) m_metaData[cstr_dj_keymd] = ascuxtime; } else { // Leave mtime field alone, ftime will be used instead. - LOGDEB("rfc2822Date...: failed: [" << (decoded) << "]\n" ); + LOGDEB("rfc2822Date...: failed: [" << decoded << "]\n"); } } if (preview()) @@ -394,10 +400,12 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) text += '\n'; m_startoftext = text.size(); - LOGDEB2("MimeHandlerMail::processMsg:ismultipart " << (doc->isMultipart()) << " mime subtype '" << (doc->getSubType()) << "'\n" ); + LOGDEB2("MimeHandlerMail::processMsg:ismultipart " << + doc->isMultipart() << " mime subtype '"<getSubType()<< "'\n"); walkmime(doc, depth); - LOGDEB2("MimeHandlerMail::processMsg:text:[" << (m_metaData[cstr_dj_keycontent]) << "]\n" ); + LOGDEB2("MimeHandlerMail::processMsg:text:[" << + m_metaData[cstr_dj_keycontent] << "]\n"); return true; } @@ -413,16 +421,17 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) // message/rfc822 may also be of interest. void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) { - LOGDEB2("MimeHandlerMail::walkmime: depth " << (depth) << "\n" ); + LOGDEB2("MimeHandlerMail::walkmime: depth " << depth << "\n"); if (depth++ >= maxdepth) { - LOGINFO("walkmime: max depth (" << (maxdepth) << ") exceeded\n" ); + LOGINFO("walkmime: max depth (" << maxdepth << ") exceeded\n"); return; } string& out = m_metaData[cstr_dj_keycontent]; if (doc->isMultipart()) { - LOGDEB2("walkmime: ismultipart " << (doc->isMultipart()) << " subtype '" << (doc->getSubType()) << "'\n" ); + LOGDEB2("walkmime: ismultipart " << doc->isMultipart() << + " subtype '" << doc->getSubType() << "'\n"); // We only handle alternative, related and mixed (no digests). std::vector::iterator it; @@ -445,22 +454,22 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) // Get and parse content-type header Binc::HeaderItem hi; if (!it->h.getFirstHeader("Content-Type", hi)) { - LOGDEB("walkmime:no ctent-type header for part " << (i) << "\n" ); + LOGDEB("walkmime:no ctent-type header for part "<members.end()) { - LOGDEB2("walkmime: alternative: chose text/plain part\n" ); + LOGDEB2("walkmime: alternative: chose text/plain part\n"); walkmime(&(*ittxt), depth); } else if (ithtml != doc->members.end()) { - LOGDEB2("walkmime: alternative: chose text/html part\n" ); + LOGDEB2("walkmime: alternative: chose text/html part\n"); walkmime(&(*ithtml), depth); } } @@ -476,7 +485,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) if (doc->h.getFirstHeader("Content-Type", hi)) { ctt = hi.getValue(); } - LOGDEB2("walkmime:content-type: " << (ctt) << "\n" ); + LOGDEB2("walkmime:content-type: " << ctt << "\n"); MimeHeaderValue content_type; parseMimeHeaderValue(ctt, content_type); @@ -487,7 +496,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) } MimeHeaderValue content_disposition; parseMimeHeaderValue(ctd, content_disposition); - LOGDEB2("Content_disposition:[" << (content_disposition.value) << "]\n" ); + LOGDEB2("Content_disposition:[" << content_disposition.value << "]\n"); string dispindic; if (stringlowercmp("inline", content_disposition.value)) dispindic = "Attachment"; @@ -507,7 +516,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) } if (doc->isMessageRFC822()) { - LOGDEB2("walkmime: message/RFC822 part\n" ); + LOGDEB2("walkmime: message/RFC822 part\n"); // The first part is the already parsed message. Call // processMsg instead of walkmime so that mail headers get @@ -528,7 +537,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) } // "Simple" part. - LOGDEB2("walkmime: simple part\n" ); + LOGDEB2("walkmime: simple part\n"); // Normally the default charset is us-ascii. But it happens that 8 // bit chars exist in a message that is stated as us-ascii. Ie the // mailer used by yahoo support ('KANA') does this. We could @@ -575,7 +584,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) } MHMailAttach *att = new MHMailAttach; if (att == 0) { - LOGERR("Out of memory\n" ); + LOGERR("Out of memory\n"); return; } att->m_contentType = content_type.value; @@ -584,7 +593,9 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) att->m_charset = charset; att->m_contentTransferEncoding = cte; att->m_part = doc; - LOGDEB("walkmime: attachmnt: ct [" << (att->m_contentType) << "] cte [" << (att->m_contentTransferEncoding) << "] cs [" << (att->m_charset) << "] fn [" << (filename) << "]\n" ); + LOGDEB("walkmime: attachmnt: ct [" << att->m_contentType << + "] cte [" << att->m_contentTransferEncoding << "] cs [" << + att->m_charset << "] fn [" << filename << "]\n"); m_attachments.push_back(att); return; } @@ -594,14 +605,15 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) // filter stack work: this would create another subdocument, but // we want instead to decode a body part of this message document. - LOGDEB2("walkmime: final: body start offset " << (doc->getBodyStartOffset()) << ", length " << (doc->getBodyLength()) << "\n" ); + LOGDEB2("walkmime: final: body start offset " << + doc->getBodyStartOffset()<<", length "<getBodyLength()<<"\n"); string body; doc->getBody(body, 0, doc->bodylength); { string decoded; const string *bdp; if (!decodeBody(cte, body, decoded, &bdp)) { - LOGERR("MimeHandlerMail::walkmime: failed decoding body\n" ); + LOGERR("MimeHandlerMail::walkmime: failed decoding body\n"); } if (bdp != &body) body.swap(decoded); @@ -622,9 +634,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) } else { string utf8; // Transcode to utf-8 - LOGDEB1("walkmime: transcoding from " << (charset) << " to UTF-8\n" ); + LOGDEB1("walkmime: transcoding from " << charset << " to UTF-8\n"); if (!transcode(body, utf8, charset, cstr_utf8)) { - LOGERR("walkmime: transcode failed from cs '" << (charset) << "' to UTF-8\n" ); + LOGERR("walkmime: transcode failed from cs '" << charset << + "' to UTF-8\n"); out += body; } else { out += utf8; @@ -634,6 +647,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) if (out.length() && out[out.length()-1] != '\n') out += '\n'; - LOGDEB2("walkmime: out now: [" << (out) << "]\n" ); + LOGDEB2("walkmime: out now: [" << out << "]\n"); } diff --git a/src/internfile/mh_mail.h b/src/internfile/mh_mail.h index 0a2e93d5..51eca62d 100644 --- a/src/internfile/mh_mail.h +++ b/src/internfile/mh_mail.h @@ -20,8 +20,6 @@ #include #include #include -using std::vector; -using std::map; #include "mimehandler.h" @@ -39,19 +37,23 @@ class MHMailAttach; */ class MimeHandlerMail : public RecollFilter { public: - MimeHandlerMail(RclConfig *cnf, const string &id); + MimeHandlerMail(RclConfig *cnf, const std::string &id); virtual ~MimeHandlerMail(); - virtual bool set_document_file(const string& mt, const string& file_path); - virtual bool set_document_string(const string& mt, const string& data); virtual bool is_data_input_ok(DataInput input) const { if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING) return true; return false; } virtual bool next_document(); - virtual bool skip_to_document(const string& ipath); + virtual bool skip_to_document(const std::string& ipath); virtual void clear(); +protected: + virtual bool set_document_file_impl(const std::string& mt, + const std::string& file_path); + virtual bool set_document_string_impl(const std::string& mt, + const std::string& data); + private: bool processMsg(Binc::MimePart *doc, int depth); void walkmime(Binc::MimePart* doc, int depth); @@ -65,19 +67,19 @@ private: int m_idx; // Start of actual text (after the reprinted headers. This is for // generating a semi-meaningful "abstract") - string::size_type m_startoftext; - string m_subject; - vector m_attachments; + std::string::size_type m_startoftext; + std::string m_subject; + std::vector m_attachments; // Additional headers to be process as per config + field name translation - map m_addProcdHdrs; + std::map m_addProcdHdrs; }; class MHMailAttach { public: - string m_contentType; - string m_filename; - string m_charset; - string m_contentTransferEncoding; + std::string m_contentType; + std::string m_filename; + std::string m_charset; + std::string m_contentTransferEncoding; Binc::MimePart *m_part; }; diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp index e5385c25..9a078f22 100644 --- a/src/internfile/mh_mbox.cpp +++ b/src/internfile/mh_mbox.cpp @@ -100,43 +100,43 @@ public: ~MboxCache() {} mbhoff_type get_offset(RclConfig *config, const string& udi, int msgnum) { - LOGDEB0("MboxCache::get_offsets: udi [" << (udi) << "] msgnum " << (msgnum) << "\n" ); + LOGDEB0("MboxCache::get_offsets: udi [" << (udi) << "] msgnum " << (msgnum) << "\n"); if (!ok(config)) { - LOGDEB0("MboxCache::get_offsets: init failed\n" ); + LOGDEB0("MboxCache::get_offsets: init failed\n"); return -1; } std::unique_lock locker(o_mcache_mutex); string fn = makefilename(udi); FILE *fp = 0; if ((fp = fopen(fn.c_str(), "r")) == 0) { - LOGDEB("MboxCache::get_offsets: open failed, errno " << (errno) << "\n" ); + LOGDEB("MboxCache::get_offsets: open failed, errno " << (errno) << "\n"); return -1; } FpKeeper keeper(&fp); char blk1[M_o_b1size]; if (fread(blk1, 1, o_b1size, fp) != o_b1size) { - LOGDEB0("MboxCache::get_offsets: read blk1 errno " << (errno) << "\n" ); + LOGDEB0("MboxCache::get_offsets: read blk1 errno " << (errno) << "\n"); return -1; } ConfSimple cf(string(blk1, o_b1size)); string fudi; if (!cf.get("udi", fudi) || fudi.compare(udi)) { - LOGINFO("MboxCache::get_offset:badudi fn " << (fn) << " udi [" << (udi) << "], fudi [" << (fudi) << "]\n" ); + LOGINFO("MboxCache::get_offset:badudi fn " << (fn) << " udi [" << (udi) << "], fudi [" << (fudi) << "]\n"); return -1; } if (fseeko(fp, cacheoffset(msgnum), SEEK_SET) != 0) { - LOGDEB0("MboxCache::get_offsets: seek " << (lltodecstr(cacheoffset(msgnum))) << " errno " << (errno) << "\n" ); + LOGDEB0("MboxCache::get_offsets: seek " << (lltodecstr(cacheoffset(msgnum))) << " errno " << (errno) << "\n"); return -1; } mbhoff_type offset = -1; size_t ret; if ((ret = fread(&offset, 1, sizeof(mbhoff_type), fp)) != sizeof(mbhoff_type)) { - LOGDEB0("MboxCache::get_offsets: read ret " << (ret) << " errno " << (errno) << "\n" ); + LOGDEB0("MboxCache::get_offsets: read ret " << (ret) << " errno " << (errno) << "\n"); return -1; } - LOGDEB0("MboxCache::get_offsets: ret " << (lltodecstr(offset)) << "\n" ); + LOGDEB0("MboxCache::get_offsets: ret " << (lltodecstr(offset)) << "\n"); return offset; } @@ -144,7 +144,7 @@ public: void put_offsets(RclConfig *config, const string& udi, mbhoff_type fsize, vector& offs) { - LOGDEB0("MboxCache::put_offsets: " << (offs.size()) << " offsets\n" ); + LOGDEB0("MboxCache::put_offsets: " << (offs.size()) << " offsets\n"); if (!ok(config) || !maybemakedir()) return; if (fsize < m_minfsize) @@ -153,7 +153,7 @@ public: string fn = makefilename(udi); FILE *fp; if ((fp = fopen(fn.c_str(), "w")) == 0) { - LOGDEB("MboxCache::put_offsets: fopen errno " << (errno) << "\n" ); + LOGDEB("MboxCache::put_offsets: fopen errno " << (errno) << "\n"); return; } FpKeeper keeper(&fp); @@ -163,7 +163,7 @@ public: blk1.append(cstr_newline); blk1.resize(o_b1size, 0); if (fwrite(blk1.c_str(), 1, o_b1size, fp) != o_b1size) { - LOGDEB("MboxCache::put_offsets: fwrite errno " << (errno) << "\n" ); + LOGDEB("MboxCache::put_offsets: fwrite errno " << (errno) << "\n"); return; } @@ -255,10 +255,9 @@ void MimeHandlerMbox::clear() RecollFilter::clear(); } -bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn) +bool MimeHandlerMbox::set_document_file_impl(const string& mt, const string &fn) { - LOGDEB("MimeHandlerMbox::set_document_file(" << (fn) << ")\n" ); - RecollFilter::set_document_file(mt, fn); + LOGDEB("MimeHandlerMbox::set_document_file(" << fn << ")\n"); m_fn = fn; if (m_vfp) { fclose((FILE *)m_vfp); @@ -267,7 +266,8 @@ bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn) m_vfp = fopen(fn.c_str(), "r"); if (m_vfp == 0) { - LOGERR("MimeHandlerMail::set_document_file: error opening " << (fn) << "\n" ); + LOGERR("MimeHandlerMail::set_document_file: error opening " << fn << + "\n"); return false; } #if defined O_NOATIME && O_NOATIME != 0 @@ -278,7 +278,8 @@ bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn) // Used to use ftell() here: no good beyond 2GB {struct stat st; if (fstat(fileno((FILE*)m_vfp), &st) < 0) { - LOGERR("MimeHandlerMbox:setdocfile: fstat(" << (fn) << ") failed errno " << (errno) << "\n" ); + LOGERR("MimeHandlerMbox:setdocfile: fstat(" << fn << + ") failed errno " << errno << "\n"); return false; } m_fsize = st.st_size; @@ -291,7 +292,7 @@ bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn) string quirks; if (m_config && m_config->getConfParam(cstr_keyquirks, quirks)) { if (quirks == "tbird") { - LOGDEB("MimeHandlerMbox: setting quirks TBIRD\n" ); + LOGDEB("MimeHandlerMbox: setting quirks TBIRD\n"); m_quirks |= MBOXQUIRK_TBIRD; } } @@ -299,7 +300,7 @@ bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn) // And double check for thunderbird string tbirdmsf = fn + ".msf"; if ((m_quirks&MBOXQUIRK_TBIRD) == 0 && path_exists(tbirdmsf)) { - LOGDEB("MimeHandlerMbox: detected unconfigured tbird mbox in " << (fn) << "\n" ); + LOGDEB("MimeHandlerMbox: detected unconfigured tbird mbox in " << (fn) << "\n"); m_quirks |= MBOXQUIRK_TBIRD; } @@ -416,7 +417,7 @@ static void compileregexes() bool MimeHandlerMbox::next_document() { if (m_vfp == 0) { - LOGERR("MimeHandlerMbox::next_document: not open\n" ); + LOGERR("MimeHandlerMbox::next_document: not open\n"); return false; } if (!m_havedoc) { @@ -428,10 +429,10 @@ bool MimeHandlerMbox::next_document() sscanf(m_ipath.c_str(), "%d", &mtarg); } else if (m_forPreview) { // Can't preview an mbox. - LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n" ); + LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n"); return false; } - LOGDEB0("MimeHandlerMbox::next_document: fn " << (m_fn) << ", msgnum " << (m_msgnum) << " mtarg " << (mtarg) << " \n" ); + LOGDEB0("MimeHandlerMbox::next_document: fn " << (m_fn) << ", msgnum " << (m_msgnum) << " mtarg " << (mtarg) << " \n"); if (mtarg == 0) mtarg = -1; @@ -451,7 +452,7 @@ bool MimeHandlerMbox::next_document() if (mtarg > 0) { mbhoff_type off; line_type line; - LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << (mtarg) << " m_udi[" << (m_udi) << "]\n" ); + LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << (mtarg) << " m_udi[" << (m_udi) << "]\n"); if (!m_udi.empty() && (off = o_mcache.get_offset(m_config, m_udi, mtarg)) >= 0 && fseeko(fp, (off_t)off, SEEK_SET) >= 0 && @@ -459,7 +460,7 @@ bool MimeHandlerMbox::next_document() (!M_regexec(fromregex, line, 0, 0, 0) || ((m_quirks & MBOXQUIRK_TBIRD) && !M_regexec(minifromregex, line, 0, 0, 0))) ) { - LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n" ); + LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n"); fseeko(fp, (off_t)off, SEEK_SET); m_msgnum = mtarg -1; storeoffsets = false; @@ -478,7 +479,7 @@ bool MimeHandlerMbox::next_document() for (;;) { message_end = ftello(fp); if (!fgets(line, LL, fp)) { - LOGDEB2("MimeHandlerMbox:next: eof\n" ); + LOGDEB2("MimeHandlerMbox:next: eof\n"); iseof = true; m_msgnum++; break; @@ -486,7 +487,7 @@ bool MimeHandlerMbox::next_document() m_lineno++; int ll; stripendnl(line, ll); - LOGDEB2("mhmbox:next: hadempty " << (hademptyline) << " lineno " << (m_lineno) << " ll " << (ll) << " Line: [" << (line) << "]\n" ); + LOGDEB2("mhmbox:next: hadempty " << (hademptyline) << " lineno " << (m_lineno) << " ll " << (ll) << " Line: [" << (line) << "]\n"); if (hademptyline) { if (ll > 0) { // Non-empty line with empty line flag set, reset flag @@ -504,7 +505,7 @@ bool MimeHandlerMbox::next_document() ((m_quirks & MBOXQUIRK_TBIRD) && !M_regexec(minifromregex, line, 0, 0, 0))) ) { - LOGDEB0("MimeHandlerMbox: msgnum " << (m_msgnum) << ", From_ at line " << (m_lineno) << ": [" << (line) << "]\n" ); + LOGDEB0("MimeHandlerMbox: msgnum " << (m_msgnum) << ", From_ at line " << (m_lineno) << ": [" << (line) << "]\n"); if (storeoffsets) m_offsets.push_back(message_end); m_msgnum++; @@ -527,13 +528,13 @@ bool MimeHandlerMbox::next_document() line[ll+1] = 0; msgtxt += line; if (msgtxt.size() > max_mbox_member_size) { - LOGERR("mh_mbox: huge message (more than " << (max_mbox_member_size/(1024*1024)) << " MB) inside " << (m_fn) << ", giving up\n" ); + LOGERR("mh_mbox: huge message (more than " << (max_mbox_member_size/(1024*1024)) << " MB) inside " << (m_fn) << ", giving up\n"); return false; } } } - LOGDEB2("Message text length " << (msgtxt.size()) << "\n" ); - LOGDEB2("Message text: [" << (msgtxt) << "]\n" ); + LOGDEB2("Message text length " << (msgtxt.size()) << "\n"); + LOGDEB2("Message text: [" << (msgtxt) << "]\n"); char buf[20]; // m_msgnum was incremented when hitting the next From_ or eof, so the data // is for m_msgnum - 1 @@ -541,7 +542,7 @@ bool MimeHandlerMbox::next_document() m_metaData[cstr_dj_keyipath] = buf; m_metaData[cstr_dj_keymt] = "message/rfc822"; if (iseof) { - LOGDEB2("MimeHandlerMbox::next: eof hit\n" ); + LOGDEB2("MimeHandlerMbox::next: eof hit\n"); m_havedoc = false; if (!m_udi.empty() && storeoffsets) { o_mcache.put_offsets(m_config, m_udi, m_fsize, m_offsets); @@ -658,7 +659,7 @@ int main(int argc, char **argv) } else { size = it->second.length(); } - cout << "Doc " << docnt << " size " << size << endl; + cout << "Doc " << docnt << " size " << size << endl; } cout << docnt << " documents found in " << filename << endl; exit(0); diff --git a/src/internfile/mh_mbox.h b/src/internfile/mh_mbox.h index 7238fee2..e2b0de9a 100644 --- a/src/internfile/mh_mbox.h +++ b/src/internfile/mh_mbox.h @@ -19,8 +19,6 @@ #include #include -using std::string; -using std::vector; #include "mimehandler.h" @@ -30,28 +28,32 @@ using std::vector; * file. */ class MimeHandlerMbox : public RecollFilter { - public: - MimeHandlerMbox(RclConfig *cnf, const string& id) +public: + MimeHandlerMbox(RclConfig *cnf, const std::string& id) : RecollFilter(cnf, id), m_vfp(0), m_msgnum(0), - m_lineno(0), m_fsize(0) - {} + m_lineno(0), m_fsize(0) { + } virtual ~MimeHandlerMbox(); - virtual bool set_document_file(const string& mt, const string &file_path); virtual bool next_document(); - virtual bool skip_to_document(const string& ipath) { + virtual bool skip_to_document(const std::string& ipath) { m_ipath = ipath; return true; } virtual void clear(); typedef long long mbhoff_type; - private: - string m_fn; // File name + +protected: + virtual bool set_document_file_impl(const std::string&, + const std::string&); + +private: + std::string m_fn; // File name void *m_vfp; // File pointer for folder int m_msgnum; // Current message number in folder. Starts at 1 - string m_ipath; + std::string m_ipath; int m_lineno; // debug mbhoff_type m_fsize; - vector m_offsets; + std::vector m_offsets; enum Quirks {MBOXQUIRK_TBIRD=1}; int m_quirks; }; diff --git a/src/internfile/mh_null.h b/src/internfile/mh_null.h index 5554d57d..b62453e1 100644 --- a/src/internfile/mh_null.h +++ b/src/internfile/mh_null.h @@ -17,34 +17,28 @@ #ifndef _MH_NULL_H_INCLUDED_ #define _MH_NULL_H_INCLUDED_ -// It may make sense in some cases to set this null filter (no output) -// instead of using recoll_noindex or leaving the default filter in -// case one doesn't want to install it: this will avoid endless retries -// to reindex the affected files, as recoll will think it has succeeded -// indexing them. Downside: the files won't be indexed when one -// actually installs the real filter, will need a -z -// Actually used for empty files -// Associated to application/x-zerosize, so use -// = internal application/x-zerosize -// in mimeconf #include #include "cstr.h" #include "mimehandler.h" +/// Null input handler always returning empty data. +/// +/// It may make sense in some cases to set this null filter (no output) +/// instead of using recoll_noindex or leaving the default filter in +/// case one doesn't want to install it: this will avoid endless retries +/// to reindex the affected files, as recoll will think it has succeeded +/// indexing them. Downside: the files won't be indexed when one +/// actually installs the real filter, will need a -z +/// Actually used for empty files. +/// Associated to application/x-zerosize, so use the following in mimeconf: +/// = internal application/x-zerosize class MimeHandlerNull : public RecollFilter { public: MimeHandlerNull(RclConfig *cnf, const std::string& id) - : RecollFilter(cnf, id) - { - } - virtual ~MimeHandlerNull() - { - } - virtual bool set_document_file(const string& mt, const string& fn) - { - RecollFilter::set_document_file(mt, fn); - return m_havedoc = true; + : RecollFilter(cnf, id) { } + virtual ~MimeHandlerNull() {} + virtual bool next_document() { if (m_havedoc == false) diff --git a/src/internfile/mh_symlink.h b/src/internfile/mh_symlink.h index 8f200af8..4d2ccfd9 100644 --- a/src/internfile/mh_symlink.h +++ b/src/internfile/mh_symlink.h @@ -36,18 +36,10 @@ class MimeHandlerSymlink : public RecollFilter { public: MimeHandlerSymlink(RclConfig *cnf, const std::string& id) - : RecollFilter(cnf, id) - { - } - virtual ~MimeHandlerSymlink() - { - } - virtual bool set_document_file(const string& mt, const string& fn) - { - RecollFilter::set_document_file(mt, fn); - m_fn = fn; - return m_havedoc = true; + : RecollFilter(cnf, id) { } + virtual ~MimeHandlerSymlink() {} + virtual bool next_document() { if (m_havedoc == false) @@ -61,11 +53,18 @@ class MimeHandlerSymlink : public RecollFilter { transcode(path_getsimple(slc), m_metaData[cstr_dj_keycontent], m_config->getDefCharset(true), "UTF-8"); } else { - LOGDEB("Symlink: readlink [" << (m_fn) << "] failed, errno " << (errno) << "\n" ); + LOGDEB("Symlink: readlink [" << m_fn << "] failed, errno " << + errno << "\n"); } m_metaData[cstr_dj_keymt] = cstr_textplain; return true; } +protected: + virtual bool set_document_file_impl(const string& mt, const string& fn) { + m_fn = fn; + return m_havedoc = true; + } + private: std::string m_fn; }; diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index 5cc38444..69c7721e 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -40,13 +40,11 @@ const int MB = 1024*1024; const int KB = 1024; // Process a plain text file -bool MimeHandlerText::set_document_file(const string& mt, const string &fn) +bool MimeHandlerText::set_document_file_impl(const string& mt, const string &fn) { - LOGDEB("MimeHandlerText::set_document_file: [" << fn << "] offs " << + LOGDEB("MimeHandlerText::set_document_file: [" << fn << "] offs " << m_offs << "\n"); - RecollFilter::set_document_file(mt, fn); - m_fn = fn; // This should not be necessary, but it happens on msw that offset is large // negative at this point, could not find the reason (still trying). @@ -93,9 +91,9 @@ bool MimeHandlerText::set_document_file(const string& mt, const string &fn) return true; } -bool MimeHandlerText::set_document_string(const string& mt, const string& otext) +bool MimeHandlerText::set_document_string_impl(const string& mt, + const string& otext) { - RecollFilter::set_document_string(mt, otext); m_text = otext; if (!m_forPreview) { string md5, xmd5; @@ -175,7 +173,7 @@ bool MimeHandlerText::readnext() string reason; m_text.clear(); if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) { - LOGERR("MimeHandlerText: can't read file: " << (reason) << "\n" ); + LOGERR("MimeHandlerText: can't read file: " << reason << "\n" ); m_havedoc = false; return false; } diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h index 9e4dfe12..7aca2e44 100644 --- a/src/internfile/mh_text.h +++ b/src/internfile/mh_text.h @@ -30,14 +30,10 @@ class MimeHandlerText : public RecollFilter { public: MimeHandlerText(RclConfig *cnf, const std::string& id) - : RecollFilter(cnf, id), m_paging(false), m_offs(0), m_pagesz(0) - { + : RecollFilter(cnf, id), m_paging(false), m_offs(0), m_pagesz(0) { } - virtual ~MimeHandlerText() - { - } - virtual bool set_document_file(const std::string& mt, const std::string &file_path); - virtual bool set_document_string(const std::string&, const std::string&); + virtual ~MimeHandlerText() {} + virtual bool is_data_input_ok(DataInput input) const { if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING) return true; @@ -45,14 +41,20 @@ class MimeHandlerText : public RecollFilter { } virtual bool next_document(); virtual bool skip_to_document(const std::string& s); - virtual void clear() - { + virtual void clear() { m_paging = false; m_text.erase(); m_fn.erase(); m_offs = 0; RecollFilter::clear(); } + +protected: + virtual bool set_document_file_impl(const std::string& mt, + const std::string &file_path); + virtual bool set_document_string_impl(const std::string&, + const std::string&); + private: bool m_paging; std::string m_text; diff --git a/src/internfile/mh_unknown.h b/src/internfile/mh_unknown.h index b84ee4f0..00b799a5 100644 --- a/src/internfile/mh_unknown.h +++ b/src/internfile/mh_unknown.h @@ -29,21 +29,9 @@ class MimeHandlerUnknown : public RecollFilter { public: MimeHandlerUnknown(RclConfig *cnf, const string& id) - : RecollFilter(cnf, id) - { - } - virtual ~MimeHandlerUnknown() - { - } - virtual bool set_document_file(const string& mt, const string& fn) - { - RecollFilter::set_document_file(mt, fn); - return m_havedoc = true; - } - virtual bool set_document_string(const string& mt, const string& s) { - RecollFilter::set_document_string(mt, s); - return m_havedoc = true; + : RecollFilter(cnf, id) { } + virtual ~MimeHandlerUnknown() {} virtual bool next_document() { if (m_havedoc == false) return false; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index f75a9fe7..0e726abb 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -60,7 +60,8 @@ static RecollFilter *getMimeHandlerFromCache(const string& key) std::unique_lock locker(o_handlers_mutex); string xdigest; MD5HexPrint(key, xdigest); - LOGDEB("getMimeHandlerFromCache: " << (xdigest) << " cache size " << (o_handlers.size()) << "\n" ); + LOGDEB("getMimeHandlerFromCache: " << xdigest << " cache size " << + o_handlers.size() << "\n"); multimap::iterator it = o_handlers.find(key); if (it != o_handlers.end()) { @@ -69,13 +70,14 @@ static RecollFilter *getMimeHandlerFromCache(const string& key) if (it1 != o_hlru.end()) { o_hlru.erase(it1); } else { - LOGERR("getMimeHandlerFromCache: lru position not found\n" ); + LOGERR("getMimeHandlerFromCache: lru position not found\n"); } o_handlers.erase(it); - LOGDEB("getMimeHandlerFromCache: " << (xdigest) << " found size " << (o_handlers.size()) << "\n" ); + LOGDEB("getMimeHandlerFromCache: " << xdigest << " found size " << + o_handlers.size() << "\n"); return h; } - LOGDEB("getMimeHandlerFromCache: " << (xdigest) << " not found\n" ); + LOGDEB("getMimeHandlerFromCache: " << xdigest << " not found\n"); return 0; } @@ -85,14 +87,16 @@ void returnMimeHandler(RecollFilter *handler) typedef multimap::value_type value_type; if (handler == 0) { - LOGERR("returnMimeHandler: bad parameter\n" ); + LOGERR("returnMimeHandler: bad parameter\n"); return; } handler->clear(); std::unique_lock locker(o_handlers_mutex); - LOGDEB("returnMimeHandler: returning filter for " << (handler->get_mime_type()) << " cache size " << (o_handlers.size()) << "\n" ); + LOGDEB("returnMimeHandler: returning filter for " << + handler->get_mime_type() << " cache size " << o_handlers.size() << + "\n"); // Limit pool size. The pool can grow quite big because there are // many filter types, each of which can be used in several copies @@ -105,9 +109,9 @@ void returnMimeHandler(RecollFilter *handler) if (once) { once = 0; for (it = o_handlers.begin(); it != o_handlers.end(); it++) { - LOGDEB1("Cache full. key: " << (it->first) << "\n" ); + LOGDEB1("Cache full. key: " << it->first << "\n"); } - LOGDEB1("Cache LRU size: " << (o_hlru.size()) << "\n" ); + LOGDEB1("Cache LRU size: " << o_hlru.size() << "\n"); } if (o_hlru.size() > 0) { it = o_hlru.back(); @@ -122,7 +126,7 @@ void returnMimeHandler(RecollFilter *handler) void clearMimeHandlerCache() { - LOGDEB("clearMimeHandlerCache()\n" ); + LOGDEB("clearMimeHandlerCache()\n"); multimap::iterator it; std::unique_lock locker(o_handlers_mutex); for (it = o_handlers.begin(); it != o_handlers.end(); it++) { @@ -136,31 +140,31 @@ void clearMimeHandlerCache() static RecollFilter *mhFactory(RclConfig *config, const string &mime, bool nobuild, string& id) { - LOGDEB2("mhFactory(" << (mime) << ")\n" ); + LOGDEB2("mhFactory(" << mime << ")\n"); string lmime(mime); stringtolower(lmime); if (cstr_textplain == lmime) { - LOGDEB2("mhFactory(" << (mime) << "): returning MimeHandlerText\n" ); + LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n"); MD5String("MimeHandlerText", id); return nobuild ? 0 : new MimeHandlerText(config, id); } else if ("text/html" == lmime) { - LOGDEB2("mhFactory(" << (mime) << "): returning MimeHandlerHtml\n" ); + LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n"); MD5String("MimeHandlerHtml", id); return nobuild ? 0 : new MimeHandlerHtml(config, id); } else if ("text/x-mail" == lmime) { - LOGDEB2("mhFactory(" << (mime) << "): returning MimeHandlerMbox\n" ); + LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerMbox\n"); MD5String("MimeHandlerMbox", id); return nobuild ? 0 : new MimeHandlerMbox(config, id); } else if ("message/rfc822" == lmime) { - LOGDEB2("mhFactory(" << (mime) << "): returning MimeHandlerMail\n" ); + LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerMail\n"); MD5String("MimeHandlerMail", id); return nobuild ? 0 : new MimeHandlerMail(config, id); } else if ("inode/symlink" == lmime) { - LOGDEB2("mhFactory(" << (mime) << "): ret MimeHandlerSymlink\n" ); + LOGDEB2("mhFactory(" << mime << "): ret MimeHandlerSymlink\n"); MD5String("MimeHandlerSymlink", id); return nobuild ? 0 : new MimeHandlerSymlink(config, id); } else if ("application/x-zerosize" == lmime) { - LOGDEB("mhFactory(" << (mime) << "): ret MimeHandlerNull\n" ); + LOGDEB("mhFactory(" << mime << "): ret MimeHandlerNull\n"); MD5String("MimeHandlerNull", id); return nobuild ? 0 : new MimeHandlerNull(config, id); } else if (lmime.find("text/") == 0) { @@ -169,14 +173,15 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime, // mimeconf, not at random. For programs, for example this // allows indexing and previewing as text/plain (no filter // exec) but still opening with a specific editor. - LOGDEB2("mhFactory(" << (mime) << "): returning MimeHandlerText(x)\n" ); + LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText(x)\n"); MD5String("MimeHandlerText", id); return nobuild ? 0 : new MimeHandlerText(config, id); } else { // We should not get there. It means that "internal" was set // as a handler in mimeconf for a mime type we actually can't // handle. - LOGERR("mhFactory: mime type [" << (lmime) << "] set as internal but unknown\n" ); + LOGERR("mhFactory: mime type [" << lmime << + "] set as internal but unknown\n"); MD5String("MimeHandlerUnknown", id); return nobuild ? 0 : new MimeHandlerUnknown(config, id); } @@ -199,7 +204,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, string cmdstr; if (!cfg->valueSplitAttributes(hs, cmdstr, attrs)) { - LOGERR("mhExecFactory: bad config line for [" << (mtype) << "]: [" << (hs) << "]\n" ); + LOGERR("mhExecFactory: bad config line for [" << + mtype << "]: [" << hs << "]\n"); return 0; } @@ -207,7 +213,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, vector cmdtoks; stringToStrings(cmdstr, cmdtoks); if (cmdtoks.empty()) { - LOGERR("mhExecFactory: bad config line for [" << (mtype) << "]: [" << (hs) << "]\n" ); + LOGERR("mhExecFactory: bad config line for [" << mtype << + "]: [" << hs << "]\n"); return 0; } MimeHandlerExec *h = multiple ? @@ -221,7 +228,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, // the same change if we ever want to use the same cmdling as windows if (!stringlowercmp("python", *it) || !stringlowercmp("perl", *it)) { if (cmdtoks.size() < 2) { - LOGERR("mhExecFactory: python/perl cmd: no script?. [" << (mtype) << "]: [" << (hs) << "]\n" ); + LOGERR("mhExecFactory: python/perl cmd: no script?. [" << + mtype << "]: [" << hs << "]\n"); } vector::iterator it1(it); it1++; @@ -244,7 +252,9 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, for (it = h->params.begin(); it != h->params.end(); it++) { scmd += string("[") + *it + "] "; } - LOGDEB("mhExecFactory:mt [" << (mtype) << "] cfgmt [" << (h->cfgFilterOutputMtype) << "] cfgcs [" << (h->cfgFilterOutputCharset) << "] cmd: [" << (scmd) << "]\n" ); + LOGDEB("mhExecFactory:mt [" << mtype << "] cfgmt [" << + h->cfgFilterOutputMtype << "] cfgcs [" << + h->cfgFilterOutputCharset << "] cmd: [" << scmd << "]\n"); #endif return h; @@ -254,7 +264,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, bool filtertypes) { - LOGDEB("getMimeHandler: mtype [" << (mtype) << "] filtertypes " << (filtertypes) << "\n" ); + LOGDEB("getMimeHandler: mtype [" << mtype << "] filtertypes " << + filtertypes << "\n"); RecollFilter *h = 0; // Get handler definition for mime type. We do this even if an @@ -292,7 +303,7 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, if (h != 0) goto out; - LOGDEB2("getMimeHandler: " << (mtype) << " not in cache\n" ); + LOGDEB2("getMimeHandler: " << mtype << " not in cache\n"); // Not in cache. if (internal) { @@ -303,13 +314,14 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, // partly redundant with the localfields/rclaptg, but // better and the latter will probably go away at some // point in the future. - LOGDEB2("handlertype internal, cmdstr [" << (cmdstr) << "]\n" ); + LOGDEB2("handlertype internal, cmdstr [" << cmdstr << "]\n"); h = mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, false, id); goto out; } else if (!stringlowercmp("dll", handlertype)) { } else { if (cmdstr.empty()) { - LOGERR("getMimeHandler: bad line for " << (mtype) << ": " << (hs) << "\n" ); + LOGERR("getMimeHandler: bad line for " << mtype << ": " << + hs << "\n"); goto out; } if (!stringlowercmp("exec", handlertype)) { @@ -319,7 +331,8 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, h = mhExecFactory(cfg, mtype, cmdstr, true, id); goto out; } else { - LOGERR("getMimeHandler: bad line for " << (mtype) << ": " << (hs) << "\n" ); + LOGERR("getMimeHandler: bad line for " << mtype << ": " << + hs << "\n"); goto out; } } diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 981436f8..86a4f452 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -31,13 +31,14 @@ class RclConfig; class RecollFilter : public Dijon::Filter { public: RecollFilter(RclConfig *config, const std::string& id) - : m_config(config), m_forPreview(false), m_havedoc(false), m_id(id) - {} + : m_config(config), m_forPreview(false), m_havedoc(false), m_id(id) { + } virtual ~RecollFilter() {} - virtual void setConfig(RclConfig *config) - { + + virtual void setConfig(RclConfig *config) { m_config = config; } + virtual bool set_property(Properties p, const std::string &v) { switch (p) { case DJF_UDI: @@ -58,34 +59,23 @@ public: // We don't use this for now virtual bool set_document_uri(const std::string& mtype, - const std::string &) - { + const std::string &) { m_mimeType = mtype; return false; } - // This does nothing right now but should be called from the - // subclass method in case we need some common processing one day - // (was used for xattrs at some point). Yes this is the "call - // super" anti-pattern, bad, but we have several layers of derived - // classes, so that implementing the template method approach (by - // having a pure virtual called from here and implemented in the - // subclass) would have to be repeated in each derived class. It's - // just simpler this way. virtual bool set_document_file(const std::string& mtype, - const std::string & /*file_path*/) - { + const std::string &file_path) { m_mimeType = mtype; - return true; + return set_document_file_impl(mtype, file_path); } - // Default implementations virtual bool set_document_string(const std::string& mtype, - const std::string &) - { + const std::string &contents) { m_mimeType = mtype; - return false; + return set_document_string_impl(mtype, contents); } + virtual bool set_document_data(const std::string& mtype, const char *cp, size_t sz) { @@ -95,11 +85,14 @@ public: virtual void set_docsize(off_t size) { m_docsize = size; } + virtual off_t get_docsize() const { return m_docsize; } - virtual bool has_documents() const {return m_havedoc;} + virtual bool has_documents() const { + return m_havedoc; + } // Most doc types are single-doc virtual bool skip_to_document(const std::string& s) { @@ -118,8 +111,7 @@ public: return m_reason; } - virtual const std::string& get_id() const - { + virtual const std::string& get_id() const { return m_id; } @@ -137,7 +129,21 @@ public: bool txtdcode(const std::string& who); protected: - bool preview() {return m_forPreview;} + + // We provide default implementation as not all handlers need both methods + virtual bool set_document_file_impl(const std::string&, + const std::string&) { + return m_havedoc = true; + } + + virtual bool set_document_string_impl(const std::string&, + const std::string&) { + return m_havedoc = true; + } + + bool preview() { + return m_forPreview; + } RclConfig *m_config; bool m_forPreview; diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 4a418093..37f8cbc2 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -122,6 +122,16 @@ skippedPaths = /media # redefined for subtrees. #excludedmimetypes = +# Don't compute md5 for +# these types.md5 checksums are used only for deduplicating +# results, and can be very expensive to compute on multimedia or other big +# files. This list lets you turn off md5 computation for selected types. It +# is global (no redefinition for subtrees). At the moment, it only has an +# effect for external handlers (exec and execm). The file types can be +# specified by listing either MIME types (e.g. audio/mpeg) or handler names +# (e.g. rclaudio). +nomd5types = rclaudio + # Size limit for compressed # files.We need to decompress these in a # temporary directory for identification, which can be wasteful in some