From 19a4b2a287aa8ed2fad4a68773b0e66bb3ec55c8 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 8 Jun 2017 10:09:05 +0200 Subject: [PATCH] Do not filter out text/html when it results from a conversion, even if excluded by indexedmimetypes/excludedmimetypes --- src/common/cstr.h | 1 + src/internfile/internfile.cpp | 31 ++++++++++++++++++++++--------- src/internfile/mh_exec.cpp | 2 +- src/internfile/mh_execm.cpp | 2 +- src/internfile/mimehandler.cpp | 2 +- website/BUGS.html | 8 ++++++++ website/pages/recoll-windows.txt | 2 +- 7 files changed, 35 insertions(+), 13 deletions(-) diff --git a/src/common/cstr.h b/src/common/cstr.h index 7f41099b..17dead32 100644 --- a/src/common/cstr.h +++ b/src/common/cstr.h @@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n"); DEF_CSTR(null, ""); DEF_CSTR(plus, "+"); DEF_CSTR(textplain, "text/plain"); +DEF_CSTR(texthtml, "text/html"); DEF_CSTR(url, "url"); // Marker for HTML format fields DEF_CSTR(fldhtm, "\007"); diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 39790254..8ad3e115 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -654,7 +654,8 @@ int FileInterner::addHandler() getKeyValue(docdata, cstr_dj_keycharset, charset); getKeyValue(docdata, cstr_dj_keymt, mimetype); - LOGDEB("FileInterner::addHandler: next_doc is " << (mimetype) << " target [" << (m_targetMType) << "]\n" ); + LOGDEB("FileInterner::addHandler: next_doc is " << mimetype << + " target [" << m_targetMType << "]\n"); // If we find a document of the target type (text/plain in // general), we're done decoding. If we hit text/plain, we're done @@ -662,7 +663,7 @@ int FileInterner::addHandler() if (!stringicmp(mimetype, m_targetMType) || !stringicmp(mimetype, cstr_textplain)) { m_reachedMType = mimetype; - LOGDEB1("FileInterner::addHandler: target reached\n" ); + LOGDEB1("FileInterner::addHandler: target reached\n"); return ADD_BREAK; } @@ -670,15 +671,26 @@ int FileInterner::addHandler() if (m_handlers.size() >= MAXHANDLERS) { // Stack too big. Skip this and go on to check if there is // something else in the current back() - LOGERR("FileInterner::addHandler: stack too high\n" ); + LOGERR("FileInterner::addHandler: stack too high\n"); return ADD_CONTINUE; } - RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview); + // We must not filter out HTML when it is an intermediate + // conversion format. We discriminate between e.g. an HTML email + // attachment (needs filtering) and a result of pdf conversion + // (must process) by looking at the last ipath element: a + // conversion will have an empty one (same test as in + // collectIpathAndMT). + string ipathel; + getKeyValue(docdata, cstr_dj_keyipath, ipathel); + bool dofilter = !m_forPreview && + (mimetype.compare(cstr_texthtml) || !ipathel.empty()); + RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter); if (!newflt) { // If we can't find a handler, this doc can't be handled // but there can be other ones so we go on - LOGINFO("FileInterner::addHandler: no filter for [" << (mimetype) << "]\n" ); + LOGINFO("FileInterner::addHandler: no filter for [" << mimetype << + "]\n"); return ADD_CONTINUE; } newflt->set_property(Dijon::Filter::OPERATING_MODE, @@ -717,7 +729,8 @@ int FileInterner::addHandler() } } if (!setres) { - LOGINFO("FileInterner::addHandler: set_doc failed inside " << (m_fn) << " for mtype " << (mimetype) << "\n" ); + LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn << + " for mtype " << mimetype << "\n"); delete newflt; if (m_forPreview) return ADD_ERROR; @@ -725,7 +738,7 @@ int FileInterner::addHandler() } // add handler and go on, maybe this one will give us text... m_handlers.push_back(newflt); - LOGDEB1("FileInterner::addHandler: added\n" ); + LOGDEB1("FileInterner::addHandler: added\n"); return ADD_OK; } @@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile, // performed. A common case would be an "Open" on an html file // (we'd end up with text/plain content). As the html version is // saved in this case, use it. - if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) { + if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) { doc.text = get_html(); - doc.mimetype = "text/html"; + doc.mimetype = cstr_texthtml; } const char *filename; diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 534de783..6b302799 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails() { // The default output mime type is html, but it may be defined // otherwise in the filter definition. - m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : + m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml : cfgFilterOutputMtype; if (!m_forPreview && !m_nomd5) { diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 8ac1f150..b5c06fb2 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document() } } else { // "Self" document. - m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype; + m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype; m_metaData.erase(cstr_dj_keyipath); if (!m_forPreview) { m_metaData[cstr_dj_keymd5] = file_md5; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index e6b6c1f4..609b6c33 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime, LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n"); MD5String("MimeHandlerText", id); return nobuild ? 0 : new MimeHandlerText(config, id); - } else if ("text/html" == lmime) { + } else if (cstr_texthtml == lmime) { LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n"); MD5String("MimeHandlerHtml", id); return nobuild ? 0 : new MimeHandlerHtml(config, id); diff --git a/website/BUGS.html b/website/BUGS.html index 516584b0..607b1920 100644 --- a/website/BUGS.html +++ b/website/BUGS.html @@ -32,6 +32,14 @@ versions.

recoll 1.23.2