Do not filter out text/html when it results from a conversion, even if excluded by indexedmimetypes/excludedmimetypes

2017-06-08 10:09:05 +02:00 · 2017-06-08 10:09:05 +02:00 · 19a4b2a287
commit 19a4b2a287
parent 65387963ed
7 changed files with 35 additions and 13 deletions
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
 DEF_CSTR(plus, "+");
 DEF_CSTR(textplain, "text/plain");
 DEF_CSTR(texthtml, "text/html");
 DEF_CSTR(url, "url");
 // Marker for HTML format fields
 DEF_CSTR(fldhtm, "\007");
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@ -654,7 +654,8 @@ int FileInterner::addHandler()
    getKeyValue(docdata, cstr_dj_keycharset, charset);
    getKeyValue(docdata, cstr_dj_keymt, mimetype);
-    LOGDEB("FileInterner::addHandler: next_doc is "  << (mimetype) << " target ["  << (m_targetMType) << "]\n" );
+    LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
           " target ["  << m_targetMType << "]\n");
    // If we find a document of the target type (text/plain in
    // general), we're done decoding. If we hit text/plain, we're done
@ -662,7 +663,7 @@ int FileInterner::addHandler()
    if (!stringicmp(mimetype, m_targetMType) || 
 	!stringicmp(mimetype, cstr_textplain)) {
 	m_reachedMType = mimetype;
-	LOGDEB1("FileInterner::addHandler: target reached\n" );
+	LOGDEB1("FileInterner::addHandler: target reached\n");
 	return ADD_BREAK;
    }
@ -670,15 +671,26 @@ int FileInterner::addHandler()
    if (m_handlers.size() >= MAXHANDLERS) {
 	// Stack too big. Skip this and go on to check if there is
 	// something else in the current back()
-	LOGERR("FileInterner::addHandler: stack too high\n" );
+	LOGERR("FileInterner::addHandler: stack too high\n");
 	return ADD_CONTINUE;
    }
-    RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
+    // We must not filter out HTML when it is an intermediate
    // conversion format. We discriminate between e.g. an HTML email
    // attachment (needs filtering) and a result of pdf conversion
    // (must process) by looking at the last ipath element: a
    // conversion will have an empty one (same test as in
    // collectIpathAndMT).
    string ipathel;
    getKeyValue(docdata, cstr_dj_keyipath, ipathel);
    bool dofilter = !m_forPreview &&
        (mimetype.compare(cstr_texthtml) || !ipathel.empty());
    RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
    if (!newflt) {
 	// If we can't find a handler, this doc can't be handled
 	// but there can be other ones so we go on
-	LOGINFO("FileInterner::addHandler: no filter for ["  << (mimetype) << "]\n" );
+	LOGINFO("FileInterner::addHandler: no filter for ["  << mimetype <<
                "]\n");
 	return ADD_CONTINUE;
    }
    newflt->set_property(Dijon::Filter::OPERATING_MODE, 
@ -717,7 +729,8 @@ int FileInterner::addHandler()
 	}
    }
    if (!setres) {
-	LOGINFO("FileInterner::addHandler: set_doc failed inside "  << (m_fn) << "  for mtype "  << (mimetype) << "\n" );
+	LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
                "  for mtype " << mimetype << "\n");
 	delete newflt;
 	if (m_forPreview)
 	    return ADD_ERROR;
@ -725,7 +738,7 @@ int FileInterner::addHandler()
    }
    // add handler and go on, maybe this one will give us text...
    m_handlers.push_back(newflt);
-    LOGDEB1("FileInterner::addHandler: added\n" );
+    LOGDEB1("FileInterner::addHandler: added\n");
    return ADD_OK;
 }
@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
    // performed. A common case would be an "Open" on an html file
    // (we'd end up with text/plain content). As the html version is
    // saved in this case, use it.  
-    if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
+    if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
        doc.text = get_html();
-        doc.mimetype = "text/html";
+        doc.mimetype = cstr_texthtml;
    }
    const char *filename;
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
 {
    // The default output mime type is html, but it may be defined
    // otherwise in the filter definition.
-    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : 
+    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml : 
 	cfgFilterOutputMtype;
    if (!m_forPreview && !m_nomd5) {
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
 	}
    } else {
 	// "Self" document.
-        m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
+        m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
        m_metaData.erase(cstr_dj_keyipath);
 	if (!m_forPreview) {
            m_metaData[cstr_dj_keymd5] = file_md5;
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
 	MD5String("MimeHandlerText", id);
 	return nobuild ? 0 : new MimeHandlerText(config, id);
-    } else if ("text/html" == lmime) {
+    } else if (cstr_texthtml == lmime) {
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
 	MD5String("MimeHandlerHtml", id);
 	return nobuild ? 0 : new MimeHandlerHtml(config, id);
--- a/website/BUGS.html
+++ b/website/BUGS.html
@ -32,6 +32,14 @@ versions.</i></p>
 <h2><a name="b_latest">recoll 1.23.2</a></h2>
 <ul>
  <li>When indexedmimetypes is set and does not include text/html (or
    if text/html is excluded by excludedmimetypes), PDF (and other)
    contents will not be indexed because the file handler initially
    produces text/html. The workaround is to include text/html in the
    processed types, and maybe use suffix exclusion if you really
    don't want to index html files. This is fixed in the development
    code and will be in the next release.</li>
  <li>The Recoll GUI configuration (things set from the <tt>GUI
      Configuration</tt> menu) is stored in a file
      named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is
--- a/website/pages/recoll-windows.txt
+++ b/website/pages/recoll-windows.txt
@ -3,7 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
 :date:
 :recollversion: 1.23.0-2017-01-07-78b8ad
-:windir: downwin-fa352
+:windir: downwin-adb3c
 image:recoll-windows10-thumb.png[link="recoll-windows10.png"]