Do not filter out text/html when it results from a conversion, even if excluded by indexedmimetypes/excludedmimetypes

2017-06-08 10:09:05 +02:00 · 2017-06-08 10:09:05 +02:00 · 19a4b2a287
commit 19a4b2a287
parent 65387963ed
7 changed files with 35 additions and 13 deletions
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
 DEF_CSTR(plus, "+");
 DEF_CSTR(textplain, "text/plain");
+DEF_CSTR(texthtml, "text/html");
 DEF_CSTR(url, "url");
 // Marker for HTML format fields
 DEF_CSTR(fldhtm, "\007");
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@ -654,7 +654,8 @@ int FileInterner::addHandler()
    getKeyValue(docdata, cstr_dj_keycharset, charset);
    getKeyValue(docdata, cstr_dj_keymt, mimetype);

-    LOGDEB("FileInterner::addHandler: next_doc is "  << (mimetype) << " target ["  << (m_targetMType) << "]\n" );
+    LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
+           " target ["  << m_targetMType << "]\n");

    // If we find a document of the target type (text/plain in
    // general), we're done decoding. If we hit text/plain, we're done
@ -662,7 +663,7 @@ int FileInterner::addHandler()
    if (!stringicmp(mimetype, m_targetMType) || 
 	!stringicmp(mimetype, cstr_textplain)) {
 	m_reachedMType = mimetype;
-	LOGDEB1("FileInterner::addHandler: target reached\n" );
+	LOGDEB1("FileInterner::addHandler: target reached\n");
 	return ADD_BREAK;
    }

@ -670,15 +671,26 @@ int FileInterner::addHandler()
    if (m_handlers.size() >= MAXHANDLERS) {
 	// Stack too big. Skip this and go on to check if there is
 	// something else in the current back()
-	LOGERR("FileInterner::addHandler: stack too high\n" );
+	LOGERR("FileInterner::addHandler: stack too high\n");
 	return ADD_CONTINUE;
    }

-    RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
+    // We must not filter out HTML when it is an intermediate
+    // conversion format. We discriminate between e.g. an HTML email
+    // attachment (needs filtering) and a result of pdf conversion
+    // (must process) by looking at the last ipath element: a
+    // conversion will have an empty one (same test as in
+    // collectIpathAndMT).
+    string ipathel;
+    getKeyValue(docdata, cstr_dj_keyipath, ipathel);
+    bool dofilter = !m_forPreview &&
+        (mimetype.compare(cstr_texthtml) || !ipathel.empty());
+    RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
    if (!newflt) {
 	// If we can't find a handler, this doc can't be handled
 	// but there can be other ones so we go on
-	LOGINFO("FileInterner::addHandler: no filter for ["  << (mimetype) << "]\n" );
+	LOGINFO("FileInterner::addHandler: no filter for ["  << mimetype <<
+                "]\n");
 	return ADD_CONTINUE;
    }
    newflt->set_property(Dijon::Filter::OPERATING_MODE, 
@ -717,7 +729,8 @@ int FileInterner::addHandler()
 	}
    }
    if (!setres) {
-	LOGINFO("FileInterner::addHandler: set_doc failed inside "  << (m_fn) << "  for mtype "  << (mimetype) << "\n" );
+	LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
+                "  for mtype " << mimetype << "\n");
 	delete newflt;
 	if (m_forPreview)
 	    return ADD_ERROR;
@ -725,7 +738,7 @@ int FileInterner::addHandler()
    }
    // add handler and go on, maybe this one will give us text...
    m_handlers.push_back(newflt);
-    LOGDEB1("FileInterner::addHandler: added\n" );
+    LOGDEB1("FileInterner::addHandler: added\n");
    return ADD_OK;
 }

@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
    // performed. A common case would be an "Open" on an html file
    // (we'd end up with text/plain content). As the html version is
    // saved in this case, use it.  
-    if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
+    if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
        doc.text = get_html();
-        doc.mimetype = "text/html";
+        doc.mimetype = cstr_texthtml;
    }

    const char *filename;
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
 {
    // The default output mime type is html, but it may be defined
    // otherwise in the filter definition.
-    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : 
+    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml : 
 	cfgFilterOutputMtype;

    if (!m_forPreview && !m_nomd5) {
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
 	}
    } else {
 	// "Self" document.
-        m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
+        m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
        m_metaData.erase(cstr_dj_keyipath);
 	if (!m_forPreview) {
            m_metaData[cstr_dj_keymd5] = file_md5;
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
 	MD5String("MimeHandlerText", id);
 	return nobuild ? 0 : new MimeHandlerText(config, id);
-    } else if ("text/html" == lmime) {
+    } else if (cstr_texthtml == lmime) {
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
 	MD5String("MimeHandlerHtml", id);
 	return nobuild ? 0 : new MimeHandlerHtml(config, id);
--- a/website/BUGS.html
+++ b/website/BUGS.html
@ -32,6 +32,14 @@ versions.</i></p>
 <h2><a name="b_latest">recoll 1.23.2</a></h2>
 <ul>

+  <li>When indexedmimetypes is set and does not include text/html (or
+    if text/html is excluded by excludedmimetypes), PDF (and other)
+    contents will not be indexed because the file handler initially
+    produces text/html. The workaround is to include text/html in the
+    processed types, and maybe use suffix exclusion if you really
+    don't want to index html files. This is fixed in the development
+    code and will be in the next release.</li>
+  
  <li>The Recoll GUI configuration (things set from the <tt>GUI
      Configuration</tt> menu) is stored in a file
      named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is
--- a/website/pages/recoll-windows.txt
+++ b/website/pages/recoll-windows.txt
@ -3,7 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
 :date:

 :recollversion: 1.23.0-2017-01-07-78b8ad
-:windir: downwin-fa352
+:windir: downwin-adb3c

 image:recoll-windows10-thumb.png[link="recoll-windows10.png"]