From 19a4b2a287aa8ed2fad4a68773b0e66bb3ec55c8 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Thu, 8 Jun 2017 10:09:05 +0200
Subject: [PATCH] Do not filter out text/html when it results from a
 conversion, even if excluded by indexedmimetypes/excludedmimetypes

---
 src/common/cstr.h                |  1 +
 src/internfile/internfile.cpp    | 31 ++++++++++++++++++++++---------
 src/internfile/mh_exec.cpp       |  2 +-
 src/internfile/mh_execm.cpp      |  2 +-
 src/internfile/mimehandler.cpp   |  2 +-
 website/BUGS.html                |  8 ++++++++
 website/pages/recoll-windows.txt |  2 +-
 7 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/src/common/cstr.h b/src/common/cstr.h
index 7f41099b..17dead32 100644
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
 DEF_CSTR(plus, "+");
 DEF_CSTR(textplain, "text/plain");
+DEF_CSTR(texthtml, "text/html");
 DEF_CSTR(url, "url");
 // Marker for HTML format fields
 DEF_CSTR(fldhtm, "\007");
diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp
index 39790254..8ad3e115 100644
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -654,7 +654,8 @@ int FileInterner::addHandler()
     getKeyValue(docdata, cstr_dj_keycharset, charset);
     getKeyValue(docdata, cstr_dj_keymt, mimetype);
 
-    LOGDEB("FileInterner::addHandler: next_doc is "  << (mimetype) << " target ["  << (m_targetMType) << "]\n" );
+    LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
+           " target ["  << m_targetMType << "]\n");
 
     // If we find a document of the target type (text/plain in
     // general), we're done decoding. If we hit text/plain, we're done
@@ -662,7 +663,7 @@ int FileInterner::addHandler()
     if (!stringicmp(mimetype, m_targetMType) || 
 	!stringicmp(mimetype, cstr_textplain)) {
 	m_reachedMType = mimetype;
-	LOGDEB1("FileInterner::addHandler: target reached\n" );
+	LOGDEB1("FileInterner::addHandler: target reached\n");
 	return ADD_BREAK;
     }
 
@@ -670,15 +671,26 @@ int FileInterner::addHandler()
     if (m_handlers.size() >= MAXHANDLERS) {
 	// Stack too big. Skip this and go on to check if there is
 	// something else in the current back()
-	LOGERR("FileInterner::addHandler: stack too high\n" );
+	LOGERR("FileInterner::addHandler: stack too high\n");
 	return ADD_CONTINUE;
     }
 
-    RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
+    // We must not filter out HTML when it is an intermediate
+    // conversion format. We discriminate between e.g. an HTML email
+    // attachment (needs filtering) and a result of pdf conversion
+    // (must process) by looking at the last ipath element: a
+    // conversion will have an empty one (same test as in
+    // collectIpathAndMT).
+    string ipathel;
+    getKeyValue(docdata, cstr_dj_keyipath, ipathel);
+    bool dofilter = !m_forPreview &&
+        (mimetype.compare(cstr_texthtml) || !ipathel.empty());
+    RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
     if (!newflt) {
 	// If we can't find a handler, this doc can't be handled
 	// but there can be other ones so we go on
-	LOGINFO("FileInterner::addHandler: no filter for ["  << (mimetype) << "]\n" );
+	LOGINFO("FileInterner::addHandler: no filter for ["  << mimetype <<
+                "]\n");
 	return ADD_CONTINUE;
     }
     newflt->set_property(Dijon::Filter::OPERATING_MODE, 
@@ -717,7 +729,8 @@ int FileInterner::addHandler()
 	}
     }
     if (!setres) {
-	LOGINFO("FileInterner::addHandler: set_doc failed inside "  << (m_fn) << "  for mtype "  << (mimetype) << "\n" );
+	LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
+                "  for mtype " << mimetype << "\n");
 	delete newflt;
 	if (m_forPreview)
 	    return ADD_ERROR;
@@ -725,7 +738,7 @@ int FileInterner::addHandler()
     }
     // add handler and go on, maybe this one will give us text...
     m_handlers.push_back(newflt);
-    LOGDEB1("FileInterner::addHandler: added\n" );
+    LOGDEB1("FileInterner::addHandler: added\n");
     return ADD_OK;
 }
 
@@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
     // performed. A common case would be an "Open" on an html file
     // (we'd end up with text/plain content). As the html version is
     // saved in this case, use it.  
-    if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
+    if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
         doc.text = get_html();
-        doc.mimetype = "text/html";
+        doc.mimetype = cstr_texthtml;
     }
 
     const char *filename;
diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp
index 534de783..6b302799 100644
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
 {
     // The default output mime type is html, but it may be defined
     // otherwise in the filter definition.
-    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : 
+    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml : 
 	cfgFilterOutputMtype;
 
     if (!m_forPreview && !m_nomd5) {
diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp
index 8ac1f150..b5c06fb2 100644
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
 	}
     } else {
 	// "Self" document.
-        m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
+        m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
         m_metaData.erase(cstr_dj_keyipath);
 	if (!m_forPreview) {
             m_metaData[cstr_dj_keymd5] = file_md5;
diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp
index e6b6c1f4..609b6c33 100644
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
 	MD5String("MimeHandlerText", id);
 	return nobuild ? 0 : new MimeHandlerText(config, id);
-    } else if ("text/html" == lmime) {
+    } else if (cstr_texthtml == lmime) {
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
 	MD5String("MimeHandlerHtml", id);
 	return nobuild ? 0 : new MimeHandlerHtml(config, id);
diff --git a/website/BUGS.html b/website/BUGS.html
index 516584b0..607b1920 100644
--- a/website/BUGS.html
+++ b/website/BUGS.html
@@ -32,6 +32,14 @@ versions.</i></p>
 <h2><a name="b_latest">recoll 1.23.2</a></h2>
 <ul>
 
+  <li>When indexedmimetypes is set and does not include text/html (or
+    if text/html is excluded by excludedmimetypes), PDF (and other)
+    contents will not be indexed because the file handler initially
+    produces text/html. The workaround is to include text/html in the
+    processed types, and maybe use suffix exclusion if you really
+    don't want to index html files. This is fixed in the development
+    code and will be in the next release.</li>
+  
   <li>The Recoll GUI configuration (things set from the <tt>GUI
       Configuration</tt> menu) is stored in a file
       named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is
diff --git a/website/pages/recoll-windows.txt b/website/pages/recoll-windows.txt
index 5b2716a2..5a3bfad2 100644
--- a/website/pages/recoll-windows.txt
+++ b/website/pages/recoll-windows.txt
@@ -3,7 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
 :date:
 
 :recollversion: 1.23.0-2017-01-07-78b8ad
-:windir: downwin-fa352
+:windir: downwin-adb3c
 
 image:recoll-windows10-thumb.png[link="recoll-windows10.png"]