From 19a4b2a287aa8ed2fad4a68773b0e66bb3ec55c8 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Thu, 8 Jun 2017 10:09:05 +0200
Subject: [PATCH] Do not filter out text/html when it results from a
conversion, even if excluded by indexedmimetypes/excludedmimetypes
---
src/common/cstr.h | 1 +
src/internfile/internfile.cpp | 31 ++++++++++++++++++++++---------
src/internfile/mh_exec.cpp | 2 +-
src/internfile/mh_execm.cpp | 2 +-
src/internfile/mimehandler.cpp | 2 +-
website/BUGS.html | 8 ++++++++
website/pages/recoll-windows.txt | 2 +-
7 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/src/common/cstr.h b/src/common/cstr.h
index 7f41099b..17dead32 100644
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
DEF_CSTR(null, "");
DEF_CSTR(plus, "+");
DEF_CSTR(textplain, "text/plain");
+DEF_CSTR(texthtml, "text/html");
DEF_CSTR(url, "url");
// Marker for HTML format fields
DEF_CSTR(fldhtm, "\007");
diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp
index 39790254..8ad3e115 100644
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -654,7 +654,8 @@ int FileInterner::addHandler()
getKeyValue(docdata, cstr_dj_keycharset, charset);
getKeyValue(docdata, cstr_dj_keymt, mimetype);
- LOGDEB("FileInterner::addHandler: next_doc is " << (mimetype) << " target [" << (m_targetMType) << "]\n" );
+ LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
+ " target [" << m_targetMType << "]\n");
// If we find a document of the target type (text/plain in
// general), we're done decoding. If we hit text/plain, we're done
@@ -662,7 +663,7 @@ int FileInterner::addHandler()
if (!stringicmp(mimetype, m_targetMType) ||
!stringicmp(mimetype, cstr_textplain)) {
m_reachedMType = mimetype;
- LOGDEB1("FileInterner::addHandler: target reached\n" );
+ LOGDEB1("FileInterner::addHandler: target reached\n");
return ADD_BREAK;
}
@@ -670,15 +671,26 @@ int FileInterner::addHandler()
if (m_handlers.size() >= MAXHANDLERS) {
// Stack too big. Skip this and go on to check if there is
// something else in the current back()
- LOGERR("FileInterner::addHandler: stack too high\n" );
+ LOGERR("FileInterner::addHandler: stack too high\n");
return ADD_CONTINUE;
}
- RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
+ // We must not filter out HTML when it is an intermediate
+ // conversion format. We discriminate between e.g. an HTML email
+ // attachment (needs filtering) and a result of pdf conversion
+ // (must process) by looking at the last ipath element: a
+ // conversion will have an empty one (same test as in
+ // collectIpathAndMT).
+ string ipathel;
+ getKeyValue(docdata, cstr_dj_keyipath, ipathel);
+ bool dofilter = !m_forPreview &&
+ (mimetype.compare(cstr_texthtml) || !ipathel.empty());
+ RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
if (!newflt) {
// If we can't find a handler, this doc can't be handled
// but there can be other ones so we go on
- LOGINFO("FileInterner::addHandler: no filter for [" << (mimetype) << "]\n" );
+ LOGINFO("FileInterner::addHandler: no filter for [" << mimetype <<
+ "]\n");
return ADD_CONTINUE;
}
newflt->set_property(Dijon::Filter::OPERATING_MODE,
@@ -717,7 +729,8 @@ int FileInterner::addHandler()
}
}
if (!setres) {
- LOGINFO("FileInterner::addHandler: set_doc failed inside " << (m_fn) << " for mtype " << (mimetype) << "\n" );
+ LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
+ " for mtype " << mimetype << "\n");
delete newflt;
if (m_forPreview)
return ADD_ERROR;
@@ -725,7 +738,7 @@ int FileInterner::addHandler()
}
// add handler and go on, maybe this one will give us text...
m_handlers.push_back(newflt);
- LOGDEB1("FileInterner::addHandler: added\n" );
+ LOGDEB1("FileInterner::addHandler: added\n");
return ADD_OK;
}
@@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
// performed. A common case would be an "Open" on an html file
// (we'd end up with text/plain content). As the html version is
// saved in this case, use it.
- if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
+ if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
doc.text = get_html();
- doc.mimetype = "text/html";
+ doc.mimetype = cstr_texthtml;
}
const char *filename;
diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp
index 534de783..6b302799 100644
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
{
// The default output mime type is html, but it may be defined
// otherwise in the filter definition.
- m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
+ m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml :
cfgFilterOutputMtype;
if (!m_forPreview && !m_nomd5) {
diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp
index 8ac1f150..b5c06fb2 100644
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
}
} else {
// "Self" document.
- m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
+ m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
m_metaData.erase(cstr_dj_keyipath);
if (!m_forPreview) {
m_metaData[cstr_dj_keymd5] = file_md5;
diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp
index e6b6c1f4..609b6c33 100644
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
MD5String("MimeHandlerText", id);
return nobuild ? 0 : new MimeHandlerText(config, id);
- } else if ("text/html" == lmime) {
+ } else if (cstr_texthtml == lmime) {
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
MD5String("MimeHandlerHtml", id);
return nobuild ? 0 : new MimeHandlerHtml(config, id);
diff --git a/website/BUGS.html b/website/BUGS.html
index 516584b0..607b1920 100644
--- a/website/BUGS.html
+++ b/website/BUGS.html
@@ -32,6 +32,14 @@ versions.
+ - When indexedmimetypes is set and does not include text/html (or
+ if text/html is excluded by excludedmimetypes), the contents of PDF
+ (and other) documents will not be indexed, because the file handler
+ initially produces text/html. The workaround is to include text/html
+ in the processed types, and maybe use suffix exclusion if you really
+ don't want to index html files. This is fixed in the development
+ code and will be in the next release.
+
- The Recoll GUI configuration (things set from the GUI
Configuration menu) is stored in a file
named ~/.config/Recoll.org/recoll.conf, which is
diff --git a/website/pages/recoll-windows.txt b/website/pages/recoll-windows.txt
index 5b2716a2..5a3bfad2 100644
--- a/website/pages/recoll-windows.txt
+++ b/website/pages/recoll-windows.txt
@@ -3,7 +3,7 @@ Jean-Francois Dockes
:date:
:recollversion: 1.23.0-2017-01-07-78b8ad
-:windir: downwin-fa352
+:windir: downwin-adb3c
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]