Do not filter out text/html when it results from a conversion, even if excluded by indexedmimetypes/excludedmimetypes

This commit is contained in:
Jean-Francois Dockes 2017-06-08 10:09:05 +02:00
parent 65387963ed
commit 19a4b2a287
7 changed files with 35 additions and 13 deletions

View File

@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
DEF_CSTR(null, "");
DEF_CSTR(plus, "+");
DEF_CSTR(textplain, "text/plain");
DEF_CSTR(texthtml, "text/html");
DEF_CSTR(url, "url");
// Marker for HTML format fields
DEF_CSTR(fldhtm, "\007");

View File

@ -654,7 +654,8 @@ int FileInterner::addHandler()
getKeyValue(docdata, cstr_dj_keycharset, charset);
getKeyValue(docdata, cstr_dj_keymt, mimetype);
LOGDEB("FileInterner::addHandler: next_doc is " << (mimetype) << " target [" << (m_targetMType) << "]\n" );
LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
" target [" << m_targetMType << "]\n");
// If we find a document of the target type (text/plain in
// general), we're done decoding. If we hit text/plain, we're done
@ -662,7 +663,7 @@ int FileInterner::addHandler()
if (!stringicmp(mimetype, m_targetMType) ||
!stringicmp(mimetype, cstr_textplain)) {
m_reachedMType = mimetype;
LOGDEB1("FileInterner::addHandler: target reached\n" );
LOGDEB1("FileInterner::addHandler: target reached\n");
return ADD_BREAK;
}
@ -670,15 +671,26 @@ int FileInterner::addHandler()
if (m_handlers.size() >= MAXHANDLERS) {
// Stack too big. Skip this and go on to check if there is
// something else in the current back()
LOGERR("FileInterner::addHandler: stack too high\n" );
LOGERR("FileInterner::addHandler: stack too high\n");
return ADD_CONTINUE;
}
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
// We must not filter out HTML when it is an intermediate
// conversion format. We discriminate between e.g. an HTML email
// attachment (needs filtering) and a result of pdf conversion
// (must process) by looking at the last ipath element: a
// conversion will have an empty one (same test as in
// collectIpathAndMT).
string ipathel;
getKeyValue(docdata, cstr_dj_keyipath, ipathel);
bool dofilter = !m_forPreview &&
(mimetype.compare(cstr_texthtml) || !ipathel.empty());
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
if (!newflt) {
// If we can't find a handler, this doc can't be handled
// but there can be other ones so we go on
LOGINFO("FileInterner::addHandler: no filter for [" << (mimetype) << "]\n" );
LOGINFO("FileInterner::addHandler: no filter for [" << mimetype <<
"]\n");
return ADD_CONTINUE;
}
newflt->set_property(Dijon::Filter::OPERATING_MODE,
@ -717,7 +729,8 @@ int FileInterner::addHandler()
}
}
if (!setres) {
LOGINFO("FileInterner::addHandler: set_doc failed inside " << (m_fn) << " for mtype " << (mimetype) << "\n" );
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
" for mtype " << mimetype << "\n");
delete newflt;
if (m_forPreview)
return ADD_ERROR;
@ -725,7 +738,7 @@ int FileInterner::addHandler()
}
// add handler and go on, maybe this one will give us text...
m_handlers.push_back(newflt);
LOGDEB1("FileInterner::addHandler: added\n" );
LOGDEB1("FileInterner::addHandler: added\n");
return ADD_OK;
}
@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
// performed. A common case would be an "Open" on an html file
// (we'd end up with text/plain content). As the html version is
// saved in this case, use it.
if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
doc.text = get_html();
doc.mimetype = "text/html";
doc.mimetype = cstr_texthtml;
}
const char *filename;

View File

@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
{
// The default output mime type is html, but it may be defined
// otherwise in the filter definition.
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml :
cfgFilterOutputMtype;
if (!m_forPreview && !m_nomd5) {

View File

@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
}
} else {
// "Self" document.
m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
m_metaData.erase(cstr_dj_keyipath);
if (!m_forPreview) {
m_metaData[cstr_dj_keymd5] = file_md5;

View File

@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
MD5String("MimeHandlerText", id);
return nobuild ? 0 : new MimeHandlerText(config, id);
} else if ("text/html" == lmime) {
} else if (cstr_texthtml == lmime) {
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
MD5String("MimeHandlerHtml", id);
return nobuild ? 0 : new MimeHandlerHtml(config, id);

View File

@ -32,6 +32,14 @@ versions.</i></p>
<h2><a name="b_latest">recoll 1.23.2</a></h2>
<ul>
<li>When indexedmimetypes is set and does not include text/html (or
if text/html is excluded by excludedmimetypes), PDF (and other)
contents will not be indexed because the file handler initially
produces text/html. The workaround is to include text/html in the
processed types, and maybe use suffix exclusion if you really
don't want to index html files. This is fixed in the development code
and will be in the next release.</li>
<li>The Recoll GUI configuration (things set from the <tt>GUI
Configuration</tt> menu) is stored in a file
named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is

View File

@ -3,7 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
:date:
:recollversion: 1.23.0-2017-01-07-78b8ad
:windir: downwin-fa352
:windir: downwin-adb3c
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]