Do not filter out text/html when it results from a conversion, even if excluded by indexedmimetypes/excludedmimetypes
This commit is contained in:
parent
65387963ed
commit
19a4b2a287
@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
|
||||
DEF_CSTR(null, "");
|
||||
DEF_CSTR(plus, "+");
|
||||
DEF_CSTR(textplain, "text/plain");
|
||||
DEF_CSTR(texthtml, "text/html");
|
||||
DEF_CSTR(url, "url");
|
||||
// Marker for HTML format fields
|
||||
DEF_CSTR(fldhtm, "\007");
|
||||
|
||||
@ -654,7 +654,8 @@ int FileInterner::addHandler()
|
||||
getKeyValue(docdata, cstr_dj_keycharset, charset);
|
||||
getKeyValue(docdata, cstr_dj_keymt, mimetype);
|
||||
|
||||
LOGDEB("FileInterner::addHandler: next_doc is " << (mimetype) << " target [" << (m_targetMType) << "]\n" );
|
||||
LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
|
||||
" target [" << m_targetMType << "]\n");
|
||||
|
||||
// If we find a document of the target type (text/plain in
|
||||
// general), we're done decoding. If we hit text/plain, we're done
|
||||
@ -662,7 +663,7 @@ int FileInterner::addHandler()
|
||||
if (!stringicmp(mimetype, m_targetMType) ||
|
||||
!stringicmp(mimetype, cstr_textplain)) {
|
||||
m_reachedMType = mimetype;
|
||||
LOGDEB1("FileInterner::addHandler: target reached\n" );
|
||||
LOGDEB1("FileInterner::addHandler: target reached\n");
|
||||
return ADD_BREAK;
|
||||
}
|
||||
|
||||
@ -670,15 +671,26 @@ int FileInterner::addHandler()
|
||||
if (m_handlers.size() >= MAXHANDLERS) {
|
||||
// Stack too big. Skip this and go on to check if there is
|
||||
// something else in the current back()
|
||||
LOGERR("FileInterner::addHandler: stack too high\n" );
|
||||
LOGERR("FileInterner::addHandler: stack too high\n");
|
||||
return ADD_CONTINUE;
|
||||
}
|
||||
|
||||
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
|
||||
// We must not filter out HTML when it is an intermediate
|
||||
// conversion format. We discriminate between e.g. an HTML email
|
||||
// attachment (needs filtering) and a result of pdf conversion
|
||||
// (must process) by looking at the last ipath element: a
|
||||
// conversion will have an empty one (same test as in
|
||||
// collectIpathAndMT).
|
||||
string ipathel;
|
||||
getKeyValue(docdata, cstr_dj_keyipath, ipathel);
|
||||
bool dofilter = !m_forPreview &&
|
||||
(mimetype.compare(cstr_texthtml) || !ipathel.empty());
|
||||
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
|
||||
if (!newflt) {
|
||||
// If we can't find a handler, this doc can't be handled
|
||||
// but there can be other ones so we go on
|
||||
LOGINFO("FileInterner::addHandler: no filter for [" << (mimetype) << "]\n" );
|
||||
LOGINFO("FileInterner::addHandler: no filter for [" << mimetype <<
|
||||
"]\n");
|
||||
return ADD_CONTINUE;
|
||||
}
|
||||
newflt->set_property(Dijon::Filter::OPERATING_MODE,
|
||||
@ -717,7 +729,8 @@ int FileInterner::addHandler()
|
||||
}
|
||||
}
|
||||
if (!setres) {
|
||||
LOGINFO("FileInterner::addHandler: set_doc failed inside " << (m_fn) << " for mtype " << (mimetype) << "\n" );
|
||||
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
|
||||
" for mtype " << mimetype << "\n");
|
||||
delete newflt;
|
||||
if (m_forPreview)
|
||||
return ADD_ERROR;
|
||||
@ -725,7 +738,7 @@ int FileInterner::addHandler()
|
||||
}
|
||||
// add handler and go on, maybe this one will give us text...
|
||||
m_handlers.push_back(newflt);
|
||||
LOGDEB1("FileInterner::addHandler: added\n" );
|
||||
LOGDEB1("FileInterner::addHandler: added\n");
|
||||
return ADD_OK;
|
||||
}
|
||||
|
||||
@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
|
||||
// performed. A common case would be an "Open" on an html file
|
||||
// (we'd end up with text/plain content). As the html version is
|
||||
// saved in this case, use it.
|
||||
if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
|
||||
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
|
||||
doc.text = get_html();
|
||||
doc.mimetype = "text/html";
|
||||
doc.mimetype = cstr_texthtml;
|
||||
}
|
||||
|
||||
const char *filename;
|
||||
|
||||
@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
|
||||
{
|
||||
// The default output mime type is html, but it may be defined
|
||||
// otherwise in the filter definition.
|
||||
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml :
|
||||
cfgFilterOutputMtype;
|
||||
|
||||
if (!m_forPreview && !m_nomd5) {
|
||||
|
||||
@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
}
|
||||
} else {
|
||||
// "Self" document.
|
||||
m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
|
||||
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
|
||||
m_metaData.erase(cstr_dj_keyipath);
|
||||
if (!m_forPreview) {
|
||||
m_metaData[cstr_dj_keymd5] = file_md5;
|
||||
|
||||
@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
|
||||
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
|
||||
MD5String("MimeHandlerText", id);
|
||||
return nobuild ? 0 : new MimeHandlerText(config, id);
|
||||
} else if ("text/html" == lmime) {
|
||||
} else if (cstr_texthtml == lmime) {
|
||||
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
|
||||
MD5String("MimeHandlerHtml", id);
|
||||
return nobuild ? 0 : new MimeHandlerHtml(config, id);
|
||||
|
||||
@ -32,6 +32,14 @@ versions.</i></p>
|
||||
<h2><a name="b_latest">recoll 1.23.2</a></h2>
|
||||
<ul>
|
||||
|
||||
<li>When indexedmimetypes is set and does not include text/html (or
|
||||
if text/html is excluded by excludedmimetypes), PDF (and other)
|
||||
contents will not be indexed because the file handler initially
|
||||
produces text/html. The workaround is to include text/html in the
|
||||
processed types, and maybe use suffix exclusion if you really
|
||||
don't want to index html files. This is fixed in the development code and
|
||||
will be in the next release.</li>
|
||||
|
||||
<li>The Recoll GUI configuration (things set from the <tt>GUI
|
||||
Configuration</tt> menu) is stored in a file
|
||||
named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is
|
||||
|
||||
@ -3,7 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
|
||||
:date:
|
||||
|
||||
:recollversion: 1.23.0-2017-01-07-78b8ad
|
||||
:windir: downwin-fa352
|
||||
:windir: downwin-adb3c
|
||||
|
||||
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user