Do not filter out text/html when it results from a conversion, even if excluded by indexedmimetypes/excludedmimetypes
This commit is contained in:
parent
65387963ed
commit
19a4b2a287
@ -55,6 +55,7 @@ DEF_CSTR(newline, "\n");
|
|||||||
DEF_CSTR(null, "");
|
DEF_CSTR(null, "");
|
||||||
DEF_CSTR(plus, "+");
|
DEF_CSTR(plus, "+");
|
||||||
DEF_CSTR(textplain, "text/plain");
|
DEF_CSTR(textplain, "text/plain");
|
||||||
|
DEF_CSTR(texthtml, "text/html");
|
||||||
DEF_CSTR(url, "url");
|
DEF_CSTR(url, "url");
|
||||||
// Marker for HTML format fields
|
// Marker for HTML format fields
|
||||||
DEF_CSTR(fldhtm, "\007");
|
DEF_CSTR(fldhtm, "\007");
|
||||||
|
|||||||
@ -654,7 +654,8 @@ int FileInterner::addHandler()
|
|||||||
getKeyValue(docdata, cstr_dj_keycharset, charset);
|
getKeyValue(docdata, cstr_dj_keycharset, charset);
|
||||||
getKeyValue(docdata, cstr_dj_keymt, mimetype);
|
getKeyValue(docdata, cstr_dj_keymt, mimetype);
|
||||||
|
|
||||||
LOGDEB("FileInterner::addHandler: next_doc is " << (mimetype) << " target [" << (m_targetMType) << "]\n" );
|
LOGDEB("FileInterner::addHandler: next_doc is " << mimetype <<
|
||||||
|
" target [" << m_targetMType << "]\n");
|
||||||
|
|
||||||
// If we find a document of the target type (text/plain in
|
// If we find a document of the target type (text/plain in
|
||||||
// general), we're done decoding. If we hit text/plain, we're done
|
// general), we're done decoding. If we hit text/plain, we're done
|
||||||
@ -662,7 +663,7 @@ int FileInterner::addHandler()
|
|||||||
if (!stringicmp(mimetype, m_targetMType) ||
|
if (!stringicmp(mimetype, m_targetMType) ||
|
||||||
!stringicmp(mimetype, cstr_textplain)) {
|
!stringicmp(mimetype, cstr_textplain)) {
|
||||||
m_reachedMType = mimetype;
|
m_reachedMType = mimetype;
|
||||||
LOGDEB1("FileInterner::addHandler: target reached\n" );
|
LOGDEB1("FileInterner::addHandler: target reached\n");
|
||||||
return ADD_BREAK;
|
return ADD_BREAK;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -670,15 +671,26 @@ int FileInterner::addHandler()
|
|||||||
if (m_handlers.size() >= MAXHANDLERS) {
|
if (m_handlers.size() >= MAXHANDLERS) {
|
||||||
// Stack too big. Skip this and go on to check if there is
|
// Stack too big. Skip this and go on to check if there is
|
||||||
// something else in the current back()
|
// something else in the current back()
|
||||||
LOGERR("FileInterner::addHandler: stack too high\n" );
|
LOGERR("FileInterner::addHandler: stack too high\n");
|
||||||
return ADD_CONTINUE;
|
return ADD_CONTINUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, !m_forPreview);
|
// We must not filter out HTML when it is an intermediate
|
||||||
|
// conversion format. We discriminate between e.g. an HTML email
|
||||||
|
// attachment (needs filtering) and a result of pdf conversion
|
||||||
|
// (must process) by looking at the last ipath element: a
|
||||||
|
// conversion will have an empty one (same test as in
|
||||||
|
// collectIpathAndMT).
|
||||||
|
string ipathel;
|
||||||
|
getKeyValue(docdata, cstr_dj_keyipath, ipathel);
|
||||||
|
bool dofilter = !m_forPreview &&
|
||||||
|
(mimetype.compare(cstr_texthtml) || !ipathel.empty());
|
||||||
|
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter);
|
||||||
if (!newflt) {
|
if (!newflt) {
|
||||||
// If we can't find a handler, this doc can't be handled
|
// If we can't find a handler, this doc can't be handled
|
||||||
// but there can be other ones so we go on
|
// but there can be other ones so we go on
|
||||||
LOGINFO("FileInterner::addHandler: no filter for [" << (mimetype) << "]\n" );
|
LOGINFO("FileInterner::addHandler: no filter for [" << mimetype <<
|
||||||
|
"]\n");
|
||||||
return ADD_CONTINUE;
|
return ADD_CONTINUE;
|
||||||
}
|
}
|
||||||
newflt->set_property(Dijon::Filter::OPERATING_MODE,
|
newflt->set_property(Dijon::Filter::OPERATING_MODE,
|
||||||
@ -717,7 +729,8 @@ int FileInterner::addHandler()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!setres) {
|
if (!setres) {
|
||||||
LOGINFO("FileInterner::addHandler: set_doc failed inside " << (m_fn) << " for mtype " << (mimetype) << "\n" );
|
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
|
||||||
|
" for mtype " << mimetype << "\n");
|
||||||
delete newflt;
|
delete newflt;
|
||||||
if (m_forPreview)
|
if (m_forPreview)
|
||||||
return ADD_ERROR;
|
return ADD_ERROR;
|
||||||
@ -725,7 +738,7 @@ int FileInterner::addHandler()
|
|||||||
}
|
}
|
||||||
// add handler and go on, maybe this one will give us text...
|
// add handler and go on, maybe this one will give us text...
|
||||||
m_handlers.push_back(newflt);
|
m_handlers.push_back(newflt);
|
||||||
LOGDEB1("FileInterner::addHandler: added\n" );
|
LOGDEB1("FileInterner::addHandler: added\n");
|
||||||
return ADD_OK;
|
return ADD_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1003,9 +1016,9 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
|
|||||||
// performed. A common case would be an "Open" on an html file
|
// performed. A common case would be an "Open" on an html file
|
||||||
// (we'd end up with text/plain content). As the html version is
|
// (we'd end up with text/plain content). As the html version is
|
||||||
// saved in this case, use it.
|
// saved in this case, use it.
|
||||||
if (!stringlowercmp("text/html", mimetype) && !get_html().empty()) {
|
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
|
||||||
doc.text = get_html();
|
doc.text = get_html();
|
||||||
doc.mimetype = "text/html";
|
doc.mimetype = cstr_texthtml;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *filename;
|
const char *filename;
|
||||||
|
|||||||
@ -235,7 +235,7 @@ void MimeHandlerExec::finaldetails()
|
|||||||
{
|
{
|
||||||
// The default output mime type is html, but it may be defined
|
// The default output mime type is html, but it may be defined
|
||||||
// otherwise in the filter definition.
|
// otherwise in the filter definition.
|
||||||
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
|
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? cstr_texthtml :
|
||||||
cfgFilterOutputMtype;
|
cfgFilterOutputMtype;
|
||||||
|
|
||||||
if (!m_forPreview && !m_nomd5) {
|
if (!m_forPreview && !m_nomd5) {
|
||||||
|
|||||||
@ -320,7 +320,7 @@ bool MimeHandlerExecMultiple::next_document()
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// "Self" document.
|
// "Self" document.
|
||||||
m_metaData[cstr_dj_keymt] = mtype.empty() ? "text/html" : mtype;
|
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
|
||||||
m_metaData.erase(cstr_dj_keyipath);
|
m_metaData.erase(cstr_dj_keyipath);
|
||||||
if (!m_forPreview) {
|
if (!m_forPreview) {
|
||||||
m_metaData[cstr_dj_keymd5] = file_md5;
|
m_metaData[cstr_dj_keymd5] = file_md5;
|
||||||
|
|||||||
@ -147,7 +147,7 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
|
|||||||
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
|
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
|
||||||
MD5String("MimeHandlerText", id);
|
MD5String("MimeHandlerText", id);
|
||||||
return nobuild ? 0 : new MimeHandlerText(config, id);
|
return nobuild ? 0 : new MimeHandlerText(config, id);
|
||||||
} else if ("text/html" == lmime) {
|
} else if (cstr_texthtml == lmime) {
|
||||||
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
|
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerHtml\n");
|
||||||
MD5String("MimeHandlerHtml", id);
|
MD5String("MimeHandlerHtml", id);
|
||||||
return nobuild ? 0 : new MimeHandlerHtml(config, id);
|
return nobuild ? 0 : new MimeHandlerHtml(config, id);
|
||||||
|
|||||||
@ -32,6 +32,14 @@ versions.</i></p>
|
|||||||
<h2><a name="b_latest">recoll 1.23.2</a></h2>
|
<h2><a name="b_latest">recoll 1.23.2</a></h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
|
||||||
|
<li>When indexedmimetypes is set and does not include text/html (or
|
||||||
|
if text/html is excluded by excludedmimetypes), PDF (and other)
|
||||||
|
contents will not be indexed because the file handler initially
|
||||||
|
produces text/html. The workaround is to include text/html in the
|
||||||
|
processed types, and maybe use suffix exclusion if you really
|
||||||
|
don't want to index html files. This is fixed in the development code
|
||||||
|
and will be in the next release.</li>
|
||||||
|
|
||||||
<li>The Recoll GUI configuration (things set from the <tt>GUI
|
<li>The Recoll GUI configuration (things set from the <tt>GUI
|
||||||
Configuration</tt> menu) is stored in a file
|
Configuration</tt> menu) is stored in a file
|
||||||
named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is
|
named <tt>~/.config/Recoll.org/recoll.conf</tt>, which is
|
||||||
|
|||||||
@ -3,7 +3,7 @@ Jean-Francois Dockes <jf at dockes.org>
|
|||||||
:date:
|
:date:
|
||||||
|
|
||||||
:recollversion: 1.23.0-2017-01-07-78b8ad
|
:recollversion: 1.23.0-2017-01-07-78b8ad
|
||||||
:windir: downwin-fa352
|
:windir: downwin-adb3c
|
||||||
|
|
||||||
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]
|
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user