HTML indexing: use the same size limit as for text files (textfilemaxmbs). Gigantic files are sometimes misidentified as HTML by xdg-mime.

This commit is contained in:
Jean-Francois Dockes 2021-04-11 19:38:07 +02:00
parent 3031b82a40
commit 0c021b5236

View File

@ -28,6 +28,8 @@
#include "smallut.h" #include "smallut.h"
#include "rclutil.h" #include "rclutil.h"
#include "md5ut.h" #include "md5ut.h"
#include "pathut.h"
#include "rclconfig.h"
#include <iostream> #include <iostream>
@ -35,19 +37,36 @@ using namespace std;
bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn) bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn)
{ {
LOGDEB0("textHtmlToDoc: " << fn << "\n"); LOGDEB0("MimeHandlerHtml::set_document_file_impl: " << fn << "\n");
string otext;
string reason; // Check file size against limit. We use the same value as for
if (!file_to_string(fn, otext, &reason)) { // text/plain. xdg-mime sometimes wrongly returns text/html for
LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n"); // gigantic files (had a case with multi-GB xxx.enex evernote
// export files).
int maxmbs = -1;
m_config->getConfParam("textfilemaxmbs", &maxmbs);
auto totlen = path_filesize(fn);
if (totlen < 0) {
LOGSYSERR("MimeHandlerHtml::set_document_file", "stat", fn);
return false; return false;
} }
string otext;
if (maxmbs != -1 && totlen / (1024*1024) > maxmbs) {
LOGINF("MimeHandlerHtml: file too big (textfilemaxmbs=" << maxmbs <<
"), contents will not be indexed: " << fn << "\n");
} else {
string reason;
if (!file_to_string(fn, otext, &reason)) {
LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n");
return false;
}
}
m_filename = fn; m_filename = fn;
return set_document_string(mt, otext); return set_document_string(mt, otext);
} }
bool MimeHandlerHtml::set_document_string_impl(const string&, bool MimeHandlerHtml::set_document_string_impl(const string&, const string& htext)
const string& htext)
{ {
m_html = htext; m_html = htext;
m_havedoc = true; m_havedoc = true;
@ -71,14 +90,12 @@ bool MimeHandlerHtml::next_document()
m_filename.erase(); m_filename.erase();
string charset = m_dfltInputCharset; string charset = m_dfltInputCharset;
LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset << "]\n");
<< "]\n");
// Override default input charset if someone took care to set one: // Override default input charset if someone took care to set one:
const auto it = m_metaData.find(cstr_dj_keycharset); const auto it = m_metaData.find(cstr_dj_keycharset);
if (it != m_metaData.end() && !it->second.empty()) { if (it != m_metaData.end() && !it->second.empty()) {
charset = it->second; charset = it->second;
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << charset << "]\n");
charset << "]\n");
} }
// - We first try to convert from the supposed charset // - We first try to convert from the supposed charset
@ -98,8 +115,7 @@ bool MimeHandlerHtml::next_document()
int ecnt; int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
LOGDEB("textHtmlToDoc: transcode failed from cs '" << LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << "]");
"]");
transcoded = m_html; transcoded = m_html;
// We don't know the charset, at all // We don't know the charset, at all
p.reset_charsets(); p.reset_charsets();
@ -149,7 +165,7 @@ bool MimeHandlerHtml::next_document()
break; break;
} }
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<< LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset [" <<
result.get_charset() << "]\n"); result.get_charset() << "]\n");
if (!result.get_charset().empty() && if (!result.get_charset().empty() &&
!samecharset(result.get_charset(), result.fromcharset)) { !samecharset(result.get_charset(), result.fromcharset)) {