From 0c021b5236029b7fc053f8f4c17f117f7c2134a5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 11 Apr 2021 19:38:07 +0200 Subject: [PATCH] HTML indexing: use the same size limit as for text files (textfilemaxmbs). Some gigantic files are sometimes mis-identified as HTML by xdg-mime --- src/internfile/mh_html.cpp | 44 ++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index e64a3eae..be333ade 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -28,6 +28,8 @@ #include "smallut.h" #include "rclutil.h" #include "md5ut.h" +#include "pathut.h" +#include "rclconfig.h" #include @@ -35,19 +37,36 @@ using namespace std; bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn) { - LOGDEB0("textHtmlToDoc: " << fn << "\n"); - string otext; - string reason; - if (!file_to_string(fn, otext, &reason)) { - LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n"); + LOGDEB0("MimeHandlerHtml::set_document_file_impl: " << fn << "\n"); + + // Check file size against limit. We use the same value as for + // text/plain. xdg-mime sometimes wrongly returns text/html for + // gigantic files (had a case with multi-GB xxx.enex evernote + // export files). + int maxmbs = -1; + m_config->getConfParam("textfilemaxmbs", &maxmbs); + auto totlen = path_filesize(fn); + if (totlen < 0) { + LOGSYSERR("MimeHandlerHtml::set_document_file", "stat", fn); return false; } + + string otext; + if (maxmbs != -1 && totlen / (1024*1024) > maxmbs) { + LOGINF("MimeHandlerHtml: file too big (textfilemaxmbs=" << maxmbs << + "), contents will not be indexed: " << fn << "\n"); + } else { + string reason; + if (!file_to_string(fn, otext, &reason)) { + LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n"); + return false; + } + } m_filename = fn; return set_document_string(mt, otext); } -bool MimeHandlerHtml::set_document_string_impl(const string&, - const string& htext) +bool MimeHandlerHtml::set_document_string_impl(const string&, const string& htext) { m_html = htext; m_havedoc = true; @@ -71,14 +90,12 @@ bool MimeHandlerHtml::next_document() m_filename.erase(); string charset = m_dfltInputCharset; - LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset - << "]\n"); + LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset << "]\n"); // Override default input charset if someone took care to set one: const auto it = m_metaData.find(cstr_dj_keycharset); if (it != m_metaData.end() && !it->second.empty()) { charset = it->second; - LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << - charset << "]\n"); + LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << charset << "]\n"); } // - We first try to convert from the supposed charset @@ -98,8 +115,7 @@ bool MimeHandlerHtml::next_document() int ecnt; if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { LOGDEB("textHtmlToDoc: transcode failed from cs '" << - charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << - "]"); + charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << "]"); transcoded = m_html; // We don't know the charset, at all p.reset_charsets(); @@ -149,7 +165,7 @@ bool MimeHandlerHtml::next_document() break; } - LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<< + LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset [" << result.get_charset() << "]\n"); if (!result.get_charset().empty() && !samecharset(result.get_charset(), result.fromcharset)) {