diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index e593c5ee..ab3240a6 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -1,4 +1,4 @@ - /* Copyright (C) 2005 J.F.Dockes +/* Copyright (C) 2005 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -38,10 +38,10 @@ bool MimeHandlerExecMultiple::startCmd() { LOGDEB("MimeHandlerExecMultiple::startCmd\n"); if (params.empty()) { - // Hu ho - LOGERR("MHExecMultiple::startCmd: empty params\n"); - m_reason = "RECFILTERROR BADCONFIG"; - return false; + // Hu ho + LOGERR("MHExecMultiple::startCmd: empty params\n"); + m_reason = "RECFILTERROR BADCONFIG"; + return false; } // Command name @@ -55,7 +55,7 @@ bool MimeHandlerExecMultiple::startCmd() m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir()); m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" : - "RECOLL_FILTER_FORPREVIEW=no"); + "RECOLL_FILTER_FORPREVIEW=no"); m_cmd.setrlimit_as(m_filtermaxmbytes); m_adv.setmaxsecs(m_filtermaxseconds); @@ -156,11 +156,11 @@ bool MimeHandlerExecMultiple::next_document() { LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n"); if (m_havedoc == false) - return false; + return false; if (missingHelper) { - LOGDEB("MHExecMultiple::next_document(): helper known missing\n"); - return false; + LOGDEB("MHExecMultiple::next_document(): helper known missing\n"); + return false; } if (m_cmd.getChildPid() <= 0 && !startCmd()) { @@ -178,15 +178,15 @@ bool MimeHandlerExecMultiple::next_document() ostringstream obuf; string file_md5; if (m_filefirst) { - if (!m_forPreview && !m_nomd5) { - string md5, xmd5, reason; - if (MD5File(m_fn, md5, &reason)) { - file_md5 = MD5HexPrint(md5, xmd5); - } else { - LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn << + if (!m_forPreview && !m_nomd5) { + string md5, xmd5, reason; + if (MD5File(m_fn, md5, &reason)) { + file_md5 = MD5HexPrint(md5, xmd5); + } else { + LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn << "]: " << reason << "\n"); - } - } + } + } obuf << "FileName: " << m_fn.length() << "\n" << m_fn; // m_filefirst is set to true by set_document_file() m_filefirst = false; @@ -194,13 +194,13 @@ bool MimeHandlerExecMultiple::next_document() obuf << "Filename: " << 0 << "\n"; } if (!m_ipath.empty()) { - LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" << + LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" << m_ipath << "]\n"); obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath; } if (!m_dfltInputCharset.empty()) { obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n" - << m_dfltInputCharset; + << m_dfltInputCharset; } obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType; obuf << "\n"; @@ -247,10 +247,10 @@ bool MimeHandlerExecMultiple::next_document() eofnow_received = true; } else if (!stringlowercmp("fileerror:", name)) { LOGDEB("MHExecMultiple: got FILEERROR\n"); - fileerror_received = true; + fileerror_received = true; } else if (!stringlowercmp("subdocerror:", name)) { LOGDEB("MHExecMultiple: got SUBDOCERROR\n"); - subdocerror_received = true; + subdocerror_received = true; } else if (!stringlowercmp("ipath:", name)) { ipath = data; LOGDEB("MHExecMultiple: got ipath [" << data << "]\n"); @@ -264,7 +264,11 @@ bool MimeHandlerExecMultiple::next_document() string nm = stringtolower((const string&)name); trimstring(nm, ":"); LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n"); - m_metaData[nm] += data; + auto it = m_metaData.find(nm); + if (it == m_metaData.end() || + it->second.find(data) == std::string::npos) { + m_metaData[nm] += data; + } } if (loop == 200) { // ?? @@ -279,7 +283,7 @@ bool MimeHandlerExecMultiple::next_document() return false; } if (subdocerror_received) { - return false; + return false; } // It used to be that eof could be signalled just by an empty document, but @@ -291,13 +295,13 @@ bool MimeHandlerExecMultiple::next_document() } if (!ipath.empty()) { - // If this has an ipath, it is an internal doc from a - // multi-document file. In this case, either the filter - // supplies the mimetype, or the ipath MUST be a filename-like - // string which we can use to compute a mime type + // If this has an ipath, it is an internal doc from a + // multi-document file. In this case, either the filter + // supplies the mimetype, or the ipath MUST be a filename-like + // string which we can use to compute a mime type m_metaData[cstr_dj_keyipath] = ipath; if (mtype.empty()) { - LOGDEB0("MHExecMultiple: no mime type from filter, using ipath " + LOGDEB0("MHExecMultiple: no mime type from filter, using ipath " "for a guess\n"); mtype = mimetype(ipath, 0, m_config, false); if (mtype.empty()) { @@ -313,16 +317,16 @@ bool MimeHandlerExecMultiple::next_document() } } m_metaData[cstr_dj_keymt] = mtype; - if (!m_forPreview) { - string md5, xmd5; - MD5String(m_metaData[cstr_dj_keycontent], md5); - m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); - } + if (!m_forPreview) { + string md5, xmd5; + MD5String(m_metaData[cstr_dj_keycontent], md5); + m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + } } else { - // "Self" document. + // "Self" document. m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype; m_metaData.erase(cstr_dj_keyipath); - if (!m_forPreview) { + if (!m_forPreview) { m_metaData[cstr_dj_keymd5] = file_md5; } } @@ -339,4 +343,3 @@ bool MimeHandlerExecMultiple::next_document() LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString()); return true; } - diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index a212693b..4cc81ba9 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -38,7 +38,7 @@ bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn) string reason; if (!file_to_string(fn, otext, &reason)) { LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n"); - return false; + return false; } m_filename = fn; return set_document_string(mt, otext); @@ -51,10 +51,10 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt, m_havedoc = true; if (!m_forPreview) { - // We want to compute the md5 now because we may modify m_html later - string md5, xmd5; - MD5String(htext, md5); - m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + // We want to compute the md5 now because we may modify m_html later + string md5, xmd5; + MD5String(htext, md5); + m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); } return true; } @@ -62,7 +62,7 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt, bool MimeHandlerHtml::next_document() { if (m_havedoc == false) - return false; + return false; m_havedoc = false; // If set_doc(fn), take note of file name. string fn = m_filename; @@ -70,12 +70,12 @@ bool MimeHandlerHtml::next_document() string charset = m_dfltInputCharset; LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset - << "]\n"); + << "]\n"); // Override default input charset if someone took care to set one: map::const_iterator it = m_metaData.find(cstr_dj_keycharset); if (it != m_metaData.end() && !it->second.empty()) { - charset = it->second; - LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << + charset = it->second; + LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << charset << "]\n"); } @@ -88,78 +88,78 @@ bool MimeHandlerHtml::next_document() MyHtmlParser result; for (int pass = 0; pass < 2; pass++) { - string transcoded; - LOGDEB("Html::mkDoc: pass " << pass << "\n"); - MyHtmlParser p; + string transcoded; + LOGDEB("Html::mkDoc: pass " << pass << "\n"); + MyHtmlParser p; - // Try transcoding. If it fails, use original text. - int ecnt; - if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { - LOGDEB("textHtmlToDoc: transcode failed from cs '" << + // Try transcoding. If it fails, use original text. + int ecnt; + if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { + LOGDEB("textHtmlToDoc: transcode failed from cs '" << charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << "]"); - transcoded = m_html; - // We don't know the charset, at all - p.reset_charsets(); - charset.clear(); - } else { - if (ecnt) { - if (pass == 0) { - LOGDEB("textHtmlToDoc: init transcode had " << ecnt << + transcoded = m_html; + // We don't know the charset, at all + p.reset_charsets(); + charset.clear(); + } else { + if (ecnt) { + if (pass == 0) { + LOGDEB("textHtmlToDoc: init transcode had " << ecnt << " errors for ["<<(fn.empty()?"unknown":fn)<< "]\n"); - } else { - LOGERR("textHtmlToDoc: final transcode had " << ecnt << + } else { + LOGERR("textHtmlToDoc: final transcode had " << ecnt << " errors for ["<< (fn.empty()?"unknown":fn)<< "]\n"); - } - } - // charset has the putative source charset, transcoded is now - // in utf-8 - p.set_charsets(charset, "utf-8"); - } + } + } + // charset has the putative source charset, transcoded is now + // in utf-8 + p.set_charsets(charset, "utf-8"); + } - try { - p.parse_html(transcoded); - // No exception: ok? But throw true to use the same - // code path as if an exception had been thrown by parse_html - throw true; - break; - } catch (bool diag) { - result = p; - if (diag == true) { - // Parser throws true at end of text. ok + try { + p.parse_html(transcoded); + // No exception: ok? But throw true to use the same + // code path as if an exception had been thrown by parse_html + throw true; + break; + } catch (bool diag) { + result = p; + if (diag == true) { + // Parser throws true at end of text. ok - if (m_forPreview) { - // Save the html text - m_html = transcoded; - // In many cases, we need to change the charset decl, - // because the file was transcoded. It seems that just - // inserting one is enough (only the 1st one seems to - // be used by browsers/qtextedit). + if (m_forPreview) { + // Save the html text + m_html = transcoded; + // In many cases, we need to change the charset decl, + // because the file was transcoded. It seems that just + // inserting one is enough (only the 1st one seems to + // be used by browsers/qtextedit). string::size_type idx = m_html.find(""); - if (idx == string::npos) - idx = m_html.find(""); - if (idx != string::npos) - m_html.replace(idx+6, 0, - ""); - } + if (idx == string::npos) + idx = m_html.find(""); + if (idx != string::npos) + m_html.replace(idx+6, 0, + ""); + } - break; - } + break; + } - LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<< + LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<< result.get_charset() << "]\n"); - if (!result.get_charset().empty() && - !samecharset(result.get_charset(), result.fromcharset)) { - LOGDEB("textHtmlToDoc: reparse for charsets\n"); - // Set the origin charset as specified in document before - // transcoding again - charset = result.get_charset(); - } else { - LOGERR("textHtmlToDoc:: error: non charset exception\n"); - return false; - } - } + if (!result.get_charset().empty() && + !samecharset(result.get_charset(), result.fromcharset)) { + LOGDEB("textHtmlToDoc: reparse for charsets\n"); + // Set the origin charset as specified in document before + // transcoding again + charset = result.get_charset(); + } else { + LOGERR("textHtmlToDoc:: error: non charset exception\n"); + return false; + } + } } m_metaData[cstr_dj_keyorigcharset] = result.get_charset(); @@ -168,13 +168,13 @@ bool MimeHandlerHtml::next_document() // Avoid setting empty values which would crush ones possibly inherited // from parent (if we're an attachment) if (!result.dmtime.empty()) - m_metaData[cstr_dj_keymd] = result.dmtime; + m_metaData[cstr_dj_keymd] = result.dmtime; m_metaData[cstr_dj_keymt] = cstr_textplain; - for (map::const_iterator it = result.meta.begin(); - it != result.meta.end(); it++) { - if (!it->second.empty()) - m_metaData[it->first] = it->second; + for (const auto& entry : result.meta) { + if (!entry.second.empty()) { + m_metaData[entry.first] = entry.second; + } } return true; } diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index d9fc0c74..10f2ab13 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -161,19 +161,19 @@ map my_named_ents; class NamedEntsInitializer { public: NamedEntsInitializer() - { - for (int i = 0;;) { - const char *ent; - const char *val; - ent = epairs[i++]; - if (ent == 0) - break; - val = epairs[i++]; - if (val == 0) - break; - my_named_ents[string(ent)] = val; - } - } + { + for (int i = 0;;) { + const char *ent; + const char *val; + ent = epairs[i++]; + if (ent == 0) + break; + val = epairs[i++]; + if (val == 0) + break; + my_named_ents[string(ent)] = val; + } + } }; static NamedEntsInitializer namedEntsInitializerInstance; @@ -198,58 +198,58 @@ void MyHtmlParser::decode_entities(string &s) // so don't do it. If charset known, caller has converted text to utf-8, // and this is also how we translate entities // if (tocharset != "utf-8") - // return; + // return; // We need a const_iterator version of s.end() - otherwise the // find() and find_if() templates don't work... string::const_iterator amp = s.begin(), s_end = s.end(); while ((amp = find(amp, s_end, '&')) != s_end) { - unsigned int val = 0; - string::const_iterator end, p = amp + 1; - string subs; - if (p != s_end && *p == '#') { - p++; - if (p != s_end && (*p == 'x' || *p == 'X')) { - // hex - p++; - end = find_if(p, s_end, p_notxdigit); - sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); - } else { - // number - end = find_if(p, s_end, p_notdigit); - val = atoi(s.substr(p - s.begin(), end - p).c_str()); - } - } else { - end = find_if(p, s_end, p_notalnum); - string code = s.substr(p - s.begin(), end - p); - map::const_iterator i; - i = my_named_ents.find(code); - if (i != my_named_ents.end()) - subs = i->second; - } + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + string subs; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && (*p == 'x' || *p == 'X')) { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } + } else { + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map::const_iterator i; + i = my_named_ents.find(code); + if (i != my_named_ents.end()) + subs = i->second; + } - if (end < s_end && *end == ';') - end++; - - if (val) { - // The code is the code position for a unicode char. We need - // to translate it to an utf-8 string. - string utf16be; - utf16be += char(val / 256); - utf16be += char(val % 256); - transcode(utf16be, subs, "UTF-16BE", "UTF-8"); - } + if (end < s_end && *end == ';') + end++; + + if (val) { + // The code is the code position for a unicode char. We need + // to translate it to an utf-8 string. + string utf16be; + utf16be += char(val / 256); + utf16be += char(val % 256); + transcode(utf16be, subs, "UTF-16BE", "UTF-8"); + } - if (subs.length() > 0) { - string::size_type amp_pos = amp - s.begin(); - s.replace(amp_pos, end - amp, subs); - s_end = s.end(); - // We've modified the string, so the iterators are no longer - // valid... - amp = s.begin() + amp_pos + subs.length(); - } else { - amp = end; - } + if (subs.length() > 0) { + string::size_type amp_pos = amp - s.begin(); + s.replace(amp_pos, end - amp, subs); + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + subs.length(); + } else { + amp = end; + } } } @@ -265,35 +265,35 @@ MyHtmlParser::process_text(const string &text) CancelCheck::instance().checkCancel(); if (!in_script_tag && !in_style_tag) { - if (in_title_tag) { - titledump += text; - } else if (!in_pre_tag) { - string::size_type b = 0; - bool only_space = true; - while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) { - only_space = false; - // If space specifically needed or chunk begins with - // whitespace, add exactly one space - if (pending_space || b != 0) { - dump += ' '; - } - pending_space = true; - string::size_type e = text.find_first_of(WHITESPACE, b); - if (e == string::npos) { - dump += text.substr(b); - pending_space = false; - break; - } - dump += text.substr(b, e - b); - b = e + 1; - } - if (only_space) - pending_space = true; - } else { - if (pending_space) - dump += ' '; - dump += text; - } + if (in_title_tag) { + titledump += text; + } else if (!in_pre_tag) { + string::size_type b = 0; + bool only_space = true; + while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) { + only_space = false; + // If space specifically needed or chunk begins with + // whitespace, add exactly one space + if (pending_space || b != 0) { + dump += ' '; + } + pending_space = true; + string::size_type e = text.find_first_of(WHITESPACE, b); + if (e == string::npos) { + dump += text.substr(b); + pending_space = false; + break; + } + dump += text.substr(b, e - b); + b = e + 1; + } + if (only_space) + pending_space = true; + } else { + if (pending_space) + dump += ' '; + dump += text; + } } } @@ -305,175 +305,186 @@ MyHtmlParser::opening_tag(const string &tag) cout << "TAG: " << tag << ": " << endl; map::const_iterator x; for (x = p.begin(); x != p.end(); x++) { - cout << " " << x->first << " -> '" << x->second << "'" << endl; + cout << " " << x->first << " -> '" << x->second << "'" << endl; } #endif if (tag.empty()) return true; switch (tag[0]) { - case 'a': - if (tag == "address") pending_space = true; - break; - case 'b': - // body: some bad docs have several opening body tags and - // even text before the body is displayed by Opera and - // Firefox. We used to reset the dump each time we saw a - // body tag, but I can't see any reason to do so. + case 'a': + if (tag == "address") pending_space = true; + break; + case 'b': + // body: some bad docs have several opening body tags and + // even text before the body is displayed by Opera and + // Firefox. We used to reset the dump each time we saw a + // body tag, but I can't see any reason to do so. - if (tag == "blockquote" || tag == "br") { - dump += '\n'; - pending_space = true; - } - break; - case 'c': - if (tag == "center") pending_space = true; - break; - case 'd': - if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || - tag == "dt") pending_space = true; - if (tag == "dt") - dump += '\n'; - break; - case 'e': - if (tag == "embed") pending_space = true; - break; - case 'f': - if (tag == "fieldset" || tag == "form") pending_space = true; - break; - case 'h': - // hr, and h1, ..., h6 - if (tag.length() == 2 && strchr("r123456", tag[1])) { - dump += '\n'; - pending_space = true; - } - break; - case 'i': - if (tag == "iframe" || tag == "img" || tag == "isindex" || - tag == "input") pending_space = true; - break; - case 'k': - if (tag == "keygen") pending_space = true; - break; - case 'l': - if (tag == "legend" || tag == "li" || tag == "listing") { - dump += '\n'; - pending_space = true; - } - break; - case 'm': - if (tag == "meta") { - string content; - if (get_parameter(cstr_html_content, content)) { - string name; - if (get_parameter("name", name)) { - lowercase_term(name); - if (name == "date") { - // Specific to Recoll filters. - decode_entities(content); - struct tm tm; - memset(&tm, 0, sizeof(tm)); - if (strptime(content.c_str(), - " %Y-%m-%d %H:%M:%S ", &tm) || - strptime(content.c_str(), - "%Y-%m-%dT%H:%M:%S", &tm) - ) { - char ascuxtime[100]; - sprintf(ascuxtime, "%ld", (long)mktime(&tm)); - dmtime = ascuxtime; - } - } else if (name == "robots") { - } else { - string markup; - bool ishtml = false; - if (get_parameter("markup", markup)) { - if (!stringlowercmp("html", markup)) { - ishtml = true; - } - } - if (!meta[name].empty()) - meta[name] += ' '; - decode_entities(content); - meta[name] += content; - if (ishtml && - meta[name].compare(0, cstr_fldhtm.size(), - cstr_fldhtm)) { - meta[name].insert(0, cstr_fldhtm); - } - } - } - string hdr; - if (get_parameter("http-equiv", hdr)) { - lowercase_term(hdr); - if (hdr == "content-type") { - MimeHeaderValue p; - parseMimeHeaderValue(content, p); - map::const_iterator k; - if ((k = p.params.find(cstr_html_charset)) != - p.params.end()) { - charset = k->second; - if (!charset.empty() && - !samecharset(charset, fromcharset)) { - LOGDEB1("Doc http-equiv charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" ); - throw false; - } - } - } - } - } - string newcharset; - if (get_parameter(cstr_html_charset, newcharset)) { - // HTML5 added: - lowercase_term(newcharset); - charset = newcharset; - if (!charset.empty() && - !samecharset(charset, fromcharset)) { - LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" ); - throw false; - } - } - break; - } else if (tag == "marquee" || tag == "menu" || tag == "multicol") - pending_space = true; - break; - case 'o': - if (tag == "ol" || tag == "option") pending_space = true; - break; - case 'p': - if (tag == "p" || tag == "plaintext") { - dump += '\n'; - pending_space = true; - } else if (tag == "pre") { - in_pre_tag = true; - dump += '\n'; - pending_space = true; - } - break; - case 'q': - if (tag == "q") pending_space = true; - break; - case 's': - if (tag == "style") { - in_style_tag = true; - break; - } else if (tag == "script") { - in_script_tag = true; - break; - } else if (tag == "select") - pending_space = true; - break; - case 't': - if (tag == "table" || tag == "td" || tag == "textarea" || - tag == "th") { - pending_space = true; - } else if (tag == "title") { - in_title_tag = true; - } - break; - case 'u': - if (tag == "ul") pending_space = true; - break; - case 'x': - if (tag == "xmp") pending_space = true; - break; + if (tag == "blockquote" || tag == "br") { + dump += '\n'; + pending_space = true; + } + break; + case 'c': + if (tag == "center") pending_space = true; + break; + case 'd': + if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || + tag == "dt") pending_space = true; + if (tag == "dt") + dump += '\n'; + break; + case 'e': + if (tag == "embed") pending_space = true; + break; + case 'f': + if (tag == "fieldset" || tag == "form") pending_space = true; + break; + case 'h': + // hr, and h1, ..., h6 + if (tag.length() == 2 && strchr("r123456", tag[1])) { + dump += '\n'; + pending_space = true; + } + break; + case 'i': + if (tag == "iframe" || tag == "img" || tag == "isindex" || + tag == "input") pending_space = true; + break; + case 'k': + if (tag == "keygen") pending_space = true; + break; + case 'l': + if (tag == "legend" || tag == "li" || tag == "listing") { + dump += '\n'; + pending_space = true; + } + break; + case 'm': + if (tag == "meta") { + string content; + if (get_parameter(cstr_html_content, content)) { + string name; + if (get_parameter("name", name)) { + lowercase_term(name); + if (name == "date") { + // Specific to Recoll filters. + decode_entities(content); + struct tm tm; + memset(&tm, 0, sizeof(tm)); + if (strptime(content.c_str(), + " %Y-%m-%d %H:%M:%S ", &tm) || + strptime(content.c_str(), + "%Y-%m-%dT%H:%M:%S", &tm) + ) { + char ascuxtime[100]; + sprintf(ascuxtime, "%ld", (long)mktime(&tm)); + dmtime = ascuxtime; + } + } else if (name == "robots") { + } else { + string markup; + bool ishtml = false; + if (get_parameter("markup", markup)) { + if (!stringlowercmp("html", markup)) { + ishtml = true; + } + } + decode_entities(content); + // Set metadata field, avoid appending + // multiple identical instances. + auto it = meta.find(name); + if (it == meta.end() || it->second.find(content) == + string::npos) { + if (it != meta.end()) { + it->second += ' '; + it->second += content; + } else { + meta[name] = content; + } + } + if (ishtml && + meta[name].compare(0, cstr_fldhtm.size(), + cstr_fldhtm)) { + meta[name].insert(0, cstr_fldhtm); + } + } + } + string hdr; + if (get_parameter("http-equiv", hdr)) { + lowercase_term(hdr); + if (hdr == "content-type") { + MimeHeaderValue p; + parseMimeHeaderValue(content, p); + map::const_iterator k; + if ((k = p.params.find(cstr_html_charset)) != + p.params.end()) { + charset = k->second; + if (!charset.empty() && + !samecharset(charset, fromcharset)) { + LOGDEB1("Doc http-equiv charset '" << charset << + "' differs from dir deflt '" << + fromcharset << "'\n"); + throw false; + } + } + } + } + } + string newcharset; + if (get_parameter(cstr_html_charset, newcharset)) { + // HTML5 added: + lowercase_term(newcharset); + charset = newcharset; + if (!charset.empty() && + !samecharset(charset, fromcharset)) { + LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" ); + throw false; + } + } + break; + } else if (tag == "marquee" || tag == "menu" || tag == "multicol") + pending_space = true; + break; + case 'o': + if (tag == "ol" || tag == "option") pending_space = true; + break; + case 'p': + if (tag == "p" || tag == "plaintext") { + dump += '\n'; + pending_space = true; + } else if (tag == "pre") { + in_pre_tag = true; + dump += '\n'; + pending_space = true; + } + break; + case 'q': + if (tag == "q") pending_space = true; + break; + case 's': + if (tag == "style") { + in_style_tag = true; + break; + } else if (tag == "script") { + in_script_tag = true; + break; + } else if (tag == "select") + pending_space = true; + break; + case 't': + if (tag == "table" || tag == "td" || tag == "textarea" || + tag == "th") { + pending_space = true; + } else if (tag == "title") { + in_title_tag = true; + } + break; + case 'u': + if (tag == "ul") pending_space = true; + break; + case 'x': + if (tag == "xmp") pending_space = true; + break; } return true; } @@ -484,85 +495,85 @@ MyHtmlParser::closing_tag(const string &tag) LOGDEB2("closing_tag: [" << (tag) << "]\n" ); if (tag.empty()) return true; switch (tag[0]) { - case 'a': - if (tag == "address") pending_space = true; - break; - case 'b': - // body: We used to signal and end of doc here by returning - // false but the browsers just ignore body and html - // closing tags if there is further text, so it seems right - // to do the same + case 'a': + if (tag == "address") pending_space = true; + break; + case 'b': + // body: We used to signal and end of doc here by returning + // false but the browsers just ignore body and html + // closing tags if there is further text, so it seems right + // to do the same - if (tag == "blockquote" || tag == "br") pending_space = true; - break; - case 'c': - if (tag == "center") pending_space = true; - break; - case 'd': - if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || - tag == "dt") pending_space = true; - break; - case 'f': - if (tag == "fieldset" || tag == "form") pending_space = true; - break; - case 'h': - // hr, and h1, ..., h6 - if (tag.length() == 2 && strchr("r123456", tag[1])) - pending_space = true; - break; - case 'i': - if (tag == "iframe") pending_space = true; - break; - case 'l': - if (tag == "legend" || tag == "li" || tag == "listing") - pending_space = true; - break; - case 'm': - if (tag == "marquee" || tag == "menu") pending_space = true; - break; - case 'o': - if (tag == "ol" || tag == "option") pending_space = true; - break; - case 'p': - if (tag == "p") { - pending_space = true; - } else if (tag == "pre") { - pending_space = true; - in_pre_tag = false; - } - break; - case 'q': - if (tag == "q") pending_space = true; - break; - case 's': - if (tag == "style") { - in_style_tag = false; - break; - } - if (tag == "script") { - in_script_tag = false; - break; - } - if (tag == "select") pending_space = true; - break; - case 't': - if (tag == "title") { - in_title_tag = false; - if (meta.find("title") == meta.end()|| meta["title"].empty()) { - meta["title"] = titledump; - titledump.clear(); - } - break; - } - if (tag == "table" || tag == "td" || tag == "textarea" || - tag == "th") pending_space = true; - break; - case 'u': - if (tag == "ul") pending_space = true; - break; - case 'x': - if (tag == "xmp") pending_space = true; - break; + if (tag == "blockquote" || tag == "br") pending_space = true; + break; + case 'c': + if (tag == "center") pending_space = true; + break; + case 'd': + if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || + tag == "dt") pending_space = true; + break; + case 'f': + if (tag == "fieldset" || tag == "form") pending_space = true; + break; + case 'h': + // hr, and h1, ..., h6 + if (tag.length() == 2 && strchr("r123456", tag[1])) + pending_space = true; + break; + case 'i': + if (tag == "iframe") pending_space = true; + break; + case 'l': + if (tag == "legend" || tag == "li" || tag == "listing") + pending_space = true; + break; + case 'm': + if (tag == "marquee" || tag == "menu") pending_space = true; + break; + case 'o': + if (tag == "ol" || tag == "option") pending_space = true; + break; + case 'p': + if (tag == "p") { + pending_space = true; + } else if (tag == "pre") { + pending_space = true; + in_pre_tag = false; + } + break; + case 'q': + if (tag == "q") pending_space = true; + break; + case 's': + if (tag == "style") { + in_style_tag = false; + break; + } + if (tag == "script") { + in_script_tag = false; + break; + } + if (tag == "select") pending_space = true; + break; + case 't': + if (tag == "title") { + in_title_tag = false; + if (meta.find("title") == meta.end()|| meta["title"].empty()) { + meta["title"] = titledump; + titledump.clear(); + } + break; + } + if (tag == "table" || tag == "td" || tag == "textarea" || + tag == "th") pending_space = true; + break; + case 'u': + if (tag == "ul") pending_space = true; + break; + case 'x': + if (tag == "xmp") pending_space = true; + break; } return true; }