html/xml meta: avoid appending a value that is already present in the string

This commit is contained in:
Jean-Francois Dockes 2020-01-30 08:37:46 +01:00
parent 552510db06
commit e5af1651fa
3 changed files with 454 additions and 440 deletions

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2005 J.F.Dockes /* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
@ -38,10 +38,10 @@ bool MimeHandlerExecMultiple::startCmd()
{ {
LOGDEB("MimeHandlerExecMultiple::startCmd\n"); LOGDEB("MimeHandlerExecMultiple::startCmd\n");
if (params.empty()) { if (params.empty()) {
// Hu ho // Hu ho
LOGERR("MHExecMultiple::startCmd: empty params\n"); LOGERR("MHExecMultiple::startCmd: empty params\n");
m_reason = "RECFILTERROR BADCONFIG"; m_reason = "RECFILTERROR BADCONFIG";
return false; return false;
} }
// Command name // Command name
@ -55,7 +55,7 @@ bool MimeHandlerExecMultiple::startCmd()
m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir()); m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" : m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
"RECOLL_FILTER_FORPREVIEW=no"); "RECOLL_FILTER_FORPREVIEW=no");
m_cmd.setrlimit_as(m_filtermaxmbytes); m_cmd.setrlimit_as(m_filtermaxmbytes);
m_adv.setmaxsecs(m_filtermaxseconds); m_adv.setmaxsecs(m_filtermaxseconds);
@ -156,11 +156,11 @@ bool MimeHandlerExecMultiple::next_document()
{ {
LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n"); LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
if (m_havedoc == false) if (m_havedoc == false)
return false; return false;
if (missingHelper) { if (missingHelper) {
LOGDEB("MHExecMultiple::next_document(): helper known missing\n"); LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
return false; return false;
} }
if (m_cmd.getChildPid() <= 0 && !startCmd()) { if (m_cmd.getChildPid() <= 0 && !startCmd()) {
@ -178,15 +178,15 @@ bool MimeHandlerExecMultiple::next_document()
ostringstream obuf; ostringstream obuf;
string file_md5; string file_md5;
if (m_filefirst) { if (m_filefirst) {
if (!m_forPreview && !m_nomd5) { if (!m_forPreview && !m_nomd5) {
string md5, xmd5, reason; string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) { if (MD5File(m_fn, md5, &reason)) {
file_md5 = MD5HexPrint(md5, xmd5); file_md5 = MD5HexPrint(md5, xmd5);
} else { } else {
LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn << LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
"]: " << reason << "\n"); "]: " << reason << "\n");
} }
} }
obuf << "FileName: " << m_fn.length() << "\n" << m_fn; obuf << "FileName: " << m_fn.length() << "\n" << m_fn;
// m_filefirst is set to true by set_document_file() // m_filefirst is set to true by set_document_file()
m_filefirst = false; m_filefirst = false;
@ -194,13 +194,13 @@ bool MimeHandlerExecMultiple::next_document()
obuf << "Filename: " << 0 << "\n"; obuf << "Filename: " << 0 << "\n";
} }
if (!m_ipath.empty()) { if (!m_ipath.empty()) {
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" << LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
m_ipath << "]\n"); m_ipath << "]\n");
obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath; obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath;
} }
if (!m_dfltInputCharset.empty()) { if (!m_dfltInputCharset.empty()) {
obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n" obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n"
<< m_dfltInputCharset; << m_dfltInputCharset;
} }
obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType; obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
obuf << "\n"; obuf << "\n";
@ -247,10 +247,10 @@ bool MimeHandlerExecMultiple::next_document()
eofnow_received = true; eofnow_received = true;
} else if (!stringlowercmp("fileerror:", name)) { } else if (!stringlowercmp("fileerror:", name)) {
LOGDEB("MHExecMultiple: got FILEERROR\n"); LOGDEB("MHExecMultiple: got FILEERROR\n");
fileerror_received = true; fileerror_received = true;
} else if (!stringlowercmp("subdocerror:", name)) { } else if (!stringlowercmp("subdocerror:", name)) {
LOGDEB("MHExecMultiple: got SUBDOCERROR\n"); LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
subdocerror_received = true; subdocerror_received = true;
} else if (!stringlowercmp("ipath:", name)) { } else if (!stringlowercmp("ipath:", name)) {
ipath = data; ipath = data;
LOGDEB("MHExecMultiple: got ipath [" << data << "]\n"); LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
@ -264,7 +264,11 @@ bool MimeHandlerExecMultiple::next_document()
string nm = stringtolower((const string&)name); string nm = stringtolower((const string&)name);
trimstring(nm, ":"); trimstring(nm, ":");
LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n"); LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
m_metaData[nm] += data; auto it = m_metaData.find(nm);
if (it == m_metaData.end() ||
it->second.find(data) == std::string::npos) {
m_metaData[nm] += data;
}
} }
if (loop == 200) { if (loop == 200) {
// ?? // ??
@ -279,7 +283,7 @@ bool MimeHandlerExecMultiple::next_document()
return false; return false;
} }
if (subdocerror_received) { if (subdocerror_received) {
return false; return false;
} }
// It used to be that eof could be signalled just by an empty document, but // It used to be that eof could be signalled just by an empty document, but
@ -291,13 +295,13 @@ bool MimeHandlerExecMultiple::next_document()
} }
if (!ipath.empty()) { if (!ipath.empty()) {
// If this has an ipath, it is an internal doc from a // If this has an ipath, it is an internal doc from a
// multi-document file. In this case, either the filter // multi-document file. In this case, either the filter
// supplies the mimetype, or the ipath MUST be a filename-like // supplies the mimetype, or the ipath MUST be a filename-like
// string which we can use to compute a mime type // string which we can use to compute a mime type
m_metaData[cstr_dj_keyipath] = ipath; m_metaData[cstr_dj_keyipath] = ipath;
if (mtype.empty()) { if (mtype.empty()) {
LOGDEB0("MHExecMultiple: no mime type from filter, using ipath " LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
"for a guess\n"); "for a guess\n");
mtype = mimetype(ipath, 0, m_config, false); mtype = mimetype(ipath, 0, m_config, false);
if (mtype.empty()) { if (mtype.empty()) {
@ -313,16 +317,16 @@ bool MimeHandlerExecMultiple::next_document()
} }
} }
m_metaData[cstr_dj_keymt] = mtype; m_metaData[cstr_dj_keymt] = mtype;
if (!m_forPreview) { if (!m_forPreview) {
string md5, xmd5; string md5, xmd5;
MD5String(m_metaData[cstr_dj_keycontent], md5); MD5String(m_metaData[cstr_dj_keycontent], md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
} }
} else { } else {
// "Self" document. // "Self" document.
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype; m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
m_metaData.erase(cstr_dj_keyipath); m_metaData.erase(cstr_dj_keyipath);
if (!m_forPreview) { if (!m_forPreview) {
m_metaData[cstr_dj_keymd5] = file_md5; m_metaData[cstr_dj_keymd5] = file_md5;
} }
} }
@ -339,4 +343,3 @@ bool MimeHandlerExecMultiple::next_document()
LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString()); LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
return true; return true;
} }

View File

@ -38,7 +38,7 @@ bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn)
string reason; string reason;
if (!file_to_string(fn, otext, &reason)) { if (!file_to_string(fn, otext, &reason)) {
LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n"); LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n");
return false; return false;
} }
m_filename = fn; m_filename = fn;
return set_document_string(mt, otext); return set_document_string(mt, otext);
@ -51,10 +51,10 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
m_havedoc = true; m_havedoc = true;
if (!m_forPreview) { if (!m_forPreview) {
// We want to compute the md5 now because we may modify m_html later // We want to compute the md5 now because we may modify m_html later
string md5, xmd5; string md5, xmd5;
MD5String(htext, md5); MD5String(htext, md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
} }
return true; return true;
} }
@ -62,7 +62,7 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
bool MimeHandlerHtml::next_document() bool MimeHandlerHtml::next_document()
{ {
if (m_havedoc == false) if (m_havedoc == false)
return false; return false;
m_havedoc = false; m_havedoc = false;
// If set_doc(fn), take note of file name. // If set_doc(fn), take note of file name.
string fn = m_filename; string fn = m_filename;
@ -70,12 +70,12 @@ bool MimeHandlerHtml::next_document()
string charset = m_dfltInputCharset; string charset = m_dfltInputCharset;
LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset
<< "]\n"); << "]\n");
// Override default input charset if someone took care to set one: // Override default input charset if someone took care to set one:
map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset); map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset);
if (it != m_metaData.end() && !it->second.empty()) { if (it != m_metaData.end() && !it->second.empty()) {
charset = it->second; charset = it->second;
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
charset << "]\n"); charset << "]\n");
} }
@ -88,78 +88,78 @@ bool MimeHandlerHtml::next_document()
MyHtmlParser result; MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) { for (int pass = 0; pass < 2; pass++) {
string transcoded; string transcoded;
LOGDEB("Html::mkDoc: pass " << pass << "\n"); LOGDEB("Html::mkDoc: pass " << pass << "\n");
MyHtmlParser p; MyHtmlParser p;
// Try transcoding. If it fails, use original text. // Try transcoding. If it fails, use original text.
int ecnt; int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
LOGDEB("textHtmlToDoc: transcode failed from cs '" << LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) <<
"]"); "]");
transcoded = m_html; transcoded = m_html;
// We don't know the charset, at all // We don't know the charset, at all
p.reset_charsets(); p.reset_charsets();
charset.clear(); charset.clear();
} else { } else {
if (ecnt) { if (ecnt) {
if (pass == 0) { if (pass == 0) {
LOGDEB("textHtmlToDoc: init transcode had " << ecnt << LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
" errors for ["<<(fn.empty()?"unknown":fn)<< "]\n"); " errors for ["<<(fn.empty()?"unknown":fn)<< "]\n");
} else { } else {
LOGERR("textHtmlToDoc: final transcode had " << ecnt << LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
" errors for ["<< (fn.empty()?"unknown":fn)<< "]\n"); " errors for ["<< (fn.empty()?"unknown":fn)<< "]\n");
} }
} }
// charset has the putative source charset, transcoded is now // charset has the putative source charset, transcoded is now
// in utf-8 // in utf-8
p.set_charsets(charset, "utf-8"); p.set_charsets(charset, "utf-8");
} }
try { try {
p.parse_html(transcoded); p.parse_html(transcoded);
// No exception: ok? But throw true to use the same // No exception: ok? But throw true to use the same
// code path as if an exception had been thrown by parse_html // code path as if an exception had been thrown by parse_html
throw true; throw true;
break; break;
} catch (bool diag) { } catch (bool diag) {
result = p; result = p;
if (diag == true) { if (diag == true) {
// Parser throws true at end of text. ok // Parser throws true at end of text. ok
if (m_forPreview) { if (m_forPreview) {
// Save the html text // Save the html text
m_html = transcoded; m_html = transcoded;
// In many cases, we need to change the charset decl, // In many cases, we need to change the charset decl,
// because the file was transcoded. It seems that just // because the file was transcoded. It seems that just
// inserting one is enough (only the 1st one seems to // inserting one is enough (only the 1st one seems to
// be used by browsers/qtextedit). // be used by browsers/qtextedit).
string::size_type idx = m_html.find("<head>"); string::size_type idx = m_html.find("<head>");
if (idx == string::npos) if (idx == string::npos)
idx = m_html.find("<HEAD>"); idx = m_html.find("<HEAD>");
if (idx != string::npos) if (idx != string::npos)
m_html.replace(idx+6, 0, m_html.replace(idx+6, 0,
"<meta http-equiv=\"content-type\" " "<meta http-equiv=\"content-type\" "
"content=\"text/html; charset=utf-8\">"); "content=\"text/html; charset=utf-8\">");
} }
break; break;
} }
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<< LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
result.get_charset() << "]\n"); result.get_charset() << "]\n");
if (!result.get_charset().empty() && if (!result.get_charset().empty() &&
!samecharset(result.get_charset(), result.fromcharset)) { !samecharset(result.get_charset(), result.fromcharset)) {
LOGDEB("textHtmlToDoc: reparse for charsets\n"); LOGDEB("textHtmlToDoc: reparse for charsets\n");
// Set the origin charset as specified in document before // Set the origin charset as specified in document before
// transcoding again // transcoding again
charset = result.get_charset(); charset = result.get_charset();
} else { } else {
LOGERR("textHtmlToDoc:: error: non charset exception\n"); LOGERR("textHtmlToDoc:: error: non charset exception\n");
return false; return false;
} }
} }
} }
m_metaData[cstr_dj_keyorigcharset] = result.get_charset(); m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
@ -168,13 +168,13 @@ bool MimeHandlerHtml::next_document()
// Avoid setting empty values which would crush ones possibly inherited // Avoid setting empty values which would crush ones possibly inherited
// from parent (if we're an attachment) // from parent (if we're an attachment)
if (!result.dmtime.empty()) if (!result.dmtime.empty())
m_metaData[cstr_dj_keymd] = result.dmtime; m_metaData[cstr_dj_keymd] = result.dmtime;
m_metaData[cstr_dj_keymt] = cstr_textplain; m_metaData[cstr_dj_keymt] = cstr_textplain;
for (map<string,string>::const_iterator it = result.meta.begin(); for (const auto& entry : result.meta) {
it != result.meta.end(); it++) { if (!entry.second.empty()) {
if (!it->second.empty()) m_metaData[entry.first] = entry.second;
m_metaData[it->first] = it->second; }
} }
return true; return true;
} }

View File

@ -161,19 +161,19 @@ map<string, string> my_named_ents;
class NamedEntsInitializer { class NamedEntsInitializer {
public: public:
NamedEntsInitializer() NamedEntsInitializer()
{ {
for (int i = 0;;) { for (int i = 0;;) {
const char *ent; const char *ent;
const char *val; const char *val;
ent = epairs[i++]; ent = epairs[i++];
if (ent == 0) if (ent == 0)
break; break;
val = epairs[i++]; val = epairs[i++];
if (val == 0) if (val == 0)
break; break;
my_named_ents[string(ent)] = val; my_named_ents[string(ent)] = val;
} }
} }
}; };
static NamedEntsInitializer namedEntsInitializerInstance; static NamedEntsInitializer namedEntsInitializerInstance;
@ -198,58 +198,58 @@ void MyHtmlParser::decode_entities(string &s)
// so don't do it. If charset known, caller has converted text to utf-8, // so don't do it. If charset known, caller has converted text to utf-8,
// and this is also how we translate entities // and this is also how we translate entities
// if (tocharset != "utf-8") // if (tocharset != "utf-8")
// return; // return;
// We need a const_iterator version of s.end() - otherwise the // We need a const_iterator version of s.end() - otherwise the
// find() and find_if() templates don't work... // find() and find_if() templates don't work...
string::const_iterator amp = s.begin(), s_end = s.end(); string::const_iterator amp = s.begin(), s_end = s.end();
while ((amp = find(amp, s_end, '&')) != s_end) { while ((amp = find(amp, s_end, '&')) != s_end) {
unsigned int val = 0; unsigned int val = 0;
string::const_iterator end, p = amp + 1; string::const_iterator end, p = amp + 1;
string subs; string subs;
if (p != s_end && *p == '#') { if (p != s_end && *p == '#') {
p++; p++;
if (p != s_end && (*p == 'x' || *p == 'X')) { if (p != s_end && (*p == 'x' || *p == 'X')) {
// hex // hex
p++; p++;
end = find_if(p, s_end, p_notxdigit); end = find_if(p, s_end, p_notxdigit);
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
} else { } else {
// number // number
end = find_if(p, s_end, p_notdigit); end = find_if(p, s_end, p_notdigit);
val = atoi(s.substr(p - s.begin(), end - p).c_str()); val = atoi(s.substr(p - s.begin(), end - p).c_str());
} }
} else { } else {
end = find_if(p, s_end, p_notalnum); end = find_if(p, s_end, p_notalnum);
string code = s.substr(p - s.begin(), end - p); string code = s.substr(p - s.begin(), end - p);
map<string, string>::const_iterator i; map<string, string>::const_iterator i;
i = my_named_ents.find(code); i = my_named_ents.find(code);
if (i != my_named_ents.end()) if (i != my_named_ents.end())
subs = i->second; subs = i->second;
} }
if (end < s_end && *end == ';') if (end < s_end && *end == ';')
end++; end++;
if (val) { if (val) {
// The code is the code position for a unicode char. We need // The code is the code position for a unicode char. We need
// to translate it to an utf-8 string. // to translate it to an utf-8 string.
string utf16be; string utf16be;
utf16be += char(val / 256); utf16be += char(val / 256);
utf16be += char(val % 256); utf16be += char(val % 256);
transcode(utf16be, subs, "UTF-16BE", "UTF-8"); transcode(utf16be, subs, "UTF-16BE", "UTF-8");
} }
if (subs.length() > 0) { if (subs.length() > 0) {
string::size_type amp_pos = amp - s.begin(); string::size_type amp_pos = amp - s.begin();
s.replace(amp_pos, end - amp, subs); s.replace(amp_pos, end - amp, subs);
s_end = s.end(); s_end = s.end();
// We've modified the string, so the iterators are no longer // We've modified the string, so the iterators are no longer
// valid... // valid...
amp = s.begin() + amp_pos + subs.length(); amp = s.begin() + amp_pos + subs.length();
} else { } else {
amp = end; amp = end;
} }
} }
} }
@ -265,35 +265,35 @@ MyHtmlParser::process_text(const string &text)
CancelCheck::instance().checkCancel(); CancelCheck::instance().checkCancel();
if (!in_script_tag && !in_style_tag) { if (!in_script_tag && !in_style_tag) {
if (in_title_tag) { if (in_title_tag) {
titledump += text; titledump += text;
} else if (!in_pre_tag) { } else if (!in_pre_tag) {
string::size_type b = 0; string::size_type b = 0;
bool only_space = true; bool only_space = true;
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) { while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
only_space = false; only_space = false;
// If space specifically needed or chunk begins with // If space specifically needed or chunk begins with
// whitespace, add exactly one space // whitespace, add exactly one space
if (pending_space || b != 0) { if (pending_space || b != 0) {
dump += ' '; dump += ' ';
} }
pending_space = true; pending_space = true;
string::size_type e = text.find_first_of(WHITESPACE, b); string::size_type e = text.find_first_of(WHITESPACE, b);
if (e == string::npos) { if (e == string::npos) {
dump += text.substr(b); dump += text.substr(b);
pending_space = false; pending_space = false;
break; break;
} }
dump += text.substr(b, e - b); dump += text.substr(b, e - b);
b = e + 1; b = e + 1;
} }
if (only_space) if (only_space)
pending_space = true; pending_space = true;
} else { } else {
if (pending_space) if (pending_space)
dump += ' '; dump += ' ';
dump += text; dump += text;
} }
} }
} }
@ -305,175 +305,186 @@ MyHtmlParser::opening_tag(const string &tag)
cout << "TAG: " << tag << ": " << endl; cout << "TAG: " << tag << ": " << endl;
map<string, string>::const_iterator x; map<string, string>::const_iterator x;
for (x = p.begin(); x != p.end(); x++) { for (x = p.begin(); x != p.end(); x++) {
cout << " " << x->first << " -> '" << x->second << "'" << endl; cout << " " << x->first << " -> '" << x->second << "'" << endl;
} }
#endif #endif
if (tag.empty()) return true; if (tag.empty()) return true;
switch (tag[0]) { switch (tag[0]) {
case 'a': case 'a':
if (tag == "address") pending_space = true; if (tag == "address") pending_space = true;
break; break;
case 'b': case 'b':
// body: some bad docs have several opening body tags and // body: some bad docs have several opening body tags and
// even text before the body is displayed by Opera and // even text before the body is displayed by Opera and
// Firefox. We used to reset the dump each time we saw a // Firefox. We used to reset the dump each time we saw a
// body tag, but I can't see any reason to do so. // body tag, but I can't see any reason to do so.
if (tag == "blockquote" || tag == "br") { if (tag == "blockquote" || tag == "br") {
dump += '\n'; dump += '\n';
pending_space = true; pending_space = true;
} }
break; break;
case 'c': case 'c':
if (tag == "center") pending_space = true; if (tag == "center") pending_space = true;
break; break;
case 'd': case 'd':
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
tag == "dt") pending_space = true; tag == "dt") pending_space = true;
if (tag == "dt") if (tag == "dt")
dump += '\n'; dump += '\n';
break; break;
case 'e': case 'e':
if (tag == "embed") pending_space = true; if (tag == "embed") pending_space = true;
break; break;
case 'f': case 'f':
if (tag == "fieldset" || tag == "form") pending_space = true; if (tag == "fieldset" || tag == "form") pending_space = true;
break; break;
case 'h': case 'h':
// hr, and h1, ..., h6 // hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1])) { if (tag.length() == 2 && strchr("r123456", tag[1])) {
dump += '\n'; dump += '\n';
pending_space = true; pending_space = true;
} }
break; break;
case 'i': case 'i':
if (tag == "iframe" || tag == "img" || tag == "isindex" || if (tag == "iframe" || tag == "img" || tag == "isindex" ||
tag == "input") pending_space = true; tag == "input") pending_space = true;
break; break;
case 'k': case 'k':
if (tag == "keygen") pending_space = true; if (tag == "keygen") pending_space = true;
break; break;
case 'l': case 'l':
if (tag == "legend" || tag == "li" || tag == "listing") { if (tag == "legend" || tag == "li" || tag == "listing") {
dump += '\n'; dump += '\n';
pending_space = true; pending_space = true;
} }
break; break;
case 'm': case 'm':
if (tag == "meta") { if (tag == "meta") {
string content; string content;
if (get_parameter(cstr_html_content, content)) { if (get_parameter(cstr_html_content, content)) {
string name; string name;
if (get_parameter("name", name)) { if (get_parameter("name", name)) {
lowercase_term(name); lowercase_term(name);
if (name == "date") { if (name == "date") {
// Specific to Recoll filters. // Specific to Recoll filters.
decode_entities(content); decode_entities(content);
struct tm tm; struct tm tm;
memset(&tm, 0, sizeof(tm)); memset(&tm, 0, sizeof(tm));
if (strptime(content.c_str(), if (strptime(content.c_str(),
" %Y-%m-%d %H:%M:%S ", &tm) || " %Y-%m-%d %H:%M:%S ", &tm) ||
strptime(content.c_str(), strptime(content.c_str(),
"%Y-%m-%dT%H:%M:%S", &tm) "%Y-%m-%dT%H:%M:%S", &tm)
) { ) {
char ascuxtime[100]; char ascuxtime[100];
sprintf(ascuxtime, "%ld", (long)mktime(&tm)); sprintf(ascuxtime, "%ld", (long)mktime(&tm));
dmtime = ascuxtime; dmtime = ascuxtime;
} }
} else if (name == "robots") { } else if (name == "robots") {
} else { } else {
string markup; string markup;
bool ishtml = false; bool ishtml = false;
if (get_parameter("markup", markup)) { if (get_parameter("markup", markup)) {
if (!stringlowercmp("html", markup)) { if (!stringlowercmp("html", markup)) {
ishtml = true; ishtml = true;
} }
} }
if (!meta[name].empty()) decode_entities(content);
meta[name] += ' '; // Set metadata field, avoid appending
decode_entities(content); // multiple identical instances.
meta[name] += content; auto it = meta.find(name);
if (ishtml && if (it == meta.end() || it->second.find(content) ==
meta[name].compare(0, cstr_fldhtm.size(), string::npos) {
cstr_fldhtm)) { if (it != meta.end()) {
meta[name].insert(0, cstr_fldhtm); it->second += ' ';
} it->second += content;
} } else {
} meta[name] = content;
string hdr; }
if (get_parameter("http-equiv", hdr)) { }
lowercase_term(hdr); if (ishtml &&
if (hdr == "content-type") { meta[name].compare(0, cstr_fldhtm.size(),
MimeHeaderValue p; cstr_fldhtm)) {
parseMimeHeaderValue(content, p); meta[name].insert(0, cstr_fldhtm);
map<string, string>::const_iterator k; }
if ((k = p.params.find(cstr_html_charset)) != }
p.params.end()) { }
charset = k->second; string hdr;
if (!charset.empty() && if (get_parameter("http-equiv", hdr)) {
!samecharset(charset, fromcharset)) { lowercase_term(hdr);
LOGDEB1("Doc http-equiv charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" ); if (hdr == "content-type") {
throw false; MimeHeaderValue p;
} parseMimeHeaderValue(content, p);
} map<string, string>::const_iterator k;
} if ((k = p.params.find(cstr_html_charset)) !=
} p.params.end()) {
} charset = k->second;
string newcharset; if (!charset.empty() &&
if (get_parameter(cstr_html_charset, newcharset)) { !samecharset(charset, fromcharset)) {
// HTML5 added: <meta charset="..."> LOGDEB1("Doc http-equiv charset '" << charset <<
lowercase_term(newcharset); "' differs from dir deflt '" <<
charset = newcharset; fromcharset << "'\n");
if (!charset.empty() && throw false;
!samecharset(charset, fromcharset)) { }
LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" ); }
throw false; }
} }
} }
break; string newcharset;
} else if (tag == "marquee" || tag == "menu" || tag == "multicol") if (get_parameter(cstr_html_charset, newcharset)) {
pending_space = true; // HTML5 added: <meta charset="...">
break; lowercase_term(newcharset);
case 'o': charset = newcharset;
if (tag == "ol" || tag == "option") pending_space = true; if (!charset.empty() &&
break; !samecharset(charset, fromcharset)) {
case 'p': LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
if (tag == "p" || tag == "plaintext") { throw false;
dump += '\n'; }
pending_space = true; }
} else if (tag == "pre") { break;
in_pre_tag = true; } else if (tag == "marquee" || tag == "menu" || tag == "multicol")
dump += '\n'; pending_space = true;
pending_space = true; break;
} case 'o':
break; if (tag == "ol" || tag == "option") pending_space = true;
case 'q': break;
if (tag == "q") pending_space = true; case 'p':
break; if (tag == "p" || tag == "plaintext") {
case 's': dump += '\n';
if (tag == "style") { pending_space = true;
in_style_tag = true; } else if (tag == "pre") {
break; in_pre_tag = true;
} else if (tag == "script") { dump += '\n';
in_script_tag = true; pending_space = true;
break; }
} else if (tag == "select") break;
pending_space = true; case 'q':
break; if (tag == "q") pending_space = true;
case 't': break;
if (tag == "table" || tag == "td" || tag == "textarea" || case 's':
tag == "th") { if (tag == "style") {
pending_space = true; in_style_tag = true;
} else if (tag == "title") { break;
in_title_tag = true; } else if (tag == "script") {
} in_script_tag = true;
break; break;
case 'u': } else if (tag == "select")
if (tag == "ul") pending_space = true; pending_space = true;
break; break;
case 'x': case 't':
if (tag == "xmp") pending_space = true; if (tag == "table" || tag == "td" || tag == "textarea" ||
break; tag == "th") {
pending_space = true;
} else if (tag == "title") {
in_title_tag = true;
}
break;
case 'u':
if (tag == "ul") pending_space = true;
break;
case 'x':
if (tag == "xmp") pending_space = true;
break;
} }
return true; return true;
} }
@ -484,85 +495,85 @@ MyHtmlParser::closing_tag(const string &tag)
LOGDEB2("closing_tag: [" << (tag) << "]\n" ); LOGDEB2("closing_tag: [" << (tag) << "]\n" );
if (tag.empty()) return true; if (tag.empty()) return true;
switch (tag[0]) { switch (tag[0]) {
case 'a': case 'a':
if (tag == "address") pending_space = true; if (tag == "address") pending_space = true;
break; break;
case 'b': case 'b':
// body: We used to signal and end of doc here by returning // body: We used to signal and end of doc here by returning
// false but the browsers just ignore body and html // false but the browsers just ignore body and html
// closing tags if there is further text, so it seems right // closing tags if there is further text, so it seems right
// to do the same // to do the same
if (tag == "blockquote" || tag == "br") pending_space = true; if (tag == "blockquote" || tag == "br") pending_space = true;
break; break;
case 'c': case 'c':
if (tag == "center") pending_space = true; if (tag == "center") pending_space = true;
break; break;
case 'd': case 'd':
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" || if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
tag == "dt") pending_space = true; tag == "dt") pending_space = true;
break; break;
case 'f': case 'f':
if (tag == "fieldset" || tag == "form") pending_space = true; if (tag == "fieldset" || tag == "form") pending_space = true;
break; break;
case 'h': case 'h':
// hr, and h1, ..., h6 // hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1])) if (tag.length() == 2 && strchr("r123456", tag[1]))
pending_space = true; pending_space = true;
break; break;
case 'i': case 'i':
if (tag == "iframe") pending_space = true; if (tag == "iframe") pending_space = true;
break; break;
case 'l': case 'l':
if (tag == "legend" || tag == "li" || tag == "listing") if (tag == "legend" || tag == "li" || tag == "listing")
pending_space = true; pending_space = true;
break; break;
case 'm': case 'm':
if (tag == "marquee" || tag == "menu") pending_space = true; if (tag == "marquee" || tag == "menu") pending_space = true;
break; break;
case 'o': case 'o':
if (tag == "ol" || tag == "option") pending_space = true; if (tag == "ol" || tag == "option") pending_space = true;
break; break;
case 'p': case 'p':
if (tag == "p") { if (tag == "p") {
pending_space = true; pending_space = true;
} else if (tag == "pre") { } else if (tag == "pre") {
pending_space = true; pending_space = true;
in_pre_tag = false; in_pre_tag = false;
} }
break; break;
case 'q': case 'q':
if (tag == "q") pending_space = true; if (tag == "q") pending_space = true;
break; break;
case 's': case 's':
if (tag == "style") { if (tag == "style") {
in_style_tag = false; in_style_tag = false;
break; break;
} }
if (tag == "script") { if (tag == "script") {
in_script_tag = false; in_script_tag = false;
break; break;
} }
if (tag == "select") pending_space = true; if (tag == "select") pending_space = true;
break; break;
case 't': case 't':
if (tag == "title") { if (tag == "title") {
in_title_tag = false; in_title_tag = false;
if (meta.find("title") == meta.end()|| meta["title"].empty()) { if (meta.find("title") == meta.end()|| meta["title"].empty()) {
meta["title"] = titledump; meta["title"] = titledump;
titledump.clear(); titledump.clear();
} }
break; break;
} }
if (tag == "table" || tag == "td" || tag == "textarea" || if (tag == "table" || tag == "td" || tag == "textarea" ||
tag == "th") pending_space = true; tag == "th") pending_space = true;
break; break;
case 'u': case 'u':
if (tag == "ul") pending_space = true; if (tag == "ul") pending_space = true;
break; break;
case 'x': case 'x':
if (tag == "xmp") pending_space = true; if (tag == "xmp") pending_space = true;
break; break;
} }
return true; return true;
} }