html/xml meta: avoid appending a value that is already present in the string

This commit is contained in:
Jean-Francois Dockes 2020-01-30 08:37:46 +01:00
parent 552510db06
commit e5af1651fa
3 changed files with 454 additions and 440 deletions

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2005 J.F.Dockes
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -38,10 +38,10 @@ bool MimeHandlerExecMultiple::startCmd()
{
LOGDEB("MimeHandlerExecMultiple::startCmd\n");
if (params.empty()) {
// Hu ho
LOGERR("MHExecMultiple::startCmd: empty params\n");
m_reason = "RECFILTERROR BADCONFIG";
return false;
// Hu ho
LOGERR("MHExecMultiple::startCmd: empty params\n");
m_reason = "RECFILTERROR BADCONFIG";
return false;
}
// Command name
@ -55,7 +55,7 @@ bool MimeHandlerExecMultiple::startCmd()
m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
"RECOLL_FILTER_FORPREVIEW=no");
"RECOLL_FILTER_FORPREVIEW=no");
m_cmd.setrlimit_as(m_filtermaxmbytes);
m_adv.setmaxsecs(m_filtermaxseconds);
@ -156,11 +156,11 @@ bool MimeHandlerExecMultiple::next_document()
{
LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
if (m_havedoc == false)
return false;
return false;
if (missingHelper) {
LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
return false;
LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
return false;
}
if (m_cmd.getChildPid() <= 0 && !startCmd()) {
@ -178,15 +178,15 @@ bool MimeHandlerExecMultiple::next_document()
ostringstream obuf;
string file_md5;
if (m_filefirst) {
if (!m_forPreview && !m_nomd5) {
string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) {
file_md5 = MD5HexPrint(md5, xmd5);
} else {
LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
if (!m_forPreview && !m_nomd5) {
string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) {
file_md5 = MD5HexPrint(md5, xmd5);
} else {
LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
"]: " << reason << "\n");
}
}
}
}
obuf << "FileName: " << m_fn.length() << "\n" << m_fn;
// m_filefirst is set to true by set_document_file()
m_filefirst = false;
@ -194,13 +194,13 @@ bool MimeHandlerExecMultiple::next_document()
obuf << "Filename: " << 0 << "\n";
}
if (!m_ipath.empty()) {
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
m_ipath << "]\n");
obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath;
}
if (!m_dfltInputCharset.empty()) {
obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n"
<< m_dfltInputCharset;
<< m_dfltInputCharset;
}
obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
obuf << "\n";
@ -247,10 +247,10 @@ bool MimeHandlerExecMultiple::next_document()
eofnow_received = true;
} else if (!stringlowercmp("fileerror:", name)) {
LOGDEB("MHExecMultiple: got FILEERROR\n");
fileerror_received = true;
fileerror_received = true;
} else if (!stringlowercmp("subdocerror:", name)) {
LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
subdocerror_received = true;
subdocerror_received = true;
} else if (!stringlowercmp("ipath:", name)) {
ipath = data;
LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
@ -264,7 +264,11 @@ bool MimeHandlerExecMultiple::next_document()
string nm = stringtolower((const string&)name);
trimstring(nm, ":");
LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
m_metaData[nm] += data;
auto it = m_metaData.find(nm);
if (it == m_metaData.end() ||
it->second.find(data) == std::string::npos) {
m_metaData[nm] += data;
}
}
if (loop == 200) {
// ??
@ -279,7 +283,7 @@ bool MimeHandlerExecMultiple::next_document()
return false;
}
if (subdocerror_received) {
return false;
return false;
}
// It used to be that eof could be signalled just by an empty document, but
@ -291,13 +295,13 @@ bool MimeHandlerExecMultiple::next_document()
}
if (!ipath.empty()) {
// If this has an ipath, it is an internal doc from a
// multi-document file. In this case, either the filter
// supplies the mimetype, or the ipath MUST be a filename-like
// string which we can use to compute a mime type
// If this has an ipath, it is an internal doc from a
// multi-document file. In this case, either the filter
// supplies the mimetype, or the ipath MUST be a filename-like
// string which we can use to compute a mime type
m_metaData[cstr_dj_keyipath] = ipath;
if (mtype.empty()) {
LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
"for a guess\n");
mtype = mimetype(ipath, 0, m_config, false);
if (mtype.empty()) {
@ -313,16 +317,16 @@ bool MimeHandlerExecMultiple::next_document()
}
}
m_metaData[cstr_dj_keymt] = mtype;
if (!m_forPreview) {
string md5, xmd5;
MD5String(m_metaData[cstr_dj_keycontent], md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
}
if (!m_forPreview) {
string md5, xmd5;
MD5String(m_metaData[cstr_dj_keycontent], md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
}
} else {
// "Self" document.
// "Self" document.
m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
m_metaData.erase(cstr_dj_keyipath);
if (!m_forPreview) {
if (!m_forPreview) {
m_metaData[cstr_dj_keymd5] = file_md5;
}
}
@ -339,4 +343,3 @@ bool MimeHandlerExecMultiple::next_document()
LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
return true;
}

View File

@ -38,7 +38,7 @@ bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn)
string reason;
if (!file_to_string(fn, otext, &reason)) {
LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n");
return false;
return false;
}
m_filename = fn;
return set_document_string(mt, otext);
@ -51,10 +51,10 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
m_havedoc = true;
if (!m_forPreview) {
// We want to compute the md5 now because we may modify m_html later
string md5, xmd5;
MD5String(htext, md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
// We want to compute the md5 now because we may modify m_html later
string md5, xmd5;
MD5String(htext, md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
}
return true;
}
@ -62,7 +62,7 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
bool MimeHandlerHtml::next_document()
{
if (m_havedoc == false)
return false;
return false;
m_havedoc = false;
// If set_doc(fn), take note of file name.
string fn = m_filename;
@ -70,12 +70,12 @@ bool MimeHandlerHtml::next_document()
string charset = m_dfltInputCharset;
LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset
<< "]\n");
<< "]\n");
// Override default input charset if someone took care to set one:
map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset);
if (it != m_metaData.end() && !it->second.empty()) {
charset = it->second;
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
charset = it->second;
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
charset << "]\n");
}
@ -88,78 +88,78 @@ bool MimeHandlerHtml::next_document()
MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
LOGDEB("Html::mkDoc: pass " << pass << "\n");
MyHtmlParser p;
string transcoded;
LOGDEB("Html::mkDoc: pass " << pass << "\n");
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
// Try transcoding. If it fails, use original text.
int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) <<
"]");
transcoded = m_html;
// We don't know the charset, at all
p.reset_charsets();
charset.clear();
} else {
if (ecnt) {
if (pass == 0) {
LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
transcoded = m_html;
// We don't know the charset, at all
p.reset_charsets();
charset.clear();
} else {
if (ecnt) {
if (pass == 0) {
LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
" errors for ["<<(fn.empty()?"unknown":fn)<< "]\n");
} else {
LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
} else {
LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
" errors for ["<< (fn.empty()?"unknown":fn)<< "]\n");
}
}
// charset has the putative source charset, transcoded is now
// in utf-8
p.set_charsets(charset, "utf-8");
}
}
}
// charset has the putative source charset, transcoded is now
// in utf-8
p.set_charsets(charset, "utf-8");
}
try {
p.parse_html(transcoded);
// No exception: ok? But throw true to use the same
// code path as if an exception had been thrown by parse_html
throw true;
break;
} catch (bool diag) {
result = p;
if (diag == true) {
// Parser throws true at end of text. ok
try {
p.parse_html(transcoded);
// No exception: ok? But throw true to use the same
// code path as if an exception had been thrown by parse_html
throw true;
break;
} catch (bool diag) {
result = p;
if (diag == true) {
// Parser throws true at end of text. ok
if (m_forPreview) {
// Save the html text
m_html = transcoded;
// In many cases, we need to change the charset decl,
// because the file was transcoded. It seems that just
// inserting one is enough (only the 1st one seems to
// be used by browsers/qtextedit).
if (m_forPreview) {
// Save the html text
m_html = transcoded;
// In many cases, we need to change the charset decl,
// because the file was transcoded. It seems that just
// inserting one is enough (only the 1st one seems to
// be used by browsers/qtextedit).
string::size_type idx = m_html.find("<head>");
if (idx == string::npos)
idx = m_html.find("<HEAD>");
if (idx != string::npos)
m_html.replace(idx+6, 0,
"<meta http-equiv=\"content-type\" "
"content=\"text/html; charset=utf-8\">");
}
if (idx == string::npos)
idx = m_html.find("<HEAD>");
if (idx != string::npos)
m_html.replace(idx+6, 0,
"<meta http-equiv=\"content-type\" "
"content=\"text/html; charset=utf-8\">");
}
break;
}
break;
}
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
result.get_charset() << "]\n");
if (!result.get_charset().empty() &&
!samecharset(result.get_charset(), result.fromcharset)) {
LOGDEB("textHtmlToDoc: reparse for charsets\n");
// Set the origin charset as specified in document before
// transcoding again
charset = result.get_charset();
} else {
LOGERR("textHtmlToDoc:: error: non charset exception\n");
return false;
}
}
if (!result.get_charset().empty() &&
!samecharset(result.get_charset(), result.fromcharset)) {
LOGDEB("textHtmlToDoc: reparse for charsets\n");
// Set the origin charset as specified in document before
// transcoding again
charset = result.get_charset();
} else {
LOGERR("textHtmlToDoc:: error: non charset exception\n");
return false;
}
}
}
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
@ -168,13 +168,13 @@ bool MimeHandlerHtml::next_document()
// Avoid setting empty values which would crush ones possibly inherited
// from parent (if we're an attachment)
if (!result.dmtime.empty())
m_metaData[cstr_dj_keymd] = result.dmtime;
m_metaData[cstr_dj_keymd] = result.dmtime;
m_metaData[cstr_dj_keymt] = cstr_textplain;
for (map<string,string>::const_iterator it = result.meta.begin();
it != result.meta.end(); it++) {
if (!it->second.empty())
m_metaData[it->first] = it->second;
for (const auto& entry : result.meta) {
if (!entry.second.empty()) {
m_metaData[entry.first] = entry.second;
}
}
return true;
}

View File

@ -161,19 +161,19 @@ map<string, string> my_named_ents;
class NamedEntsInitializer {
public:
NamedEntsInitializer()
{
for (int i = 0;;) {
const char *ent;
const char *val;
ent = epairs[i++];
if (ent == 0)
break;
val = epairs[i++];
if (val == 0)
break;
my_named_ents[string(ent)] = val;
}
}
{
for (int i = 0;;) {
const char *ent;
const char *val;
ent = epairs[i++];
if (ent == 0)
break;
val = epairs[i++];
if (val == 0)
break;
my_named_ents[string(ent)] = val;
}
}
};
static NamedEntsInitializer namedEntsInitializerInstance;
@ -198,58 +198,58 @@ void MyHtmlParser::decode_entities(string &s)
// so don't do it. If charset known, caller has converted text to utf-8,
// and this is also how we translate entities
// if (tocharset != "utf-8")
// return;
// return;
// We need a const_iterator version of s.end() - otherwise the
// find() and find_if() templates don't work...
string::const_iterator amp = s.begin(), s_end = s.end();
while ((amp = find(amp, s_end, '&')) != s_end) {
unsigned int val = 0;
string::const_iterator end, p = amp + 1;
string subs;
if (p != s_end && *p == '#') {
p++;
if (p != s_end && (*p == 'x' || *p == 'X')) {
// hex
p++;
end = find_if(p, s_end, p_notxdigit);
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
} else {
// number
end = find_if(p, s_end, p_notdigit);
val = atoi(s.substr(p - s.begin(), end - p).c_str());
}
} else {
end = find_if(p, s_end, p_notalnum);
string code = s.substr(p - s.begin(), end - p);
map<string, string>::const_iterator i;
i = my_named_ents.find(code);
if (i != my_named_ents.end())
subs = i->second;
}
unsigned int val = 0;
string::const_iterator end, p = amp + 1;
string subs;
if (p != s_end && *p == '#') {
p++;
if (p != s_end && (*p == 'x' || *p == 'X')) {
// hex
p++;
end = find_if(p, s_end, p_notxdigit);
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
} else {
// number
end = find_if(p, s_end, p_notdigit);
val = atoi(s.substr(p - s.begin(), end - p).c_str());
}
} else {
end = find_if(p, s_end, p_notalnum);
string code = s.substr(p - s.begin(), end - p);
map<string, string>::const_iterator i;
i = my_named_ents.find(code);
if (i != my_named_ents.end())
subs = i->second;
}
if (end < s_end && *end == ';')
end++;
if (val) {
// The code is the code position for a unicode char. We need
// to translate it to an utf-8 string.
string utf16be;
utf16be += char(val / 256);
utf16be += char(val % 256);
transcode(utf16be, subs, "UTF-16BE", "UTF-8");
}
if (end < s_end && *end == ';')
end++;
if (val) {
// The code is the code position for a unicode char. We need
// to translate it to an utf-8 string.
string utf16be;
utf16be += char(val / 256);
utf16be += char(val % 256);
transcode(utf16be, subs, "UTF-16BE", "UTF-8");
}
if (subs.length() > 0) {
string::size_type amp_pos = amp - s.begin();
s.replace(amp_pos, end - amp, subs);
s_end = s.end();
// We've modified the string, so the iterators are no longer
// valid...
amp = s.begin() + amp_pos + subs.length();
} else {
amp = end;
}
if (subs.length() > 0) {
string::size_type amp_pos = amp - s.begin();
s.replace(amp_pos, end - amp, subs);
s_end = s.end();
// We've modified the string, so the iterators are no longer
// valid...
amp = s.begin() + amp_pos + subs.length();
} else {
amp = end;
}
}
}
@ -265,35 +265,35 @@ MyHtmlParser::process_text(const string &text)
CancelCheck::instance().checkCancel();
if (!in_script_tag && !in_style_tag) {
if (in_title_tag) {
titledump += text;
} else if (!in_pre_tag) {
string::size_type b = 0;
bool only_space = true;
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
only_space = false;
// If space specifically needed or chunk begins with
// whitespace, add exactly one space
if (pending_space || b != 0) {
dump += ' ';
}
pending_space = true;
string::size_type e = text.find_first_of(WHITESPACE, b);
if (e == string::npos) {
dump += text.substr(b);
pending_space = false;
break;
}
dump += text.substr(b, e - b);
b = e + 1;
}
if (only_space)
pending_space = true;
} else {
if (pending_space)
dump += ' ';
dump += text;
}
if (in_title_tag) {
titledump += text;
} else if (!in_pre_tag) {
string::size_type b = 0;
bool only_space = true;
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
only_space = false;
// If space specifically needed or chunk begins with
// whitespace, add exactly one space
if (pending_space || b != 0) {
dump += ' ';
}
pending_space = true;
string::size_type e = text.find_first_of(WHITESPACE, b);
if (e == string::npos) {
dump += text.substr(b);
pending_space = false;
break;
}
dump += text.substr(b, e - b);
b = e + 1;
}
if (only_space)
pending_space = true;
} else {
if (pending_space)
dump += ' ';
dump += text;
}
}
}
@ -305,175 +305,186 @@ MyHtmlParser::opening_tag(const string &tag)
cout << "TAG: " << tag << ": " << endl;
map<string, string>::const_iterator x;
for (x = p.begin(); x != p.end(); x++) {
cout << " " << x->first << " -> '" << x->second << "'" << endl;
cout << " " << x->first << " -> '" << x->second << "'" << endl;
}
#endif
if (tag.empty()) return true;
switch (tag[0]) {
case 'a':
if (tag == "address") pending_space = true;
break;
case 'b':
// body: some bad docs have several opening body tags and
// even text before the body is displayed by Opera and
// Firefox. We used to reset the dump each time we saw a
// body tag, but I can't see any reason to do so.
case 'a':
if (tag == "address") pending_space = true;
break;
case 'b':
// body: some bad docs have several opening body tags and
// even text before the body is displayed by Opera and
// Firefox. We used to reset the dump each time we saw a
// body tag, but I can't see any reason to do so.
if (tag == "blockquote" || tag == "br") {
dump += '\n';
pending_space = true;
}
break;
case 'c':
if (tag == "center") pending_space = true;
break;
case 'd':
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
tag == "dt") pending_space = true;
if (tag == "dt")
dump += '\n';
break;
case 'e':
if (tag == "embed") pending_space = true;
break;
case 'f':
if (tag == "fieldset" || tag == "form") pending_space = true;
break;
case 'h':
// hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1])) {
dump += '\n';
pending_space = true;
}
break;
case 'i':
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
tag == "input") pending_space = true;
break;
case 'k':
if (tag == "keygen") pending_space = true;
break;
case 'l':
if (tag == "legend" || tag == "li" || tag == "listing") {
dump += '\n';
pending_space = true;
}
break;
case 'm':
if (tag == "meta") {
string content;
if (get_parameter(cstr_html_content, content)) {
string name;
if (get_parameter("name", name)) {
lowercase_term(name);
if (name == "date") {
// Specific to Recoll filters.
decode_entities(content);
struct tm tm;
memset(&tm, 0, sizeof(tm));
if (strptime(content.c_str(),
" %Y-%m-%d %H:%M:%S ", &tm) ||
strptime(content.c_str(),
"%Y-%m-%dT%H:%M:%S", &tm)
) {
char ascuxtime[100];
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
dmtime = ascuxtime;
}
} else if (name == "robots") {
} else {
string markup;
bool ishtml = false;
if (get_parameter("markup", markup)) {
if (!stringlowercmp("html", markup)) {
ishtml = true;
}
}
if (!meta[name].empty())
meta[name] += ' ';
decode_entities(content);
meta[name] += content;
if (ishtml &&
meta[name].compare(0, cstr_fldhtm.size(),
cstr_fldhtm)) {
meta[name].insert(0, cstr_fldhtm);
}
}
}
string hdr;
if (get_parameter("http-equiv", hdr)) {
lowercase_term(hdr);
if (hdr == "content-type") {
MimeHeaderValue p;
parseMimeHeaderValue(content, p);
map<string, string>::const_iterator k;
if ((k = p.params.find(cstr_html_charset)) !=
p.params.end()) {
charset = k->second;
if (!charset.empty() &&
!samecharset(charset, fromcharset)) {
LOGDEB1("Doc http-equiv charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
throw false;
}
}
}
}
}
string newcharset;
if (get_parameter(cstr_html_charset, newcharset)) {
// HTML5 added: <meta charset="...">
lowercase_term(newcharset);
charset = newcharset;
if (!charset.empty() &&
!samecharset(charset, fromcharset)) {
LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
throw false;
}
}
break;
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
pending_space = true;
break;
case 'o':
if (tag == "ol" || tag == "option") pending_space = true;
break;
case 'p':
if (tag == "p" || tag == "plaintext") {
dump += '\n';
pending_space = true;
} else if (tag == "pre") {
in_pre_tag = true;
dump += '\n';
pending_space = true;
}
break;
case 'q':
if (tag == "q") pending_space = true;
break;
case 's':
if (tag == "style") {
in_style_tag = true;
break;
} else if (tag == "script") {
in_script_tag = true;
break;
} else if (tag == "select")
pending_space = true;
break;
case 't':
if (tag == "table" || tag == "td" || tag == "textarea" ||
tag == "th") {
pending_space = true;
} else if (tag == "title") {
in_title_tag = true;
}
break;
case 'u':
if (tag == "ul") pending_space = true;
break;
case 'x':
if (tag == "xmp") pending_space = true;
break;
if (tag == "blockquote" || tag == "br") {
dump += '\n';
pending_space = true;
}
break;
case 'c':
if (tag == "center") pending_space = true;
break;
case 'd':
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
tag == "dt") pending_space = true;
if (tag == "dt")
dump += '\n';
break;
case 'e':
if (tag == "embed") pending_space = true;
break;
case 'f':
if (tag == "fieldset" || tag == "form") pending_space = true;
break;
case 'h':
// hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1])) {
dump += '\n';
pending_space = true;
}
break;
case 'i':
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
tag == "input") pending_space = true;
break;
case 'k':
if (tag == "keygen") pending_space = true;
break;
case 'l':
if (tag == "legend" || tag == "li" || tag == "listing") {
dump += '\n';
pending_space = true;
}
break;
case 'm':
if (tag == "meta") {
string content;
if (get_parameter(cstr_html_content, content)) {
string name;
if (get_parameter("name", name)) {
lowercase_term(name);
if (name == "date") {
// Specific to Recoll filters.
decode_entities(content);
struct tm tm;
memset(&tm, 0, sizeof(tm));
if (strptime(content.c_str(),
" %Y-%m-%d %H:%M:%S ", &tm) ||
strptime(content.c_str(),
"%Y-%m-%dT%H:%M:%S", &tm)
) {
char ascuxtime[100];
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
dmtime = ascuxtime;
}
} else if (name == "robots") {
} else {
string markup;
bool ishtml = false;
if (get_parameter("markup", markup)) {
if (!stringlowercmp("html", markup)) {
ishtml = true;
}
}
decode_entities(content);
// Set metadata field, avoid appending
// multiple identical instances.
auto it = meta.find(name);
if (it == meta.end() || it->second.find(content) ==
string::npos) {
if (it != meta.end()) {
it->second += ' ';
it->second += content;
} else {
meta[name] = content;
}
}
if (ishtml &&
meta[name].compare(0, cstr_fldhtm.size(),
cstr_fldhtm)) {
meta[name].insert(0, cstr_fldhtm);
}
}
}
string hdr;
if (get_parameter("http-equiv", hdr)) {
lowercase_term(hdr);
if (hdr == "content-type") {
MimeHeaderValue p;
parseMimeHeaderValue(content, p);
map<string, string>::const_iterator k;
if ((k = p.params.find(cstr_html_charset)) !=
p.params.end()) {
charset = k->second;
if (!charset.empty() &&
!samecharset(charset, fromcharset)) {
LOGDEB1("Doc http-equiv charset '" << charset <<
"' differs from dir deflt '" <<
fromcharset << "'\n");
throw false;
}
}
}
}
}
string newcharset;
if (get_parameter(cstr_html_charset, newcharset)) {
// HTML5 added: <meta charset="...">
lowercase_term(newcharset);
charset = newcharset;
if (!charset.empty() &&
!samecharset(charset, fromcharset)) {
LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
throw false;
}
}
break;
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
pending_space = true;
break;
case 'o':
if (tag == "ol" || tag == "option") pending_space = true;
break;
case 'p':
if (tag == "p" || tag == "plaintext") {
dump += '\n';
pending_space = true;
} else if (tag == "pre") {
in_pre_tag = true;
dump += '\n';
pending_space = true;
}
break;
case 'q':
if (tag == "q") pending_space = true;
break;
case 's':
if (tag == "style") {
in_style_tag = true;
break;
} else if (tag == "script") {
in_script_tag = true;
break;
} else if (tag == "select")
pending_space = true;
break;
case 't':
if (tag == "table" || tag == "td" || tag == "textarea" ||
tag == "th") {
pending_space = true;
} else if (tag == "title") {
in_title_tag = true;
}
break;
case 'u':
if (tag == "ul") pending_space = true;
break;
case 'x':
if (tag == "xmp") pending_space = true;
break;
}
return true;
}
@ -484,85 +495,85 @@ MyHtmlParser::closing_tag(const string &tag)
LOGDEB2("closing_tag: [" << (tag) << "]\n" );
if (tag.empty()) return true;
switch (tag[0]) {
case 'a':
if (tag == "address") pending_space = true;
break;
case 'b':
// body: We used to signal and end of doc here by returning
// false but the browsers just ignore body and html
// closing tags if there is further text, so it seems right
// to do the same
case 'a':
if (tag == "address") pending_space = true;
break;
case 'b':
// body: We used to signal and end of doc here by returning
// false but the browsers just ignore body and html
// closing tags if there is further text, so it seems right
// to do the same
if (tag == "blockquote" || tag == "br") pending_space = true;
break;
case 'c':
if (tag == "center") pending_space = true;
break;
case 'd':
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
tag == "dt") pending_space = true;
break;
case 'f':
if (tag == "fieldset" || tag == "form") pending_space = true;
break;
case 'h':
// hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1]))
pending_space = true;
break;
case 'i':
if (tag == "iframe") pending_space = true;
break;
case 'l':
if (tag == "legend" || tag == "li" || tag == "listing")
pending_space = true;
break;
case 'm':
if (tag == "marquee" || tag == "menu") pending_space = true;
break;
case 'o':
if (tag == "ol" || tag == "option") pending_space = true;
break;
case 'p':
if (tag == "p") {
pending_space = true;
} else if (tag == "pre") {
pending_space = true;
in_pre_tag = false;
}
break;
case 'q':
if (tag == "q") pending_space = true;
break;
case 's':
if (tag == "style") {
in_style_tag = false;
break;
}
if (tag == "script") {
in_script_tag = false;
break;
}
if (tag == "select") pending_space = true;
break;
case 't':
if (tag == "title") {
in_title_tag = false;
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
meta["title"] = titledump;
titledump.clear();
}
break;
}
if (tag == "table" || tag == "td" || tag == "textarea" ||
tag == "th") pending_space = true;
break;
case 'u':
if (tag == "ul") pending_space = true;
break;
case 'x':
if (tag == "xmp") pending_space = true;
break;
if (tag == "blockquote" || tag == "br") pending_space = true;
break;
case 'c':
if (tag == "center") pending_space = true;
break;
case 'd':
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
tag == "dt") pending_space = true;
break;
case 'f':
if (tag == "fieldset" || tag == "form") pending_space = true;
break;
case 'h':
// hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1]))
pending_space = true;
break;
case 'i':
if (tag == "iframe") pending_space = true;
break;
case 'l':
if (tag == "legend" || tag == "li" || tag == "listing")
pending_space = true;
break;
case 'm':
if (tag == "marquee" || tag == "menu") pending_space = true;
break;
case 'o':
if (tag == "ol" || tag == "option") pending_space = true;
break;
case 'p':
if (tag == "p") {
pending_space = true;
} else if (tag == "pre") {
pending_space = true;
in_pre_tag = false;
}
break;
case 'q':
if (tag == "q") pending_space = true;
break;
case 's':
if (tag == "style") {
in_style_tag = false;
break;
}
if (tag == "script") {
in_script_tag = false;
break;
}
if (tag == "select") pending_space = true;
break;
case 't':
if (tag == "title") {
in_title_tag = false;
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
meta["title"] = titledump;
titledump.clear();
}
break;
}
if (tag == "table" || tag == "td" || tag == "textarea" ||
tag == "th") pending_space = true;
break;
case 'u':
if (tag == "ul") pending_space = true;
break;
case 'x':
if (tag == "xmp") pending_space = true;
break;
}
return true;
}