html/xml meta: avoid appending a value that is already present in the string

2020-01-30 08:37:46 +01:00 · 2020-01-30 08:37:46 +01:00 · e5af1651fa
commit e5af1651fa
parent 552510db06
3 changed files with 454 additions and 440 deletions
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@@ -1,4 +1,4 @@
-	/* Copyright (C) 2005 J.F.Dockes 
+/* Copyright (C) 2005 J.F.Dockes 
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@@ -38,10 +38,10 @@ bool MimeHandlerExecMultiple::startCmd()
 {
    LOGDEB("MimeHandlerExecMultiple::startCmd\n");
    if (params.empty()) {
-	// Hu ho
+        // Hu ho
-	LOGERR("MHExecMultiple::startCmd: empty params\n");
+        LOGERR("MHExecMultiple::startCmd: empty params\n");
-	m_reason = "RECFILTERROR BADCONFIG";
+        m_reason = "RECFILTERROR BADCONFIG";
-	return false;
+        return false;
    }
    // Command name
@@ -55,7 +55,7 @@ bool MimeHandlerExecMultiple::startCmd()
    m_cmd.putenv("RECOLL_CONFDIR", m_config->getConfDir());
    m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
-		"RECOLL_FILTER_FORPREVIEW=no");
+                 "RECOLL_FILTER_FORPREVIEW=no");
    m_cmd.setrlimit_as(m_filtermaxmbytes);
    m_adv.setmaxsecs(m_filtermaxseconds);
@@ -156,11 +156,11 @@ bool MimeHandlerExecMultiple::next_document()
 {
    LOGDEB("MimeHandlerExecMultiple::next_document(): [" << m_fn << "]\n");
    if (m_havedoc == false)
-	return false;
+        return false;
    if (missingHelper) {
-	LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
+        LOGDEB("MHExecMultiple::next_document(): helper known missing\n");
-	return false;
+        return false;
    }
    if (m_cmd.getChildPid() <= 0 && !startCmd()) {
@@ -178,15 +178,15 @@ bool MimeHandlerExecMultiple::next_document()
    ostringstream obuf;
    string file_md5;
    if (m_filefirst) {
-	if (!m_forPreview && !m_nomd5) {
+        if (!m_forPreview && !m_nomd5) {
-	    string md5, xmd5, reason;
+            string md5, xmd5, reason;
-	    if (MD5File(m_fn, md5, &reason)) {
+            if (MD5File(m_fn, md5, &reason)) {
-		file_md5 = MD5HexPrint(md5, xmd5);
+                file_md5 = MD5HexPrint(md5, xmd5);
-	    } else {
+            } else {
-		LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
+                LOGERR("MimeHandlerExecM: cant compute md5 for [" << m_fn <<
                       "]: " << reason << "\n");
-	    }
+            }
-	}
+        }
        obuf << "FileName: " << m_fn.length() << "\n" << m_fn;
        // m_filefirst is set to true by set_document_file()
        m_filefirst = false;
@@ -194,13 +194,13 @@ bool MimeHandlerExecMultiple::next_document()
        obuf << "Filename: " << 0 << "\n";
    }
    if (!m_ipath.empty()) {
-	LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
+        LOGDEB("next_doc: sending ipath " << m_ipath.length() << " val [" <<
               m_ipath << "]\n");
        obuf << "Ipath: " << m_ipath.length() << "\n" << m_ipath;
    }
    if (!m_dfltInputCharset.empty()) {
        obuf << "DflInCS: " << m_dfltInputCharset.length() << "\n" 
-	     << m_dfltInputCharset;
+             << m_dfltInputCharset;
    }
    obuf << "Mimetype: " << m_mimeType.length() << "\n" << m_mimeType;
    obuf << "\n";
@@ -247,10 +247,10 @@ bool MimeHandlerExecMultiple::next_document()
            eofnow_received = true;
        } else if (!stringlowercmp("fileerror:", name)) {
            LOGDEB("MHExecMultiple: got FILEERROR\n");
-	    fileerror_received = true;
+            fileerror_received = true;
        } else if (!stringlowercmp("subdocerror:", name)) {
            LOGDEB("MHExecMultiple: got SUBDOCERROR\n");
-	    subdocerror_received = true;
+            subdocerror_received = true;
        } else if (!stringlowercmp("ipath:", name)) {
            ipath = data;
            LOGDEB("MHExecMultiple: got ipath [" << data << "]\n");
@@ -264,7 +264,11 @@ bool MimeHandlerExecMultiple::next_document()
            string nm = stringtolower((const string&)name);
            trimstring(nm, ":");
            LOGDEB("MHExecMultiple: got [" << nm << "] -> [" << data << "]\n");
-            m_metaData[nm] += data;
+            auto it = m_metaData.find(nm);
+            if (it == m_metaData.end() ||
+                it->second.find(data) == std::string::npos) {
+                m_metaData[nm] += data;
+            }
        }
        if (loop == 200) {
            // ?? 
@@ -279,7 +283,7 @@ bool MimeHandlerExecMultiple::next_document()
        return false;
    }
    if (subdocerror_received) {
-	return false;
+        return false;
    }
    // It used to be that eof could be signalled just by an empty document, but
@@ -291,13 +295,13 @@ bool MimeHandlerExecMultiple::next_document()
    }
    if (!ipath.empty()) {
-	// If this has an ipath, it is an internal doc from a
+        // If this has an ipath, it is an internal doc from a
-	// multi-document file. In this case, either the filter
+        // multi-document file. In this case, either the filter
-	// supplies the mimetype, or the ipath MUST be a filename-like
+        // supplies the mimetype, or the ipath MUST be a filename-like
-	// string which we can use to compute a mime type
+        // string which we can use to compute a mime type
        m_metaData[cstr_dj_keyipath] = ipath;
        if (mtype.empty()) {
-	    LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
+            LOGDEB0("MHExecMultiple: no mime type from filter, using ipath "
                    "for a guess\n");
            mtype = mimetype(ipath, 0, m_config, false);
            if (mtype.empty()) {
@@ -313,16 +317,16 @@ bool MimeHandlerExecMultiple::next_document()
            }
        }
        m_metaData[cstr_dj_keymt] = mtype;
-	if (!m_forPreview) {
+        if (!m_forPreview) {
-	    string md5, xmd5;
+            string md5, xmd5;
-	    MD5String(m_metaData[cstr_dj_keycontent], md5);
+            MD5String(m_metaData[cstr_dj_keycontent], md5);
-	    m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+            m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
-	}
+        }
    } else {
-	// "Self" document.
+        // "Self" document.
        m_metaData[cstr_dj_keymt] = mtype.empty() ? cstr_texthtml : mtype;
        m_metaData.erase(cstr_dj_keyipath);
-	if (!m_forPreview) {
+        if (!m_forPreview) {
            m_metaData[cstr_dj_keymd5] = file_md5;
        }
    }
@@ -339,4 +343,3 @@ bool MimeHandlerExecMultiple::next_document()
    LOGDEB2("MHExecMultiple: metadata: \n" << metadataAsString());
    return true;
 }
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -38,7 +38,7 @@ bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn)
    string reason;
    if (!file_to_string(fn, otext, &reason)) {
        LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n");
-	return false;
+        return false;
    }
    m_filename = fn;
    return set_document_string(mt, otext);
@@ -51,10 +51,10 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
    m_havedoc = true;
    if (!m_forPreview) {
-	// We want to compute the md5 now because we may modify m_html later
+        // We want to compute the md5 now because we may modify m_html later
-	string md5, xmd5;
+        string md5, xmd5;
-	MD5String(htext, md5);
+        MD5String(htext, md5);
-	m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+        m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
    }
    return true;
 }
@@ -62,7 +62,7 @@ bool MimeHandlerHtml::set_document_string_impl(const string& mt,
 bool MimeHandlerHtml::next_document()
 {
    if (m_havedoc == false)
-	return false;
+        return false;
    m_havedoc = false;
    // If set_doc(fn), take note of file name.
    string fn = m_filename;
@@ -70,12 +70,12 @@ bool MimeHandlerHtml::next_document()
    string charset = m_dfltInputCharset;
    LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset
-          << "]\n");
+           << "]\n");
    // Override default input charset if someone took care to set one:
    map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset);
    if (it != m_metaData.end() && !it->second.empty()) {
-	charset = it->second;
+        charset = it->second;
-	LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
+        LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" <<
               charset << "]\n");
    }
@@ -88,78 +88,78 @@ bool MimeHandlerHtml::next_document()
    MyHtmlParser result;
    for (int pass = 0; pass < 2; pass++) {
-	string transcoded;
+        string transcoded;
-	LOGDEB("Html::mkDoc: pass " << pass << "\n");
+        LOGDEB("Html::mkDoc: pass " << pass << "\n");
-	MyHtmlParser p;
+        MyHtmlParser p;
-	// Try transcoding. If it fails, use original text.
+        // Try transcoding. If it fails, use original text.
-	int ecnt;
+        int ecnt;
-	if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
+        if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
-	    LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
+            LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
                   charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) <<
                   "]");
-	    transcoded = m_html;
+            transcoded = m_html;
-	    // We don't know the charset, at all
+            // We don't know the charset, at all
-	    p.reset_charsets();
+            p.reset_charsets();
-	    charset.clear();
+            charset.clear();
-	} else {
+        } else {
-	    if (ecnt) {
+            if (ecnt) {
-		if (pass == 0) {
+                if (pass == 0) {
-		    LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
+                    LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
                           " errors for ["<<(fn.empty()?"unknown":fn)<< "]\n");
-		} else {
+                } else {
-		    LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
+                    LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
                           " errors for ["<< (fn.empty()?"unknown":fn)<< "]\n");
-		}
+                }
-	    }
+            }
-	    // charset has the putative source charset, transcoded is now
+            // charset has the putative source charset, transcoded is now
-	    // in utf-8
+            // in utf-8
-	    p.set_charsets(charset, "utf-8");
+            p.set_charsets(charset, "utf-8");
-	}
+        }
-	try {
+        try {
-	    p.parse_html(transcoded);
+            p.parse_html(transcoded);
-	    // No exception: ok? But throw true to use the same
+            // No exception: ok? But throw true to use the same
-	    // code path as if an exception had been thrown by parse_html
+            // code path as if an exception had been thrown by parse_html
-	    throw true;
+            throw true;
-	    break;
+            break;
-	} catch (bool diag) {
+        } catch (bool diag) {
-	    result = p;
+            result = p;
-	    if (diag == true) {
+            if (diag == true) {
-		// Parser throws true at end of text. ok
+                // Parser throws true at end of text. ok
-		if (m_forPreview) {
+                if (m_forPreview) {
-		    // Save the html text
+                    // Save the html text
-		    m_html = transcoded;
+                    m_html = transcoded;
-		    // In many cases, we need to change the charset decl,
+                    // In many cases, we need to change the charset decl,
-		    // because the file was transcoded. It seems that just
+                    // because the file was transcoded. It seems that just
-		    // inserting one is enough (only the 1st one seems to
+                    // inserting one is enough (only the 1st one seems to
-		    // be used by browsers/qtextedit).
+                    // be used by browsers/qtextedit).
                    string::size_type idx = m_html.find("<head>");
-		    if (idx == string::npos)
+                    if (idx == string::npos)
-			idx = m_html.find("<HEAD>");
+                        idx = m_html.find("<HEAD>");
-		    if (idx != string::npos)
+                    if (idx != string::npos)
-			m_html.replace(idx+6, 0, 
+                        m_html.replace(idx+6, 0, 
-				       "<meta http-equiv=\"content-type\" "
+                                       "<meta http-equiv=\"content-type\" "
-				       "content=\"text/html; charset=utf-8\">");
+                                       "content=\"text/html; charset=utf-8\">");
-		}
+                }
-		break;
+                break;
-	    }
+            }
-	    LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
+            LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset ["<<
                   result.get_charset() << "]\n");
-	    if (!result.get_charset().empty() && 
+            if (!result.get_charset().empty() && 
-		!samecharset(result.get_charset(), result.fromcharset)) {
+                !samecharset(result.get_charset(), result.fromcharset)) {
-		LOGDEB("textHtmlToDoc: reparse for charsets\n");
+                LOGDEB("textHtmlToDoc: reparse for charsets\n");
-		// Set the origin charset as specified in document before
+                // Set the origin charset as specified in document before
-		// transcoding again
+                // transcoding again
-		charset = result.get_charset();
+                charset = result.get_charset();
-	    } else {
+            } else {
-		LOGERR("textHtmlToDoc:: error: non charset exception\n");
+                LOGERR("textHtmlToDoc:: error: non charset exception\n");
-		return false;
+                return false;
-	    }
+            }
-	}
+        }
    }
    m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
@@ -168,13 +168,13 @@ bool MimeHandlerHtml::next_document()
    // Avoid setting empty values which would crush ones possibly inherited
    // from parent (if we're an attachment)
    if (!result.dmtime.empty())
-	m_metaData[cstr_dj_keymd] = result.dmtime;
+        m_metaData[cstr_dj_keymd] = result.dmtime;
    m_metaData[cstr_dj_keymt] = cstr_textplain;
-    for (map<string,string>::const_iterator it = result.meta.begin(); 
+    for (const auto& entry : result.meta) {
-	 it != result.meta.end(); it++) {
+        if (!entry.second.empty()) {
-	if (!it->second.empty())
+            m_metaData[entry.first] = entry.second;
-	    m_metaData[it->first] = it->second;
+        }
    }
    return true;
 }
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -161,19 +161,19 @@ map<string, string> my_named_ents;
 class NamedEntsInitializer {
 public:
    NamedEntsInitializer()
-    {
+        {
-	for (int i = 0;;) {
+            for (int i = 0;;) {
-	    const char *ent;
+                const char *ent;
-	    const char *val;
+                const char *val;
-	    ent = epairs[i++];
+                ent = epairs[i++];
-	    if (ent == 0) 
+                if (ent == 0) 
-		break;
+                    break;
-	    val = epairs[i++];
+                val = epairs[i++];
-	    if (val == 0) 
+                if (val == 0) 
-		break;
+                    break;
-	    my_named_ents[string(ent)] = val;
+                my_named_ents[string(ent)] = val;
-	}
+            }
-    }
+        }
 };
 static NamedEntsInitializer namedEntsInitializerInstance;
@@ -198,58 +198,58 @@ void MyHtmlParser::decode_entities(string &s)
    // so don't do it. If charset known, caller has converted text to utf-8, 
    // and this is also how we translate entities
    //    if (tocharset != "utf-8")
-    //    	return;
+    //      return;
    // We need a const_iterator version of s.end() - otherwise the
    // find() and find_if() templates don't work...
    string::const_iterator amp = s.begin(), s_end = s.end();
    while ((amp = find(amp, s_end, '&')) != s_end) {
-	unsigned int val = 0;
+        unsigned int val = 0;
-	string::const_iterator end, p = amp + 1;
+        string::const_iterator end, p = amp + 1;
-	string subs;
+        string subs;
-	if (p != s_end && *p == '#') {
+        if (p != s_end && *p == '#') {
-	    p++;
+            p++;
-	    if (p != s_end && (*p == 'x' || *p == 'X')) {
+            if (p != s_end && (*p == 'x' || *p == 'X')) {
-		// hex
+                // hex
-		p++;
+                p++;
-		end = find_if(p, s_end, p_notxdigit);
+                end = find_if(p, s_end, p_notxdigit);
-		sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
+                sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
-	    } else {
+            } else {
-		// number
+                // number
-		end = find_if(p, s_end, p_notdigit);
+                end = find_if(p, s_end, p_notdigit);
-		val = atoi(s.substr(p - s.begin(), end - p).c_str());
+                val = atoi(s.substr(p - s.begin(), end - p).c_str());
-	    }
+            }
-	} else {
+        } else {
-	    end = find_if(p, s_end, p_notalnum);
+            end = find_if(p, s_end, p_notalnum);
-	    string code = s.substr(p - s.begin(), end - p);
+            string code = s.substr(p - s.begin(), end - p);
-	    map<string, string>::const_iterator i;
+            map<string, string>::const_iterator i;
-	    i = my_named_ents.find(code);
+            i = my_named_ents.find(code);
-	    if (i != my_named_ents.end()) 
+            if (i != my_named_ents.end()) 
-		subs = i->second;
+                subs = i->second;
-	}
+        }
-	if (end < s_end && *end == ';') 
+        if (end < s_end && *end == ';') 
-	    end++;
+            end++;
-	if (val) {
+        if (val) {
-	    // The code is the code position for a unicode char. We need
+            // The code is the code position for a unicode char. We need
-	    // to translate it to an utf-8 string.
+            // to translate it to an utf-8 string.
-	    string utf16be;
+            string utf16be;
-	    utf16be += char(val / 256);
+            utf16be += char(val / 256);
-	    utf16be += char(val % 256);
+            utf16be += char(val % 256);
-	    transcode(utf16be, subs, "UTF-16BE", "UTF-8");
+            transcode(utf16be, subs, "UTF-16BE", "UTF-8");
-	} 
+        } 
-	if (subs.length() > 0) {
+        if (subs.length() > 0) {
-	    string::size_type amp_pos = amp - s.begin();
+            string::size_type amp_pos = amp - s.begin();
-	    s.replace(amp_pos, end - amp, subs);
+            s.replace(amp_pos, end - amp, subs);
-	    s_end = s.end();
+            s_end = s.end();
-	    // We've modified the string, so the iterators are no longer
+            // We've modified the string, so the iterators are no longer
-	    // valid...
+            // valid...
-	    amp = s.begin() + amp_pos + subs.length();
+            amp = s.begin() + amp_pos + subs.length();
-	} else {
+        } else {
-	    amp = end;
+            amp = end;
-	}
+        }
    }
 }
@@ -265,35 +265,35 @@ MyHtmlParser::process_text(const string &text)
    CancelCheck::instance().checkCancel();
    if (!in_script_tag && !in_style_tag) {
-	if (in_title_tag) {
+        if (in_title_tag) {
-	    titledump += text;
+            titledump += text;
-	} else if (!in_pre_tag) {
+        } else if (!in_pre_tag) {
-	    string::size_type b = 0;
+            string::size_type b = 0;
-	    bool only_space = true;
+            bool only_space = true;
-	    while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
+            while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
-		only_space = false;
+                only_space = false;
-		// If space specifically needed or chunk begins with
+                // If space specifically needed or chunk begins with
-		// whitespace, add exactly one space
+                // whitespace, add exactly one space
-		if (pending_space || b != 0) {
+                if (pending_space || b != 0) {
-			dump += ' ';
+                    dump += ' ';
-		}
+                }
-		pending_space = true;
+                pending_space = true;
-		string::size_type e = text.find_first_of(WHITESPACE, b);
+                string::size_type e = text.find_first_of(WHITESPACE, b);
-		if (e == string::npos) {
+                if (e == string::npos) {
-		    dump += text.substr(b);
+                    dump += text.substr(b);
-		    pending_space = false;
+                    pending_space = false;
-		    break;
+                    break;
-		}
+                }
-		dump += text.substr(b, e - b);
+                dump += text.substr(b, e - b);
-		b = e + 1;
+                b = e + 1;
-	    }
+            }
-	    if (only_space)
+            if (only_space)
-		pending_space = true;
+                pending_space = true;
-	} else {
+        } else {
-	    if (pending_space)
+            if (pending_space)
-		dump += ' ';
+                dump += ' ';
-	    dump += text;
+            dump += text;
-	}
+        }
    }
 }
@@ -305,175 +305,186 @@ MyHtmlParser::opening_tag(const string &tag)
    cout << "TAG: " << tag << ": " << endl;
    map<string, string>::const_iterator x;
    for (x = p.begin(); x != p.end(); x++) {
-	cout << "  " << x->first << " -> '" << x->second << "'" << endl;
+        cout << "  " << x->first << " -> '" << x->second << "'" << endl;
    }
 #endif
    if (tag.empty()) return true;
    switch (tag[0]) {
-	case 'a':
+    case 'a':
-	    if (tag == "address") pending_space = true;
+        if (tag == "address") pending_space = true;
-	    break;
+        break;
-	case 'b':
+    case 'b':
-	    // body: some bad docs have several opening body tags and
+        // body: some bad docs have several opening body tags and
-	    // even text before the body is displayed by Opera and
+        // even text before the body is displayed by Opera and
-	    // Firefox.  We used to reset the dump each time we saw a
+        // Firefox.  We used to reset the dump each time we saw a
-	    // body tag, but I can't see any reason to do so.
+        // body tag, but I can't see any reason to do so.
-	    if (tag == "blockquote" || tag == "br") {
+        if (tag == "blockquote" || tag == "br") {
-		dump += '\n';
+            dump += '\n';
-		pending_space = true;
+            pending_space = true;
-	    }
+        }
-	    break;
+        break;
-	case 'c':
+    case 'c':
-	    if (tag == "center") pending_space = true;
+        if (tag == "center") pending_space = true;
-	    break;
+        break;
-	case 'd':
+    case 'd':
-	    if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
+        if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
-		tag == "dt") pending_space = true;
+            tag == "dt") pending_space = true;
-	    if (tag == "dt")
+        if (tag == "dt")
-		dump += '\n';
+            dump += '\n';
-	    break;
+        break;
-	case 'e':
+    case 'e':
-	    if (tag == "embed") pending_space = true;
+        if (tag == "embed") pending_space = true;
-	    break;
+        break;
-	case 'f':
+    case 'f':
-	    if (tag == "fieldset" || tag == "form") pending_space = true;
+        if (tag == "fieldset" || tag == "form") pending_space = true;
-	    break;
+        break;
-	case 'h':
+    case 'h':
-	    // hr, and h1, ..., h6
+        // hr, and h1, ..., h6
-	    if (tag.length() == 2 && strchr("r123456", tag[1])) {
+        if (tag.length() == 2 && strchr("r123456", tag[1])) {
-		dump += '\n';
+            dump += '\n';
-		pending_space = true;
+            pending_space = true;
-	    }
+        }
-	    break;
+        break;
-	case 'i':
+    case 'i':
-	    if (tag == "iframe" || tag == "img" || tag == "isindex" ||
+        if (tag == "iframe" || tag == "img" || tag == "isindex" ||
-		tag == "input") pending_space = true;
+            tag == "input") pending_space = true;
-	    break;
+        break;
-	case 'k':
+    case 'k':
-	    if (tag == "keygen") pending_space = true;
+        if (tag == "keygen") pending_space = true;
-	    break;
+        break;
-	case 'l':
+    case 'l':
-	    if (tag == "legend" || tag == "li" || tag == "listing") {
+        if (tag == "legend" || tag == "li" || tag == "listing") {
-		dump += '\n';
+            dump += '\n';
-		pending_space = true;
+            pending_space = true;
-	    }
+        }
-	    break;
+        break;
-	case 'm':
+    case 'm':
-	    if (tag == "meta") {
+        if (tag == "meta") {
-		string content;
+            string content;
-		if (get_parameter(cstr_html_content, content)) {
+            if (get_parameter(cstr_html_content, content)) {
-		    string name;
+                string name;
-		    if (get_parameter("name", name)) {
+                if (get_parameter("name", name)) {
-			lowercase_term(name);
+                    lowercase_term(name);
-			if (name == "date") {
+                    if (name == "date") {
-			    // Specific to Recoll filters.
+                        // Specific to Recoll filters.
-			    decode_entities(content);
+                        decode_entities(content);
-			    struct tm tm;
+                        struct tm tm;
-                            memset(&tm, 0, sizeof(tm));
+                        memset(&tm, 0, sizeof(tm));
-			    if (strptime(content.c_str(), 
+                        if (strptime(content.c_str(), 
-					 " %Y-%m-%d %H:%M:%S ", &tm) ||
+                                     " %Y-%m-%d %H:%M:%S ", &tm) ||
-				strptime(content.c_str(), 
+                            strptime(content.c_str(), 
-					 "%Y-%m-%dT%H:%M:%S", &tm)
+                                     "%Y-%m-%dT%H:%M:%S", &tm)
-				) {
+                            ) {
-				char ascuxtime[100];
+                            char ascuxtime[100];
-				sprintf(ascuxtime, "%ld", (long)mktime(&tm));
+                            sprintf(ascuxtime, "%ld", (long)mktime(&tm));
-				dmtime = ascuxtime;
+                            dmtime = ascuxtime;
-			    }
+                        }
-			} else if (name == "robots") {
+                    } else if (name == "robots") {
-			} else {
+                    } else {
-			    string markup;
+                        string markup;
-			    bool ishtml = false;
+                        bool ishtml = false;
-			    if (get_parameter("markup", markup)) {
+                        if (get_parameter("markup", markup)) {
-				if (!stringlowercmp("html", markup)) {
+                            if (!stringlowercmp("html", markup)) {
-				    ishtml = true;
+                                ishtml = true;
-				}
+                            }
-			    }
+                        }
-			    if (!meta[name].empty())
+                        decode_entities(content);
-				meta[name] += ' ';
+                        // Set metadata field, avoid appending
-			    decode_entities(content);
+                        // multiple identical instances.
-			    meta[name] += content;
+                        auto it = meta.find(name);
-			    if (ishtml && 
+                        if (it == meta.end() || it->second.find(content) ==
-				meta[name].compare(0, cstr_fldhtm.size(), 
+                            string::npos) {
-						   cstr_fldhtm)) {
+                            if (it != meta.end()) {
-				meta[name].insert(0, cstr_fldhtm);
+                                it->second += ' ';
-			    }
+                                it->second += content;
-			}
+                            } else {
-		    } 
+                                meta[name] = content;
-		    string hdr;
+                            }
-		    if (get_parameter("http-equiv", hdr)) {
+                        }
-			lowercase_term(hdr);
+                        if (ishtml && 
-			if (hdr == "content-type") {
+                            meta[name].compare(0, cstr_fldhtm.size(),
-			    MimeHeaderValue p;
+                                               cstr_fldhtm)) {
-			    parseMimeHeaderValue(content, p);
+                            meta[name].insert(0, cstr_fldhtm);
-			    map<string, string>::const_iterator k;
+                        }
-			    if ((k = p.params.find(cstr_html_charset)) != 
+                    }
-				p.params.end()) {
+                } 
-				charset = k->second;
+                string hdr;
-				if (!charset.empty() && 
+                if (get_parameter("http-equiv", hdr)) {
-				    !samecharset(charset, fromcharset)) {
+                    lowercase_term(hdr);
-				    LOGDEB1("Doc http-equiv charset '"  << (charset) << "' differs from dir deflt '"  << (fromcharset) << "'\n" );
+                    if (hdr == "content-type") {
-				    throw false;
+                        MimeHeaderValue p;
-				}
+                        parseMimeHeaderValue(content, p);
-			    }
+                        map<string, string>::const_iterator k;
-			}
+                        if ((k = p.params.find(cstr_html_charset)) != 
-		    }
+                            p.params.end()) {
-		}
+                            charset = k->second;
-		string newcharset;
+                            if (!charset.empty() && 
-		if (get_parameter(cstr_html_charset, newcharset)) {
+                                !samecharset(charset, fromcharset)) {
-		    // HTML5 added: <meta charset="...">
+                                LOGDEB1("Doc http-equiv charset '" << charset <<
-		    lowercase_term(newcharset);
+                                        "' differs from dir deflt '" <<
-		    charset = newcharset;
+                                        fromcharset << "'\n");
-		    if (!charset.empty() && 
+                                throw false;
-			!samecharset(charset, fromcharset)) {
+                            }
-			LOGDEB1("Doc html5 charset '"  << (charset) << "' differs from dir deflt '"  << (fromcharset) << "'\n" );
+                        }
-			throw false;
+                    }
-		    }
+                }
-		}
+            }
-		break;
+            string newcharset;
-	    } else if (tag == "marquee" || tag == "menu" || tag == "multicol")
+            if (get_parameter(cstr_html_charset, newcharset)) {
-		pending_space = true;
+                // HTML5 added: <meta charset="...">
-	    break;
+                lowercase_term(newcharset);
-	case 'o':
+                charset = newcharset;
-	    if (tag == "ol" || tag == "option") pending_space = true;
+                if (!charset.empty() && 
-	    break;
+                    !samecharset(charset, fromcharset)) {
-	case 'p':
+                    LOGDEB1("Doc html5 charset '"  << (charset) << "' differs from dir deflt '"  << (fromcharset) << "'\n" );
-	    if (tag == "p" || tag == "plaintext") {
+                    throw false;
-		dump += '\n';
+                }
-		pending_space = true;
+            }
-	    } else if (tag == "pre") {
+            break;
-		in_pre_tag = true;
+        } else if (tag == "marquee" || tag == "menu" || tag == "multicol")
-		dump += '\n';
+            pending_space = true;
-		pending_space = true;
+        break;
-	    }
+    case 'o':
-	    break;
+        if (tag == "ol" || tag == "option") pending_space = true;
-	case 'q':
+        break;
-	    if (tag == "q") pending_space = true;
+    case 'p':
-	    break;
+        if (tag == "p" || tag == "plaintext") {
-	case 's':
+            dump += '\n';
-	    if (tag == "style") {
+            pending_space = true;
-		in_style_tag = true;
+        } else if (tag == "pre") {
-		break;
+            in_pre_tag = true;
-	    } else if (tag == "script") {
+            dump += '\n';
-		in_script_tag = true;
+            pending_space = true;
-		break;
+        }
-	    } else if (tag == "select") 
+        break;
-		pending_space = true;
+    case 'q':
-	    break;
+        if (tag == "q") pending_space = true;
-	case 't':
+        break;
-	    if (tag == "table" || tag == "td" || tag == "textarea" ||
+    case 's':
-		tag == "th") {
+        if (tag == "style") {
-		pending_space = true;
+            in_style_tag = true;
-	    } else if (tag == "title") {
+            break;
-		in_title_tag = true;
+        } else if (tag == "script") {
-	    }
+            in_script_tag = true;
-	    break;
+            break;
-	case 'u':
+        } else if (tag == "select") 
-	    if (tag == "ul") pending_space = true;
+            pending_space = true;
-	    break;
+        break;
-	case 'x':
+    case 't':
-	    if (tag == "xmp") pending_space = true;
+        if (tag == "table" || tag == "td" || tag == "textarea" ||
-	    break;
+            tag == "th") {
+            pending_space = true;
+        } else if (tag == "title") {
+            in_title_tag = true;
+        }
+        break;
+    case 'u':
+        if (tag == "ul") pending_space = true;
+        break;
+    case 'x':
+        if (tag == "xmp") pending_space = true;
+        break;
    }
    return true;
 }
@ -484,85 +495,85 @@ MyHtmlParser::closing_tag(const string &tag)
    LOGDEB2("closing_tag: ["  << (tag) << "]\n" );
    if (tag.empty()) return true;
    switch (tag[0]) {
-	case 'a':
+    case 'a':
-	    if (tag == "address") pending_space = true;
+        if (tag == "address") pending_space = true;
-	    break;
+        break;
-	case 'b':
+    case 'b':
-	    // body: We used to signal and end of doc here by returning
+        // body: We used to signal and end of doc here by returning
-	    // false but the browsers just ignore body and html
+        // false but the browsers just ignore body and html
-	    // closing tags if there is further text, so it seems right
+        // closing tags if there is further text, so it seems right
-	    // to do the same
+        // to do the same
-	    if (tag == "blockquote" || tag == "br") pending_space = true;
+        if (tag == "blockquote" || tag == "br") pending_space = true;
-	    break;
+        break;
-	case 'c':
+    case 'c':
-	    if (tag == "center") pending_space = true;
+        if (tag == "center") pending_space = true;
-	    break;
+        break;
-	case 'd':
+    case 'd':
-	    if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
+        if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
-		tag == "dt") pending_space = true;
+            tag == "dt") pending_space = true;
-	    break;
+        break;
-	case 'f':
+    case 'f':
-	    if (tag == "fieldset" || tag == "form") pending_space = true;
+        if (tag == "fieldset" || tag == "form") pending_space = true;
-	    break;
+        break;
-	case 'h':
+    case 'h':
-	    // hr, and h1, ..., h6
+        // hr, and h1, ..., h6
-	    if (tag.length() == 2 && strchr("r123456", tag[1]))
+        if (tag.length() == 2 && strchr("r123456", tag[1]))
-		pending_space = true;
+            pending_space = true;
-	    break;
+        break;
-	case 'i':
+    case 'i':
-	    if (tag == "iframe") pending_space = true;
+        if (tag == "iframe") pending_space = true;
-	    break;
+        break;
-	case 'l':
+    case 'l':
-	    if (tag == "legend" || tag == "li" || tag == "listing")
+        if (tag == "legend" || tag == "li" || tag == "listing")
-		pending_space = true;
+            pending_space = true;
-	    break;
+        break;
-	case 'm':
+    case 'm':
-	    if (tag == "marquee" || tag == "menu") pending_space = true;
+        if (tag == "marquee" || tag == "menu") pending_space = true;
-	    break;
+        break;
-	case 'o':
+    case 'o':
-	    if (tag == "ol" || tag == "option") pending_space = true;
+        if (tag == "ol" || tag == "option") pending_space = true;
-	    break;
+        break;
-	case 'p':
+    case 'p':
-	    if (tag == "p") {
+        if (tag == "p") {
-		pending_space = true;
+            pending_space = true;
-	    } else if  (tag == "pre") {
+        } else if  (tag == "pre") {
-		pending_space = true;
+            pending_space = true;
-		in_pre_tag = false;
+            in_pre_tag = false;
-	    }
+        }
-	    break;
+        break;
-	case 'q':
+    case 'q':
-	    if (tag == "q") pending_space = true;
+        if (tag == "q") pending_space = true;
-	    break;
+        break;
-	case 's':
+    case 's':
-	    if (tag == "style") {
+        if (tag == "style") {
-		in_style_tag = false;
+            in_style_tag = false;
-		break;
+            break;
-	    }
+        }
-	    if (tag == "script") {
+        if (tag == "script") {
-		in_script_tag = false;
+            in_script_tag = false;
-		break;
+            break;
-	    }
+        }
-	    if (tag == "select") pending_space = true;
+        if (tag == "select") pending_space = true;
-	    break;
+        break;
-	case 't':
+    case 't':
-	    if (tag == "title") {
+        if (tag == "title") {
-		in_title_tag = false;
+            in_title_tag = false;
-		if (meta.find("title") == meta.end()|| meta["title"].empty()) {
+            if (meta.find("title") == meta.end()|| meta["title"].empty()) {
-		    meta["title"] = titledump;
+                meta["title"] = titledump;
-		    titledump.clear();
+                titledump.clear();
-		}
+            }
-		break;
+            break;
-	    }
+        }
-	    if (tag == "table" || tag == "td" || tag == "textarea" ||
+        if (tag == "table" || tag == "td" || tag == "textarea" ||
-		tag == "th") pending_space = true;
+            tag == "th") pending_space = true;
-	    break;
+        break;
-	case 'u':
+    case 'u':
-	    if (tag == "ul") pending_space = true;
+        if (tag == "ul") pending_space = true;
-	    break;
+        break;
-	case 'x':
+    case 'x':
-	    if (tag == "xmp") pending_space = true;
+        if (tag == "xmp") pending_space = true;
-	    break;
+        break;
    }
    return true;
 }