diff --git a/src/common/cstr.h b/src/common/cstr.h index b2d18ede..4bbf1bd0 100644 --- a/src/common/cstr.h +++ b/src/common/cstr.h @@ -56,7 +56,8 @@ DEF_CSTR(null, ""); DEF_CSTR(plus, "+"); DEF_CSTR(textplain, "text/plain"); DEF_CSTR(url, "url"); - +// Marker for HTML format fields +DEF_CSTR(fldhtm, "\007"); // Values used as keys inside Dijon::Filter::metaData[]. This structure is // used to store all data generated by format-translating filters. It is diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index d34d9687..37a822f0 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -360,9 +360,7 @@ MyHtmlParser::opening_tag(const string &tag) if (get_parameter("name", name)) { lowercase_term(name); if (name == "date") { - // Yes this doesnt exist. It's output by filters - // And the format isn't even standard http/html - // FIXME + // Specific to Recoll filters. decode_entities(content); struct tm tm; if (strptime(content.c_str(), @@ -376,10 +374,22 @@ MyHtmlParser::opening_tag(const string &tag) } } else if (name == "robots") { } else { + string markup; + bool ishtml = false; + if (get_parameter("markup", markup)) { + if (!stringlowercmp("html", markup)) { + ishtml = true; + } + } if (!meta[name].empty()) meta[name] += ' '; decode_entities(content); meta[name] += content; + if (ishtml && + meta[name].compare(0, cstr_fldhtm.size(), + cstr_fldhtm)) { + meta[name].insert(0, cstr_fldhtm); + } } } string hdr; @@ -417,8 +427,7 @@ MyHtmlParser::opening_tag(const string &tag) } } break; - } - if (tag == "marquee" || tag == "menu" || tag == "multicol") + } else if (tag == "marquee" || tag == "menu" || tag == "multicol") pending_space = true; break; case 'o': @@ -441,12 +450,11 @@ MyHtmlParser::opening_tag(const string &tag) if (tag == "style") { in_style_tag = true; break; - } - if (tag == "script") { + } else if (tag == "script") { in_script_tag = true; break; - } - if (tag == "select") pending_space = true; + } else if (tag == "select") + pending_space = true; break; case 't': if (tag == "table" || tag == "td" || tag == "textarea" || diff --git a/src/query/reslistpager.cpp b/src/query/reslistpager.cpp index 2ddec757..71e134df 100644 --- a/src/query/reslistpager.cpp +++ b/src/query/reslistpager.cpp @@ -110,6 +110,14 @@ void ResListPager::resultPageNext() m_resultsInCurrentPage = pagelen; m_respage = npage; } +static string maybeEscapeHtml(const string& fld) +{ + if (fld.compare(0, cstr_fldhtm.size(), cstr_fldhtm)) + return escapeHtml(fld); + else + return fld.substr(cstr_fldhtm.size()); +} + void ResListPager::resultPageFor(int docnum) { @@ -263,21 +271,21 @@ void ResListPager::displayDoc(RclConfig *config, int i, Rcl::Doc& doc, subs["I"] = iconurl; subs["i"] = doc.ipath; subs["K"] = !doc.meta[Rcl::Doc::keykw].empty() ? - string("[") + escapeHtml(doc.meta[Rcl::Doc::keykw]) + "]" : ""; + string("[") + maybeEscapeHtml(doc.meta[Rcl::Doc::keykw]) + "]" : ""; subs["L"] = linksbuf.str(); subs["N"] = numbuf; subs["M"] = doc.mimetype; subs["R"] = doc.meta[Rcl::Doc::keyrr]; subs["S"] = sizebuf; - subs["T"] = escapeHtml(titleOrFilename); - subs["t"] = escapeHtml(doc.meta[Rcl::Doc::keytt]); + subs["T"] = maybeEscapeHtml(titleOrFilename); + subs["t"] = maybeEscapeHtml(doc.meta[Rcl::Doc::keytt]); subs["U"] = url; // Let %(xx) access all metadata. HTML-neuter everything: for (map::iterator it = doc.meta.begin(); it != doc.meta.end(); it++) { if (!it->first.empty()) - subs[it->first] = escapeHtml(it->second); + subs[it->first] = maybeEscapeHtml(it->second); } string formatted; diff --git a/website/index.html.en b/website/index.html.en index b091abd6..5563a018 100644 --- a/website/index.html.en +++ b/website/index.html.en @@ -84,6 +84,22 @@

News