From 17f8b652d4ed75598e235660f2fc675edc0c8188 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 25 Oct 2012 14:22:20 +0200 Subject: [PATCH] Support explicit HTML markup in fields when the markup="html" attribute is present --- src/common/cstr.h | 3 ++- src/internfile/myhtmlparse.cpp | 26 +++++++++++++++++--------- src/query/reslistpager.cpp | 16 ++++++++++++---- website/index.html.en | 16 ++++++++++++++++ website/index.html.fr | 19 +++++++++++++++++++ 5 files changed, 66 insertions(+), 14 deletions(-) diff --git a/src/common/cstr.h b/src/common/cstr.h index b2d18ede..4bbf1bd0 100644 --- a/src/common/cstr.h +++ b/src/common/cstr.h @@ -56,7 +56,8 @@ DEF_CSTR(null, ""); DEF_CSTR(plus, "+"); DEF_CSTR(textplain, "text/plain"); DEF_CSTR(url, "url"); - +// Marker for HTML format fields +DEF_CSTR(fldhtm, "\007"); // Values used as keys inside Dijon::Filter::metaData[]. This structure is // used to store all data generated by format-translating filters. It is diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index d34d9687..37a822f0 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -360,9 +360,7 @@ MyHtmlParser::opening_tag(const string &tag) if (get_parameter("name", name)) { lowercase_term(name); if (name == "date") { - // Yes this doesnt exist. It's output by filters - // And the format isn't even standard http/html - // FIXME + // Specific to Recoll filters. 
decode_entities(content); struct tm tm; if (strptime(content.c_str(), @@ -376,10 +374,22 @@ MyHtmlParser::opening_tag(const string &tag) } } else if (name == "robots") { } else { + string markup; + bool ishtml = false; + if (get_parameter("markup", markup)) { + if (!stringlowercmp("html", markup)) { + ishtml = true; + } + } if (!meta[name].empty()) meta[name] += ' '; decode_entities(content); meta[name] += content; + if (ishtml && + meta[name].compare(0, cstr_fldhtm.size(), + cstr_fldhtm)) { + meta[name].insert(0, cstr_fldhtm); + } } } string hdr; @@ -417,8 +427,7 @@ MyHtmlParser::opening_tag(const string &tag) } } break; - } - if (tag == "marquee" || tag == "menu" || tag == "multicol") + } else if (tag == "marquee" || tag == "menu" || tag == "multicol") pending_space = true; break; case 'o': @@ -441,12 +450,11 @@ MyHtmlParser::opening_tag(const string &tag) if (tag == "style") { in_style_tag = true; break; - } - if (tag == "script") { + } else if (tag == "script") { in_script_tag = true; break; - } - if (tag == "select") pending_space = true; + } else if (tag == "select") + pending_space = true; break; case 't': if (tag == "table" || tag == "td" || tag == "textarea" || diff --git a/src/query/reslistpager.cpp b/src/query/reslistpager.cpp index 2ddec757..71e134df 100644 --- a/src/query/reslistpager.cpp +++ b/src/query/reslistpager.cpp @@ -110,6 +110,14 @@ void ResListPager::resultPageNext() m_resultsInCurrentPage = pagelen; m_respage = npage; } +static string maybeEscapeHtml(const string& fld) +{ + if (fld.compare(0, cstr_fldhtm.size(), cstr_fldhtm)) + return escapeHtml(fld); + else + return fld.substr(cstr_fldhtm.size()); +} + void ResListPager::resultPageFor(int docnum) { @@ -263,21 +271,21 @@ void ResListPager::displayDoc(RclConfig *config, int i, Rcl::Doc& doc, subs["I"] = iconurl; subs["i"] = doc.ipath; subs["K"] = !doc.meta[Rcl::Doc::keykw].empty() ? 
- string("[") + escapeHtml(doc.meta[Rcl::Doc::keykw]) + "]" : ""; + string("[") + maybeEscapeHtml(doc.meta[Rcl::Doc::keykw]) + "]" : ""; subs["L"] = linksbuf.str(); subs["N"] = numbuf; subs["M"] = doc.mimetype; subs["R"] = doc.meta[Rcl::Doc::keyrr]; subs["S"] = sizebuf; - subs["T"] = escapeHtml(titleOrFilename); - subs["t"] = escapeHtml(doc.meta[Rcl::Doc::keytt]); + subs["T"] = maybeEscapeHtml(titleOrFilename); + subs["t"] = maybeEscapeHtml(doc.meta[Rcl::Doc::keytt]); subs["U"] = url; // Let %(xx) access all metadata. HTML-neuter everything: for (map::iterator it = doc.meta.begin(); it != doc.meta.end(); it++) { if (!it->first.empty()) - subs[it->first] = escapeHtml(it->second); + subs[it->first] = maybeEscapeHtml(it->second); } string formatted; diff --git a/website/index.html.en b/website/index.html.en index b091abd6..5563a018 100644 --- a/website/index.html.en +++ b/website/index.html.en @@ -84,6 +84,22 @@

News

    +
  • 2012-10-25: a problem with a simple workaround has caused + several reported recollindex + crashes recently. If you store and index + Mozilla/Thunderbird email out of the standard location + (~/.thunderbird), you should add the following at the end of + your configuration file (e.g.: + ~/.recoll/recoll.conf):
    
    +              [/path/to/my/mozilla/mail]
    +              mhmboxquirks = tbird
    +          
    Adjust the path to your local value of course... + Without this hint, recollindex has trouble finding the + message delimiters inside the folder files, and will + possibly use all the computer's memory and crash. Apart from + crashes, which only occur for very big folders, this also + causes incorrect mail indexing. +
  • 2012-10-19: the source for recoll 1.18.001 is available, and this is a call to volunteers to test it. There are binary diff --git a/website/index.html.fr b/website/index.html.fr index a8eb5726..7196f63f 100644 --- a/website/index.html.fr +++ b/website/index.html.fr @@ -100,6 +100,25 @@

    Nouvelles:

      +
    • 2012-10-25: Un problème avec une solution simple peut provoquer + des plantages de + recollindex. + Si vous indexez des messages mail Mozilla/Thunderbird + ailleurs qu'à l'endroit standard (~/.thunderbird), vous + devriez ajouter les lignes qui suivent à la fin de votre + fichier de configuration (~/.recoll/recoll.conf): +
      
      +              [/path/to/my/mozilla/mail]
      +              mhmboxquirks = tbird
      +          
      Changez le chemin d'accès pour le vôtre bien + sûr. Sans cette indication, recollindex a des difficultés à + déterminer les limites de message dans les fichiers mailbox, + et peut arriver à utiliser toute la mémoire de la machine, + et à se planter. Dans les cas moins graves (avec des + fichiers de taille "raisonnable"), cela provoque aussi une + indexation incorrecte des messages. +
    • +
    • 2012-10-16: un nouveau filtre pour les documents EPUB.
    • 2012-05-24: Sortie de la