From f897f087aa0f530d564e6f3f042af19d2fe943ab Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 12 Jan 2013 14:06:40 +0100 Subject: [PATCH] HTML: do not concatenate text found before body tag with the title. Fixes issue #125 --- src/internfile/myhtmlparse.cpp | 26 ++++++++++++++++++++------ src/internfile/myhtmlparse.h | 3 ++- tests/html/html.txt | 2 +- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index b47fa260..d449f9c1 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -177,6 +177,7 @@ MyHtmlParser::MyHtmlParser() : in_script_tag(false), in_style_tag(false), in_pre_tag(false), + in_title_tag(false), pending_space(false), indexing_allowed(true) { @@ -256,12 +257,20 @@ void MyHtmlParser::decode_entities(string &s) void MyHtmlParser::process_text(const string &text) { - LOGDEB2(("process_text: pending_space %d txt [%s]\n", pending_space, - text.c_str())); + LOGDEB2(("process_text: title %d script %d style %d pre %d " + "pending_space %d txt [%s]\n", + in_title_tag, + in_script_tag, + in_style_tag, + in_pre_tag, + pending_space, + text.c_str())); CancelCheck::instance().checkCancel(); if (!in_script_tag && !in_style_tag) { - if (!in_pre_tag) { + if (in_title_tag) { + titledump += text; + } else if (!in_pre_tag) { string::size_type b = 0; bool only_space = true; while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) { @@ -461,7 +470,11 @@ MyHtmlParser::opening_tag(const string &tag) break; case 't': if (tag == "table" || tag == "td" || tag == "textarea" || - tag == "th") pending_space = true; + tag == "th") { + pending_space = true; + } else if (tag == "title") { + in_title_tag = true; + } break; case 'u': if (tag == "ul") pending_space = true; @@ -542,9 +555,10 @@ MyHtmlParser::closing_tag(const string &tag) break; case 't': if (tag == "title") { + in_title_tag = false; if (meta.find("title") == meta.end()|| meta["title"].empty()) { - meta["title"] = dump; - dump.clear(); + meta["title"] = titledump; + titledump.clear(); } break; } diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 38b3e8f6..eba0785e 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -37,9 +37,10 @@ class MyHtmlParser : public HtmlParser { bool in_script_tag; bool in_style_tag; bool in_pre_tag; + bool in_title_tag; bool pending_space; map meta; - string dump, dmtime; + string dump, dmtime, titledump; // This is the charset our caller thinks the doc used (initially // comes from the environment/configuration, used as source for // conversion to utf-8) diff --git a/tests/html/html.txt b/tests/html/html.txt index cdf4c9f4..e0b8e113 100644 --- a/tests/html/html.txt +++ b/tests/html/html.txt @@ -6,7 +6,7 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/qtextedit.html] text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanualSmall.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 1480354 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanual.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 8136414 bytes 1 results -text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] [bla bla "bad quotes] 236 bytes +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] ["bad quotes] 236 bytes 1 results text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes testfield = testfieldvalue