HTML: do not concatenate text found before the body tag with the title. Fixes issue #125.

This commit is contained in:
Jean-Francois Dockes 2013-01-12 14:06:40 +01:00
parent 8dc1177fd4
commit f897f087aa
3 changed files with 23 additions and 8 deletions

View File

@ -177,6 +177,7 @@ MyHtmlParser::MyHtmlParser()
: in_script_tag(false), : in_script_tag(false),
in_style_tag(false), in_style_tag(false),
in_pre_tag(false), in_pre_tag(false),
in_title_tag(false),
pending_space(false), pending_space(false),
indexing_allowed(true) indexing_allowed(true)
{ {
@ -256,12 +257,20 @@ void MyHtmlParser::decode_entities(string &s)
void void
MyHtmlParser::process_text(const string &text) MyHtmlParser::process_text(const string &text)
{ {
LOGDEB2(("process_text: pending_space %d txt [%s]\n", pending_space, LOGDEB2(("process_text: title %d script %d style %d pre %d "
text.c_str())); "pending_space %d txt [%s]\n",
in_title_tag,
in_script_tag,
in_style_tag,
in_pre_tag,
pending_space,
text.c_str()));
CancelCheck::instance().checkCancel(); CancelCheck::instance().checkCancel();
if (!in_script_tag && !in_style_tag) { if (!in_script_tag && !in_style_tag) {
if (!in_pre_tag) { if (in_title_tag) {
titledump += text;
} else if (!in_pre_tag) {
string::size_type b = 0; string::size_type b = 0;
bool only_space = true; bool only_space = true;
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) { while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
@ -461,7 +470,11 @@ MyHtmlParser::opening_tag(const string &tag)
break; break;
case 't': case 't':
if (tag == "table" || tag == "td" || tag == "textarea" || if (tag == "table" || tag == "td" || tag == "textarea" ||
tag == "th") pending_space = true; tag == "th") {
pending_space = true;
} else if (tag == "title") {
in_title_tag = true;
}
break; break;
case 'u': case 'u':
if (tag == "ul") pending_space = true; if (tag == "ul") pending_space = true;
@ -542,9 +555,10 @@ MyHtmlParser::closing_tag(const string &tag)
break; break;
case 't': case 't':
if (tag == "title") { if (tag == "title") {
in_title_tag = false;
if (meta.find("title") == meta.end()|| meta["title"].empty()) { if (meta.find("title") == meta.end()|| meta["title"].empty()) {
meta["title"] = dump; meta["title"] = titledump;
dump.clear(); titledump.clear();
} }
break; break;
} }

View File

@ -37,9 +37,10 @@ class MyHtmlParser : public HtmlParser {
bool in_script_tag; bool in_script_tag;
bool in_style_tag; bool in_style_tag;
bool in_pre_tag; bool in_pre_tag;
bool in_title_tag;
bool pending_space; bool pending_space;
map<string,string> meta; map<string,string> meta;
string dump, dmtime; string dump, dmtime, titledump;
// This is the charset our caller thinks the doc used (initially // This is the charset our caller thinks the doc used (initially
// comes from the environment/configuration, used as source for // comes from the environment/configuration, used as source for
// conversion to utf-8) // conversion to utf-8)

View File

@ -6,7 +6,7 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/qtextedit.html]
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanualSmall.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 1480354 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanualSmall.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 1480354 bytes
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanual.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 8136414 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanual.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 8136414 bytes
1 results 1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] [bla bla "bad quotes] 236 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] ["bad quotes] 236 bytes
1 results 1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes
testfield = testfieldvalue testfield = testfieldvalue