HTML: do not concatenate text found before body tag with the title. Fixes issue #125
This commit is contained in:
parent
8dc1177fd4
commit
f897f087aa
@ -177,6 +177,7 @@ MyHtmlParser::MyHtmlParser()
|
|||||||
: in_script_tag(false),
|
: in_script_tag(false),
|
||||||
in_style_tag(false),
|
in_style_tag(false),
|
||||||
in_pre_tag(false),
|
in_pre_tag(false),
|
||||||
|
in_title_tag(false),
|
||||||
pending_space(false),
|
pending_space(false),
|
||||||
indexing_allowed(true)
|
indexing_allowed(true)
|
||||||
{
|
{
|
||||||
@ -256,12 +257,20 @@ void MyHtmlParser::decode_entities(string &s)
|
|||||||
void
|
void
|
||||||
MyHtmlParser::process_text(const string &text)
|
MyHtmlParser::process_text(const string &text)
|
||||||
{
|
{
|
||||||
LOGDEB2(("process_text: pending_space %d txt [%s]\n", pending_space,
|
LOGDEB2(("process_text: title %d script %d style %d pre %d "
|
||||||
text.c_str()));
|
"pending_space %d txt [%s]\n",
|
||||||
|
in_title_tag,
|
||||||
|
in_script_tag,
|
||||||
|
in_style_tag,
|
||||||
|
in_pre_tag,
|
||||||
|
pending_space,
|
||||||
|
text.c_str()));
|
||||||
CancelCheck::instance().checkCancel();
|
CancelCheck::instance().checkCancel();
|
||||||
|
|
||||||
if (!in_script_tag && !in_style_tag) {
|
if (!in_script_tag && !in_style_tag) {
|
||||||
if (!in_pre_tag) {
|
if (in_title_tag) {
|
||||||
|
titledump += text;
|
||||||
|
} else if (!in_pre_tag) {
|
||||||
string::size_type b = 0;
|
string::size_type b = 0;
|
||||||
bool only_space = true;
|
bool only_space = true;
|
||||||
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
||||||
@ -461,7 +470,11 @@ MyHtmlParser::opening_tag(const string &tag)
|
|||||||
break;
|
break;
|
||||||
case 't':
|
case 't':
|
||||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||||
tag == "th") pending_space = true;
|
tag == "th") {
|
||||||
|
pending_space = true;
|
||||||
|
} else if (tag == "title") {
|
||||||
|
in_title_tag = true;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 'u':
|
case 'u':
|
||||||
if (tag == "ul") pending_space = true;
|
if (tag == "ul") pending_space = true;
|
||||||
@ -542,9 +555,10 @@ MyHtmlParser::closing_tag(const string &tag)
|
|||||||
break;
|
break;
|
||||||
case 't':
|
case 't':
|
||||||
if (tag == "title") {
|
if (tag == "title") {
|
||||||
|
in_title_tag = false;
|
||||||
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
||||||
meta["title"] = dump;
|
meta["title"] = titledump;
|
||||||
dump.clear();
|
titledump.clear();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -37,9 +37,10 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
bool in_script_tag;
|
bool in_script_tag;
|
||||||
bool in_style_tag;
|
bool in_style_tag;
|
||||||
bool in_pre_tag;
|
bool in_pre_tag;
|
||||||
|
bool in_title_tag;
|
||||||
bool pending_space;
|
bool pending_space;
|
||||||
map<string,string> meta;
|
map<string,string> meta;
|
||||||
string dump, dmtime;
|
string dump, dmtime, titledump;
|
||||||
// This is the charset our caller thinks the doc used (initially
|
// This is the charset our caller thinks the doc used (initially
|
||||||
// comes from the environment/configuration, used as source for
|
// comes from the environment/configuration, used as source for
|
||||||
// conversion to utf-8)
|
// conversion to utf-8)
|
||||||
|
|||||||
@ -6,7 +6,7 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/qtextedit.html]
|
|||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanualSmall.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 1480354 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanualSmall.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 1480354 bytes
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanual.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 8136414 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/mysqlmanual.html] [MySQL 3.23, 4.0, 4.1 Reference Manual] 8136414 bytes
|
||||||
1 results
|
1 results
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] [bla bla "bad quotes] 236 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] ["bad quotes] 236 bytes
|
||||||
1 results
|
1 results
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes
|
||||||
testfield = testfieldvalue
|
testfield = testfieldvalue
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user