htmlparse: merged some updates from xapian 1.2.6

This commit is contained in:
Jean-Francois Dockes 2011-06-24 10:41:54 +02:00
parent 8fe524bd7f
commit c7a241d26e
4 changed files with 101 additions and 64 deletions

View File

@ -1,10 +1,10 @@
/* This file was copied/updated from xapian-omega-1.0.1 and modified */ /* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
/* htmlparse.cc: simple HTML parser for omega indexer /* htmlparse.cc: simple HTML parser for omega indexer
* *
* Copyright 1999,2000,2001 BrightStation PLC * Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2001 Ananova Ltd * Copyright 2001 Ananova Ltd
* Copyright 2002,2006 Olly Betts * Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as * modify it under the terms of the GNU General Public License as
@ -30,6 +30,14 @@ using std::find_if;
#include <ctype.h> #include <ctype.h>
#include <cstring> #include <cstring>
// Convert every character of str to lower case, in place.
// Each char goes through unsigned char before tolower() so that bytes
// with the high bit set are never passed as negative values.
inline void
lowercase_string(string &str)
{
    const string::size_type len = str.size();
    for (string::size_type pos = 0; pos < len; ++pos) {
        const unsigned char ch = static_cast<unsigned char>(str[pos]);
        str[pos] = tolower(ch);
    }
}
map<string, unsigned int> HtmlParser::named_ents; map<string, unsigned int> HtmlParser::named_ents;
inline static bool inline static bool
@ -75,6 +83,15 @@ p_whitespaceeqgt(char c)
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>'; return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
} }
// Look up `param` in the `parameters` map held by the parser.
// On a hit, copy the stored string into `value` and return true.
// On a miss, return false and leave `value` unmodified.
bool
HtmlParser::get_parameter(const string & param, string & value) const
{
    const map<string, string>::const_iterator found = parameters.find(param);
    if (found != parameters.end()) {
        value = found->second;
        return true;
    }
    return false;
}
HtmlParser::HtmlParser() HtmlParser::HtmlParser()
{ {
// RECOLL: no need to initialize these entities, we use those from // RECOLL: no need to initialize these entities, we use those from
@ -151,12 +168,12 @@ HtmlParser::parse_html(const string &body)
{ {
in_script = false; in_script = false;
map<string,string> Param; parameters.clear();
string::const_iterator start = body.begin(); string::const_iterator start = body.begin();
while (true) { while (true) {
// Skip through until we find an HTML tag, a comment, or the end of // Skip through until we find an HTML tag, a comment, or the end of
// document. Ignore isolated occurences of `<' which don't start // document. Ignore isolated occurrences of `<' which don't start
// a tag or comment. // a tag or comment.
string::const_iterator p = start; string::const_iterator p = start;
while (true) { while (true) {
@ -166,6 +183,7 @@ HtmlParser::parse_html(const string &body)
// Tag, closing tag, or comment (or SGML declaration). // Tag, closing tag, or comment (or SGML declaration).
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
if (ch == '?') { if (ch == '?') {
// PHP code or XML declaration. // PHP code or XML declaration.
// XML declaration is only valid at the start of the first line. // XML declaration is only valid at the start of the first line.
@ -181,7 +199,7 @@ HtmlParser::parse_html(const string &body)
if (decl_end == body.end()) break; if (decl_end == body.end()) break;
// Default charset for XML is UTF-8. // Default charset for XML is UTF-8.
charset = "UTF-8"; charset = "utf-8";
string decl(p + 6, decl_end); string decl(p + 6, decl_end);
size_t enc = decl.find("encoding"); size_t enc = decl.find("encoding");
@ -286,11 +304,11 @@ HtmlParser::parse_html(const string &body)
start = find_if(start, body.end(), p_nottag); start = find_if(start, body.end(), p_nottag);
string tag = body.substr(p - body.begin(), start - p); string tag = body.substr(p - body.begin(), start - p);
// convert tagname to lowercase // convert tagname to lowercase
for (string::iterator i = tag.begin(); i != tag.end(); ++i) lowercase_string(tag);
*i = tolower(static_cast<unsigned char>(*i));
if (closing) { if (closing) {
closing_tag(tag); if (!closing_tag(tag))
return;
if (in_script && tag == "script") in_script = false; if (in_script && tag == "script") in_script = false;
/* ignore any bogus parameters on closing tags */ /* ignore any bogus parameters on closing tags */
@ -298,24 +316,34 @@ HtmlParser::parse_html(const string &body)
if (p == body.end()) break; if (p == body.end()) break;
start = p + 1; start = p + 1;
} else { } else {
bool empty_element = false;
// FIXME: parse parameters lazily.
while (start < body.end() && *start != '>') { while (start < body.end() && *start != '>') {
string name, value; string name, value;
p = find_if(start, body.end(), p_whitespaceeqgt); p = find_if(start, body.end(), p_whitespaceeqgt);
name = body.substr(start - body.begin(), p - start); size_t name_len = p - start;
if (name_len == 1) {
if (*start == '/' && p < body.end() && *p == '>') {
// E.g. <tag foo="bar" />
start = p;
empty_element = true;
break;
}
}
name.assign(body, start - body.begin(), name_len);
p = find_if(p, body.end(), p_notwhitespace); p = find_if(p, body.end(), p_notwhitespace);
start = p; start = p;
if (start != body.end() && *start == '=') { if (start != body.end() && *start == '=') {
int quote;
start = find_if(start + 1, body.end(), p_notwhitespace); start = find_if(start + 1, body.end(), p_notwhitespace);
p = body.end(); p = body.end();
quote = *start; int quote = *start;
if (quote == '"' || quote == '\'') { if (quote == '"' || quote == '\'') {
start++; start++;
p = find(start, body.end(), quote); p = find(start, body.end(), quote);
@ -324,28 +352,35 @@ HtmlParser::parse_html(const string &body)
if (p == body.end()) { if (p == body.end()) {
// unquoted or no closing quote // unquoted or no closing quote
p = find_if(start, body.end(), p_whitespacegt); p = find_if(start, body.end(), p_whitespacegt);
value = body.substr(start - body.begin(), p - start);
start = find_if(p, body.end(), p_notwhitespace);
} else {
value = body.substr(start - body.begin(), p - start);
} }
value.assign(body, start - body.begin(), p - start);
start = find_if(p, body.end(), p_notwhitespace);
if (name.size()) { if (!name.empty()) {
// convert parameter name to lowercase // convert parameter name to lowercase
string::iterator i; lowercase_string(name);
for (i = name.begin(); i != name.end(); ++i)
*i = tolower(static_cast<unsigned char>(*i));
// in case of multiple entries, use the first // in case of multiple entries, use the first
// (as Netscape does) // (as Netscape does)
if (Param.find(name) == Param.end()) parameters.insert(make_pair(name, value));
Param[name] = value;
} }
} }
} }
opening_tag(tag, Param); #if 0
Param.clear(); cout << "<" << tag;
map<string, string>::const_iterator x;
for (x = parameters.begin(); x != parameters.end(); x++) {
cout << " " << x->first << "=\"" << x->second << "\"";
}
cout << ">\n";
#endif
if (!opening_tag(tag))
return;
parameters.clear();
if (empty_element) {
if (!closing_tag(tag))
return;
}
// In <script> tags we ignore opening tags to avoid problems // In <script> tags we ignore opening tags to avoid problems
// with "a<b". // with "a<b".

View File

@ -31,16 +31,18 @@ using std::string;
using std::map; using std::map;
class HtmlParser { class HtmlParser {
map<string, string> parameters;
protected: protected:
virtual void decode_entities(string &s); virtual void decode_entities(string &s);
bool in_script; bool in_script;
string charset; string charset;
static map<string, unsigned int> named_ents; static map<string, unsigned int> named_ents;
bool get_parameter(const string & param, string & value) const;
public: public:
virtual void process_text(const string &/*text*/) { } virtual void process_text(const string &/*text*/) { }
virtual void opening_tag(const string &/*tag*/, virtual bool opening_tag(const string &/*tag*/) { return true; }
const map<string,string> &/*p*/) { } virtual bool closing_tag(const string &/*tag*/) { return true; }
virtual void closing_tag(const string &/*tag*/) { }
virtual void parse_html(const string &text); virtual void parse_html(const string &text);
virtual void do_eof() {} virtual void do_eof() {}
HtmlParser(); HtmlParser();

View File

@ -1,4 +1,4 @@
/* This file was copied from omega-0.8.5 and modified */ /* This file was copied from omega-0.8.5->1.2.6 and modified */
/* myhtmlparse.cc: subclass of HtmlParser for extracting text /* myhtmlparse.cc: subclass of HtmlParser for extracting text
* *
@ -287,8 +287,8 @@ MyHtmlParser::process_text(const string &text)
} }
} }
void bool
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p) MyHtmlParser::opening_tag(const string &tag)
{ {
LOGDEB2(("opening_tag: [%s]\n", tag.c_str())); LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
#if 0 #if 0
@ -298,14 +298,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
cout << " " << x->first << " -> '" << x->second << "'" << endl; cout << " " << x->first << " -> '" << x->second << "'" << endl;
} }
#endif #endif
if (tag.empty()) return; if (tag.empty()) return true;
switch (tag[0]) { switch (tag[0]) {
case 'a': case 'a':
if (tag == "address") pending_space = true; if (tag == "address") pending_space = true;
break; break;
case 'b': case 'b':
if (tag == "body") { if (tag == "body") {
dump = ""; dump.resize(0);
in_body_tag = true; in_body_tag = true;
break; break;
} }
@ -351,21 +351,20 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
break; break;
case 'm': case 'm':
if (tag == "meta") { if (tag == "meta") {
map<string, string>::const_iterator i, j; string content;
if ((i = p.find("content")) != p.end()) { if (get_parameter("content", content)) {
if ((j = p.find("name")) != p.end()) { string name;
string name = j->second; if (get_parameter("name", name)) {
lowercase_term(name); lowercase_term(name);
if (name == "date") { if (name == "date") {
// Yes this doesnt exist. It's output by filters // Yes this doesnt exist. It's output by filters
// And the format isn't even standard http/html // And the format isn't even standard http/html
// FIXME // FIXME
string tmp = i->second; decode_entities(content);
decode_entities(tmp);
struct tm tm; struct tm tm;
if (strptime(tmp.c_str(), if (strptime(content.c_str(),
" %Y-%m-%d %H:%M:%S ", &tm) || " %Y-%m-%d %H:%M:%S ", &tm) ||
strptime(tmp.c_str(), strptime(content.c_str(),
"%Y-%m-%dT%H:%M:%S", &tm) "%Y-%m-%dT%H:%M:%S", &tm)
) { ) {
char ascuxtime[100]; char ascuxtime[100];
@ -376,17 +375,16 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
} else { } else {
if (!meta[name].empty()) if (!meta[name].empty())
meta[name] += ' '; meta[name] += ' ';
string tmp = i->second; decode_entities(content);
decode_entities(tmp); meta[name] += content;
meta[name] += tmp;
} }
} else if ((j = p.find("http-equiv")) != p.end()) { }
string hequiv = j->second; string hdr;
lowercase_term(hequiv); if (get_parameter("http-equiv", hdr)) {
if (hequiv == "content-type") { lowercase_term(hdr);
string value = i->second; if (hdr == "content-type") {
MimeHeaderValue p; MimeHeaderValue p;
parseMimeHeaderValue(value, p); parseMimeHeaderValue(content, p);
map<string, string>::const_iterator k; map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != if ((k = p.params.find("charset")) !=
p.params.end()) { p.params.end()) {
@ -445,13 +443,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
if (tag == "xmp") pending_space = true; if (tag == "xmp") pending_space = true;
break; break;
} }
return true;
} }
void bool
MyHtmlParser::closing_tag(const string &tag) MyHtmlParser::closing_tag(const string &tag)
{ {
LOGDEB2(("closing_tag: [%s]\n", tag.c_str())); LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
if (tag.empty()) return; if (tag.empty()) return true;
switch (tag[0]) { switch (tag[0]) {
case 'a': case 'a':
if (tag == "address") pending_space = true; if (tag == "address") pending_space = true;
@ -460,7 +459,7 @@ MyHtmlParser::closing_tag(const string &tag)
if (tag == "body") { if (tag == "body") {
LOGDEB1(("Myhtmlparse: body close tag found\n")); LOGDEB1(("Myhtmlparse: body close tag found\n"));
in_body_tag = false; in_body_tag = false;
throw true; return false;
} }
if (tag == "blockquote" || tag == "br") pending_space = true; if (tag == "blockquote" || tag == "br") pending_space = true;
break; break;
@ -532,6 +531,7 @@ MyHtmlParser::closing_tag(const string &tag)
if (tag == "xmp") pending_space = true; if (tag == "xmp") pending_space = true;
break; break;
} }
return true;
} }
// This gets called when hitting eof. // This gets called when hitting eof.

View File

@ -55,8 +55,8 @@ class MyHtmlParser : public HtmlParser {
bool indexing_allowed; bool indexing_allowed;
void process_text(const string &text); void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p); bool opening_tag(const string &tag);
void closing_tag(const string &tag); bool closing_tag(const string &tag);
void do_eof(); void do_eof();
void decode_entities(string &s); void decode_entities(string &s);
void reset_charsets() {fromcharset = tocharset = "";} void reset_charsets() {fromcharset = tocharset = "";}