htmlparse: merged some updates from xapian 1.2.6

This commit is contained in:
Jean-Francois Dockes 2011-06-24 10:41:54 +02:00
parent 8fe524bd7f
commit c7a241d26e
4 changed files with 101 additions and 64 deletions

View File

@ -1,10 +1,10 @@
/* This file was copied/updated from xapian-omega-1.0.1 and modified */
/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
/* htmlparse.cc: simple HTML parser for omega indexer
*
* Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2001 Ananova Ltd
* Copyright 2002,2006 Olly Betts
* Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@ -30,6 +30,14 @@ using std::find_if;
#include <ctype.h>
#include <cstring>
/* Lowercase every character of str in place.
 *
 * Each character is converted to unsigned char before being handed to
 * tolower(): the <ctype.h> functions take an int, and a plain char holding
 * a negative value must not be passed to them directly.
 */
inline void
lowercase_string(string &str)
{
    const string::size_type len = str.size();
    for (string::size_type pos = 0; pos < len; ++pos) {
        const unsigned char ch = static_cast<unsigned char>(str[pos]);
        str[pos] = static_cast<char>(tolower(ch));
    }
}
map<string, unsigned int> HtmlParser::named_ents;
inline static bool
@ -75,6 +83,15 @@ p_whitespaceeqgt(char c)
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
}
/* Fetch one parameter (attribute) from the set stored in `parameters`.
 *
 * param: name to look up. parse_html() lowercases parameter names before
 *        storing them, so the name is expected in lowercase.
 * value: receives a copy of the stored value when the name is present;
 *        left untouched otherwise.
 *
 * Returns true if the parameter was found, false if not.
 */
bool
HtmlParser::get_parameter(const string & param, string & value) const
{
    const map<string, string>::const_iterator hit = parameters.find(param);
    const bool found = (hit != parameters.end());
    if (found)
        value = hit->second;
    return found;
}
HtmlParser::HtmlParser()
{
// RECOLL: no need to initialize these entities, we use those from
@ -151,12 +168,12 @@ HtmlParser::parse_html(const string &body)
{
in_script = false;
map<string,string> Param;
parameters.clear();
string::const_iterator start = body.begin();
while (true) {
// Skip through until we find an HTML tag, a comment, or the end of
// document. Ignore isolated occurences of `<' which don't start
// document. Ignore isolated occurrences of `<' which don't start
// a tag or comment.
string::const_iterator p = start;
while (true) {
@ -166,6 +183,7 @@ HtmlParser::parse_html(const string &body)
// Tag, closing tag, or comment (or SGML declaration).
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
if (ch == '?') {
// PHP code or XML declaration.
// XML declaration is only valid at the start of the first line.
@ -181,7 +199,7 @@ HtmlParser::parse_html(const string &body)
if (decl_end == body.end()) break;
// Default charset for XML is UTF-8.
charset = "UTF-8";
charset = "utf-8";
string decl(p + 6, decl_end);
size_t enc = decl.find("encoding");
@ -205,7 +223,7 @@ HtmlParser::parse_html(const string &body)
break;
}
p++;
p++;
}
// Process text up to start of tag.
@ -286,66 +304,83 @@ HtmlParser::parse_html(const string &body)
start = find_if(start, body.end(), p_nottag);
string tag = body.substr(p - body.begin(), start - p);
// convert tagname to lowercase
for (string::iterator i = tag.begin(); i != tag.end(); ++i)
*i = tolower(static_cast<unsigned char>(*i));
lowercase_string(tag);
if (closing) {
closing_tag(tag);
if (!closing_tag(tag))
return;
if (in_script && tag == "script") in_script = false;
/* ignore any bogus parameters on closing tags */
p = find(start, body.end(), '>');
if (p == body.end()) break;
start = p + 1;
} else {
bool empty_element = false;
// FIXME: parse parameters lazily.
while (start < body.end() && *start != '>') {
string name, value;
p = find_if(start, body.end(), p_whitespaceeqgt);
name = body.substr(start - body.begin(), p - start);
size_t name_len = p - start;
if (name_len == 1) {
if (*start == '/' && p < body.end() && *p == '>') {
// E.g. <tag foo="bar" />
start = p;
empty_element = true;
break;
}
}
name.assign(body, start - body.begin(), name_len);
p = find_if(p, body.end(), p_notwhitespace);
start = p;
if (start != body.end() && *start == '=') {
int quote;
start = find_if(start + 1, body.end(), p_notwhitespace);
p = body.end();
quote = *start;
int quote = *start;
if (quote == '"' || quote == '\'') {
start++;
p = find(start, body.end(), quote);
}
if (p == body.end()) {
// unquoted or no closing quote
p = find_if(start, body.end(), p_whitespacegt);
value = body.substr(start - body.begin(), p - start);
start = find_if(p, body.end(), p_notwhitespace);
} else {
value = body.substr(start - body.begin(), p - start);
}
if (name.size()) {
value.assign(body, start - body.begin(), p - start);
start = find_if(p, body.end(), p_notwhitespace);
if (!name.empty()) {
// convert parameter name to lowercase
string::iterator i;
for (i = name.begin(); i != name.end(); ++i)
*i = tolower(static_cast<unsigned char>(*i));
lowercase_string(name);
// in case of multiple entries, use the first
// (as Netscape does)
if (Param.find(name) == Param.end())
Param[name] = value;
parameters.insert(make_pair(name, value));
}
}
}
opening_tag(tag, Param);
Param.clear();
#if 0
cout << "<" << tag;
map<string, string>::const_iterator x;
for (x = parameters.begin(); x != parameters.end(); x++) {
cout << " " << x->first << "=\"" << x->second << "\"";
}
cout << ">\n";
#endif
if (!opening_tag(tag))
return;
parameters.clear();
if (empty_element) {
if (!closing_tag(tag))
return;
}
// In <script> tags we ignore opening tags to avoid problems
// with "a<b".

View File

@ -31,16 +31,18 @@ using std::string;
using std::map;
class HtmlParser {
map<string, string> parameters;
protected:
virtual void decode_entities(string &s);
bool in_script;
string charset;
static map<string, unsigned int> named_ents;
bool get_parameter(const string & param, string & value) const;
public:
virtual void process_text(const string &/*text*/) { }
virtual void opening_tag(const string &/*tag*/,
const map<string,string> &/*p*/) { }
virtual void closing_tag(const string &/*tag*/) { }
virtual bool opening_tag(const string &/*tag*/) { return true; }
virtual bool closing_tag(const string &/*tag*/) { return true; }
virtual void parse_html(const string &text);
virtual void do_eof() {}
HtmlParser();

View File

@ -1,4 +1,4 @@
/* This file was copied from omega-0.8.5 and modified */
/* This file was copied from omega-0.8.5->1.2.6 and modified */
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
*
@ -287,8 +287,8 @@ MyHtmlParser::process_text(const string &text)
}
}
void
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
bool
MyHtmlParser::opening_tag(const string &tag)
{
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
#if 0
@ -298,14 +298,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
cout << " " << x->first << " -> '" << x->second << "'" << endl;
}
#endif
if (tag.empty()) return;
if (tag.empty()) return true;
switch (tag[0]) {
case 'a':
if (tag == "address") pending_space = true;
break;
case 'b':
if (tag == "body") {
dump = "";
dump.resize(0);
in_body_tag = true;
break;
}
@ -351,21 +351,20 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
break;
case 'm':
if (tag == "meta") {
map<string, string>::const_iterator i, j;
if ((i = p.find("content")) != p.end()) {
if ((j = p.find("name")) != p.end()) {
string name = j->second;
string content;
if (get_parameter("content", content)) {
string name;
if (get_parameter("name", name)) {
lowercase_term(name);
if (name == "date") {
// Yes, this doesn't exist. It's output by filters
// And the format isn't even standard http/html
// FIXME
string tmp = i->second;
decode_entities(tmp);
decode_entities(content);
struct tm tm;
if (strptime(tmp.c_str(),
if (strptime(content.c_str(),
" %Y-%m-%d %H:%M:%S ", &tm) ||
strptime(tmp.c_str(),
strptime(content.c_str(),
"%Y-%m-%dT%H:%M:%S", &tm)
) {
char ascuxtime[100];
@ -376,17 +375,16 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
} else {
if (!meta[name].empty())
meta[name] += ' ';
string tmp = i->second;
decode_entities(tmp);
meta[name] += tmp;
decode_entities(content);
meta[name] += content;
}
} else if ((j = p.find("http-equiv")) != p.end()) {
string hequiv = j->second;
lowercase_term(hequiv);
if (hequiv == "content-type") {
string value = i->second;
}
string hdr;
if (get_parameter("http-equiv", hdr)) {
lowercase_term(hdr);
if (hdr == "content-type") {
MimeHeaderValue p;
parseMimeHeaderValue(value, p);
parseMimeHeaderValue(content, p);
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) !=
p.params.end()) {
@ -445,13 +443,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
if (tag == "xmp") pending_space = true;
break;
}
return true;
}
void
bool
MyHtmlParser::closing_tag(const string &tag)
{
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
if (tag.empty()) return;
if (tag.empty()) return true;
switch (tag[0]) {
case 'a':
if (tag == "address") pending_space = true;
@ -460,7 +459,7 @@ MyHtmlParser::closing_tag(const string &tag)
if (tag == "body") {
LOGDEB1(("Myhtmlparse: body close tag found\n"));
in_body_tag = false;
throw true;
return false;
}
if (tag == "blockquote" || tag == "br") pending_space = true;
break;
@ -532,6 +531,7 @@ MyHtmlParser::closing_tag(const string &tag)
if (tag == "xmp") pending_space = true;
break;
}
return true;
}
// This gets called when hitting eof.

View File

@ -55,8 +55,8 @@ class MyHtmlParser : public HtmlParser {
bool indexing_allowed;
void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag);
bool opening_tag(const string &tag);
bool closing_tag(const string &tag);
void do_eof();
void decode_entities(string &s);
void reset_charsets() {fromcharset = tocharset = "";}