htmlparse: merged some updates from xapian 1.2.6
This commit is contained in:
parent
8fe524bd7f
commit
c7a241d26e
@ -1,10 +1,10 @@
|
|||||||
/* This file was copied/updated from xapian-omega-1.0.1 and modified */
|
/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
|
||||||
|
|
||||||
/* htmlparse.cc: simple HTML parser for omega indexer
|
/* htmlparse.cc: simple HTML parser for omega indexer
|
||||||
*
|
*
|
||||||
* Copyright 1999,2000,2001 BrightStation PLC
|
* Copyright 1999,2000,2001 BrightStation PLC
|
||||||
* Copyright 2001 Ananova Ltd
|
* Copyright 2001 Ananova Ltd
|
||||||
* Copyright 2002,2006 Olly Betts
|
* Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
|
||||||
*
|
*
|
||||||
* This program is free software; you can redistribute it and/or
|
* This program is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU General Public License as
|
* modify it under the terms of the GNU General Public License as
|
||||||
@ -30,6 +30,14 @@ using std::find_if;
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
|
inline void
|
||||||
|
lowercase_string(string &str)
|
||||||
|
{
|
||||||
|
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
||||||
|
*i = tolower(static_cast<unsigned char>(*i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
map<string, unsigned int> HtmlParser::named_ents;
|
map<string, unsigned int> HtmlParser::named_ents;
|
||||||
|
|
||||||
inline static bool
|
inline static bool
|
||||||
@ -75,6 +83,15 @@ p_whitespaceeqgt(char c)
|
|||||||
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
|
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
HtmlParser::get_parameter(const string & param, string & value) const
|
||||||
|
{
|
||||||
|
map<string, string>::const_iterator i = parameters.find(param);
|
||||||
|
if (i == parameters.end()) return false;
|
||||||
|
value = i->second;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
HtmlParser::HtmlParser()
|
HtmlParser::HtmlParser()
|
||||||
{
|
{
|
||||||
// RECOLL: no need to initialize these entities, we use those from
|
// RECOLL: no need to initialize these entities, we use those from
|
||||||
@ -151,12 +168,12 @@ HtmlParser::parse_html(const string &body)
|
|||||||
{
|
{
|
||||||
in_script = false;
|
in_script = false;
|
||||||
|
|
||||||
map<string,string> Param;
|
parameters.clear();
|
||||||
string::const_iterator start = body.begin();
|
string::const_iterator start = body.begin();
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// Skip through until we find an HTML tag, a comment, or the end of
|
// Skip through until we find an HTML tag, a comment, or the end of
|
||||||
// document. Ignore isolated occurences of `<' which don't start
|
// document. Ignore isolated occurrences of `<' which don't start
|
||||||
// a tag or comment.
|
// a tag or comment.
|
||||||
string::const_iterator p = start;
|
string::const_iterator p = start;
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -166,6 +183,7 @@ HtmlParser::parse_html(const string &body)
|
|||||||
|
|
||||||
// Tag, closing tag, or comment (or SGML declaration).
|
// Tag, closing tag, or comment (or SGML declaration).
|
||||||
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
||||||
|
|
||||||
if (ch == '?') {
|
if (ch == '?') {
|
||||||
// PHP code or XML declaration.
|
// PHP code or XML declaration.
|
||||||
// XML declaration is only valid at the start of the first line.
|
// XML declaration is only valid at the start of the first line.
|
||||||
@ -181,7 +199,7 @@ HtmlParser::parse_html(const string &body)
|
|||||||
if (decl_end == body.end()) break;
|
if (decl_end == body.end()) break;
|
||||||
|
|
||||||
// Default charset for XML is UTF-8.
|
// Default charset for XML is UTF-8.
|
||||||
charset = "UTF-8";
|
charset = "utf-8";
|
||||||
|
|
||||||
string decl(p + 6, decl_end);
|
string decl(p + 6, decl_end);
|
||||||
size_t enc = decl.find("encoding");
|
size_t enc = decl.find("encoding");
|
||||||
@ -205,7 +223,7 @@ HtmlParser::parse_html(const string &body)
|
|||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process text up to start of tag.
|
// Process text up to start of tag.
|
||||||
@ -286,66 +304,83 @@ HtmlParser::parse_html(const string &body)
|
|||||||
start = find_if(start, body.end(), p_nottag);
|
start = find_if(start, body.end(), p_nottag);
|
||||||
string tag = body.substr(p - body.begin(), start - p);
|
string tag = body.substr(p - body.begin(), start - p);
|
||||||
// convert tagname to lowercase
|
// convert tagname to lowercase
|
||||||
for (string::iterator i = tag.begin(); i != tag.end(); ++i)
|
lowercase_string(tag);
|
||||||
*i = tolower(static_cast<unsigned char>(*i));
|
|
||||||
|
|
||||||
if (closing) {
|
if (closing) {
|
||||||
closing_tag(tag);
|
if (!closing_tag(tag))
|
||||||
|
return;
|
||||||
if (in_script && tag == "script") in_script = false;
|
if (in_script && tag == "script") in_script = false;
|
||||||
|
|
||||||
/* ignore any bogus parameters on closing tags */
|
/* ignore any bogus parameters on closing tags */
|
||||||
p = find(start, body.end(), '>');
|
p = find(start, body.end(), '>');
|
||||||
if (p == body.end()) break;
|
if (p == body.end()) break;
|
||||||
start = p + 1;
|
start = p + 1;
|
||||||
} else {
|
} else {
|
||||||
|
bool empty_element = false;
|
||||||
|
// FIXME: parse parameters lazily.
|
||||||
while (start < body.end() && *start != '>') {
|
while (start < body.end() && *start != '>') {
|
||||||
string name, value;
|
string name, value;
|
||||||
|
|
||||||
p = find_if(start, body.end(), p_whitespaceeqgt);
|
p = find_if(start, body.end(), p_whitespaceeqgt);
|
||||||
|
|
||||||
name = body.substr(start - body.begin(), p - start);
|
size_t name_len = p - start;
|
||||||
|
if (name_len == 1) {
|
||||||
|
if (*start == '/' && p < body.end() && *p == '>') {
|
||||||
|
// E.g. <tag foo="bar" />
|
||||||
|
start = p;
|
||||||
|
empty_element = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
name.assign(body, start - body.begin(), name_len);
|
||||||
|
|
||||||
p = find_if(p, body.end(), p_notwhitespace);
|
p = find_if(p, body.end(), p_notwhitespace);
|
||||||
|
|
||||||
start = p;
|
start = p;
|
||||||
if (start != body.end() && *start == '=') {
|
if (start != body.end() && *start == '=') {
|
||||||
int quote;
|
|
||||||
|
|
||||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
start = find_if(start + 1, body.end(), p_notwhitespace);
|
||||||
|
|
||||||
p = body.end();
|
p = body.end();
|
||||||
|
|
||||||
quote = *start;
|
int quote = *start;
|
||||||
if (quote == '"' || quote == '\'') {
|
if (quote == '"' || quote == '\'') {
|
||||||
start++;
|
start++;
|
||||||
p = find(start, body.end(), quote);
|
p = find(start, body.end(), quote);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p == body.end()) {
|
if (p == body.end()) {
|
||||||
// unquoted or no closing quote
|
// unquoted or no closing quote
|
||||||
p = find_if(start, body.end(), p_whitespacegt);
|
p = find_if(start, body.end(), p_whitespacegt);
|
||||||
|
|
||||||
value = body.substr(start - body.begin(), p - start);
|
|
||||||
|
|
||||||
start = find_if(p, body.end(), p_notwhitespace);
|
|
||||||
} else {
|
|
||||||
value = body.substr(start - body.begin(), p - start);
|
|
||||||
}
|
}
|
||||||
|
value.assign(body, start - body.begin(), p - start);
|
||||||
if (name.size()) {
|
start = find_if(p, body.end(), p_notwhitespace);
|
||||||
|
|
||||||
|
if (!name.empty()) {
|
||||||
// convert parameter name to lowercase
|
// convert parameter name to lowercase
|
||||||
string::iterator i;
|
lowercase_string(name);
|
||||||
for (i = name.begin(); i != name.end(); ++i)
|
|
||||||
*i = tolower(static_cast<unsigned char>(*i));
|
|
||||||
// in case of multiple entries, use the first
|
// in case of multiple entries, use the first
|
||||||
// (as Netscape does)
|
// (as Netscape does)
|
||||||
if (Param.find(name) == Param.end())
|
parameters.insert(make_pair(name, value));
|
||||||
Param[name] = value;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
opening_tag(tag, Param);
|
#if 0
|
||||||
Param.clear();
|
cout << "<" << tag;
|
||||||
|
map<string, string>::const_iterator x;
|
||||||
|
for (x = parameters.begin(); x != parameters.end(); x++) {
|
||||||
|
cout << " " << x->first << "=\"" << x->second << "\"";
|
||||||
|
}
|
||||||
|
cout << ">\n";
|
||||||
|
#endif
|
||||||
|
if (!opening_tag(tag))
|
||||||
|
return;
|
||||||
|
parameters.clear();
|
||||||
|
|
||||||
|
if (empty_element) {
|
||||||
|
if (!closing_tag(tag))
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// In <script> tags we ignore opening tags to avoid problems
|
// In <script> tags we ignore opening tags to avoid problems
|
||||||
// with "a<b".
|
// with "a<b".
|
||||||
|
|||||||
@ -31,16 +31,18 @@ using std::string;
|
|||||||
using std::map;
|
using std::map;
|
||||||
|
|
||||||
class HtmlParser {
|
class HtmlParser {
|
||||||
|
map<string, string> parameters;
|
||||||
protected:
|
protected:
|
||||||
virtual void decode_entities(string &s);
|
virtual void decode_entities(string &s);
|
||||||
bool in_script;
|
bool in_script;
|
||||||
string charset;
|
string charset;
|
||||||
static map<string, unsigned int> named_ents;
|
static map<string, unsigned int> named_ents;
|
||||||
|
|
||||||
|
bool get_parameter(const string & param, string & value) const;
|
||||||
public:
|
public:
|
||||||
virtual void process_text(const string &/*text*/) { }
|
virtual void process_text(const string &/*text*/) { }
|
||||||
virtual void opening_tag(const string &/*tag*/,
|
virtual bool opening_tag(const string &/*tag*/) { return true; }
|
||||||
const map<string,string> &/*p*/) { }
|
virtual bool closing_tag(const string &/*tag*/) { return true; }
|
||||||
virtual void closing_tag(const string &/*tag*/) { }
|
|
||||||
virtual void parse_html(const string &text);
|
virtual void parse_html(const string &text);
|
||||||
virtual void do_eof() {}
|
virtual void do_eof() {}
|
||||||
HtmlParser();
|
HtmlParser();
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* This file was copied from omega-0.8.5 and modified */
|
/* This file was copied from omega-0.8.5->1.2.6 and modified */
|
||||||
|
|
||||||
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
|
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
|
||||||
*
|
*
|
||||||
@ -287,8 +287,8 @@ MyHtmlParser::process_text(const string &text)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
bool
|
||||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
MyHtmlParser::opening_tag(const string &tag)
|
||||||
{
|
{
|
||||||
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
|
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
|
||||||
#if 0
|
#if 0
|
||||||
@ -298,14 +298,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (tag.empty()) return;
|
if (tag.empty()) return true;
|
||||||
switch (tag[0]) {
|
switch (tag[0]) {
|
||||||
case 'a':
|
case 'a':
|
||||||
if (tag == "address") pending_space = true;
|
if (tag == "address") pending_space = true;
|
||||||
break;
|
break;
|
||||||
case 'b':
|
case 'b':
|
||||||
if (tag == "body") {
|
if (tag == "body") {
|
||||||
dump = "";
|
dump.resize(0);
|
||||||
in_body_tag = true;
|
in_body_tag = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -351,21 +351,20 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
break;
|
break;
|
||||||
case 'm':
|
case 'm':
|
||||||
if (tag == "meta") {
|
if (tag == "meta") {
|
||||||
map<string, string>::const_iterator i, j;
|
string content;
|
||||||
if ((i = p.find("content")) != p.end()) {
|
if (get_parameter("content", content)) {
|
||||||
if ((j = p.find("name")) != p.end()) {
|
string name;
|
||||||
string name = j->second;
|
if (get_parameter("name", name)) {
|
||||||
lowercase_term(name);
|
lowercase_term(name);
|
||||||
if (name == "date") {
|
if (name == "date") {
|
||||||
// Yes this doesnt exist. It's output by filters
|
// Yes this doesnt exist. It's output by filters
|
||||||
// And the format isn't even standard http/html
|
// And the format isn't even standard http/html
|
||||||
// FIXME
|
// FIXME
|
||||||
string tmp = i->second;
|
decode_entities(content);
|
||||||
decode_entities(tmp);
|
|
||||||
struct tm tm;
|
struct tm tm;
|
||||||
if (strptime(tmp.c_str(),
|
if (strptime(content.c_str(),
|
||||||
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
||||||
strptime(tmp.c_str(),
|
strptime(content.c_str(),
|
||||||
"%Y-%m-%dT%H:%M:%S", &tm)
|
"%Y-%m-%dT%H:%M:%S", &tm)
|
||||||
) {
|
) {
|
||||||
char ascuxtime[100];
|
char ascuxtime[100];
|
||||||
@ -376,17 +375,16 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
} else {
|
} else {
|
||||||
if (!meta[name].empty())
|
if (!meta[name].empty())
|
||||||
meta[name] += ' ';
|
meta[name] += ' ';
|
||||||
string tmp = i->second;
|
decode_entities(content);
|
||||||
decode_entities(tmp);
|
meta[name] += content;
|
||||||
meta[name] += tmp;
|
|
||||||
}
|
}
|
||||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
}
|
||||||
string hequiv = j->second;
|
string hdr;
|
||||||
lowercase_term(hequiv);
|
if (get_parameter("http-equiv", hdr)) {
|
||||||
if (hequiv == "content-type") {
|
lowercase_term(hdr);
|
||||||
string value = i->second;
|
if (hdr == "content-type") {
|
||||||
MimeHeaderValue p;
|
MimeHeaderValue p;
|
||||||
parseMimeHeaderValue(value, p);
|
parseMimeHeaderValue(content, p);
|
||||||
map<string, string>::const_iterator k;
|
map<string, string>::const_iterator k;
|
||||||
if ((k = p.params.find("charset")) !=
|
if ((k = p.params.find("charset")) !=
|
||||||
p.params.end()) {
|
p.params.end()) {
|
||||||
@ -445,13 +443,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
if (tag == "xmp") pending_space = true;
|
if (tag == "xmp") pending_space = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
bool
|
||||||
MyHtmlParser::closing_tag(const string &tag)
|
MyHtmlParser::closing_tag(const string &tag)
|
||||||
{
|
{
|
||||||
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
|
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
|
||||||
if (tag.empty()) return;
|
if (tag.empty()) return true;
|
||||||
switch (tag[0]) {
|
switch (tag[0]) {
|
||||||
case 'a':
|
case 'a':
|
||||||
if (tag == "address") pending_space = true;
|
if (tag == "address") pending_space = true;
|
||||||
@ -460,7 +459,7 @@ MyHtmlParser::closing_tag(const string &tag)
|
|||||||
if (tag == "body") {
|
if (tag == "body") {
|
||||||
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
||||||
in_body_tag = false;
|
in_body_tag = false;
|
||||||
throw true;
|
return false;
|
||||||
}
|
}
|
||||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||||
break;
|
break;
|
||||||
@ -532,6 +531,7 @@ MyHtmlParser::closing_tag(const string &tag)
|
|||||||
if (tag == "xmp") pending_space = true;
|
if (tag == "xmp") pending_space = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This gets called when hitting eof.
|
// This gets called when hitting eof.
|
||||||
|
|||||||
@ -55,8 +55,8 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
bool indexing_allowed;
|
bool indexing_allowed;
|
||||||
|
|
||||||
void process_text(const string &text);
|
void process_text(const string &text);
|
||||||
void opening_tag(const string &tag, const map<string,string> &p);
|
bool opening_tag(const string &tag);
|
||||||
void closing_tag(const string &tag);
|
bool closing_tag(const string &tag);
|
||||||
void do_eof();
|
void do_eof();
|
||||||
void decode_entities(string &s);
|
void decode_entities(string &s);
|
||||||
void reset_charsets() {fromcharset = tocharset = "";}
|
void reset_charsets() {fromcharset = tocharset = "";}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user