htmlparse: merged some updates from xapian 1.2.6
This commit is contained in:
parent
8fe524bd7f
commit
c7a241d26e
@ -1,10 +1,10 @@
|
||||
/* This file was copied/updated from xapian-omega-1.0.1 and modified */
|
||||
/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
|
||||
|
||||
/* htmlparse.cc: simple HTML parser for omega indexer
|
||||
*
|
||||
* Copyright 1999,2000,2001 BrightStation PLC
|
||||
* Copyright 2001 Ananova Ltd
|
||||
* Copyright 2002,2006 Olly Betts
|
||||
* Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
@ -30,6 +30,14 @@ using std::find_if;
|
||||
#include <ctype.h>
|
||||
#include <cstring>
|
||||
|
||||
// Convert every character of `str` to lowercase, in place.
// Walks the string with a mutable iterator and overwrites each element
// with the result of tolower(); nothing is returned.
inline void
|
||||
lowercase_string(string &str)
|
||||
{
|
||||
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
||||
// Cast to unsigned char first: tolower() takes an int that must be
// representable as unsigned char (or EOF), so a plain char holding a
// negative value must not be passed directly.
*i = tolower(static_cast<unsigned char>(*i));
|
||||
}
|
||||
}
|
||||
|
||||
map<string, unsigned int> HtmlParser::named_ents;
|
||||
|
||||
inline static bool
|
||||
@ -75,6 +83,15 @@ p_whitespaceeqgt(char c)
|
||||
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
|
||||
}
|
||||
|
||||
// Look up the attribute named `param` in the `parameters` map.
// On a hit, copies the stored string into `value` and returns true.
// On a miss, returns false and leaves `value` untouched.
// The lookup key is used as-is; no case conversion is done here.
bool
|
||||
HtmlParser::get_parameter(const string & param, string & value) const
|
||||
{
|
||||
map<string, string>::const_iterator i = parameters.find(param);
|
||||
// Not present: report failure without modifying the output argument.
if (i == parameters.end()) return false;
|
||||
value = i->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
HtmlParser::HtmlParser()
|
||||
{
|
||||
// RECOLL: no need to initialize these entities, we use those from
|
||||
@ -151,12 +168,12 @@ HtmlParser::parse_html(const string &body)
|
||||
{
|
||||
in_script = false;
|
||||
|
||||
map<string,string> Param;
|
||||
parameters.clear();
|
||||
string::const_iterator start = body.begin();
|
||||
|
||||
while (true) {
|
||||
// Skip through until we find an HTML tag, a comment, or the end of
|
||||
// document. Ignore isolated occurences of `<' which don't start
|
||||
// document. Ignore isolated occurrences of `<' which don't start
|
||||
// a tag or comment.
|
||||
string::const_iterator p = start;
|
||||
while (true) {
|
||||
@ -166,6 +183,7 @@ HtmlParser::parse_html(const string &body)
|
||||
|
||||
// Tag, closing tag, or comment (or SGML declaration).
|
||||
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
||||
|
||||
if (ch == '?') {
|
||||
// PHP code or XML declaration.
|
||||
// XML declaration is only valid at the start of the first line.
|
||||
@ -181,7 +199,7 @@ HtmlParser::parse_html(const string &body)
|
||||
if (decl_end == body.end()) break;
|
||||
|
||||
// Default charset for XML is UTF-8.
|
||||
charset = "UTF-8";
|
||||
charset = "utf-8";
|
||||
|
||||
string decl(p + 6, decl_end);
|
||||
size_t enc = decl.find("encoding");
|
||||
@ -205,7 +223,7 @@ HtmlParser::parse_html(const string &body)
|
||||
|
||||
break;
|
||||
}
|
||||
p++;
|
||||
p++;
|
||||
}
|
||||
|
||||
// Process text up to start of tag.
|
||||
@ -286,66 +304,83 @@ HtmlParser::parse_html(const string &body)
|
||||
start = find_if(start, body.end(), p_nottag);
|
||||
string tag = body.substr(p - body.begin(), start - p);
|
||||
// convert tagname to lowercase
|
||||
for (string::iterator i = tag.begin(); i != tag.end(); ++i)
|
||||
*i = tolower(static_cast<unsigned char>(*i));
|
||||
|
||||
lowercase_string(tag);
|
||||
|
||||
if (closing) {
|
||||
closing_tag(tag);
|
||||
if (!closing_tag(tag))
|
||||
return;
|
||||
if (in_script && tag == "script") in_script = false;
|
||||
|
||||
|
||||
/* ignore any bogus parameters on closing tags */
|
||||
p = find(start, body.end(), '>');
|
||||
if (p == body.end()) break;
|
||||
start = p + 1;
|
||||
} else {
|
||||
bool empty_element = false;
|
||||
// FIXME: parse parameters lazily.
|
||||
while (start < body.end() && *start != '>') {
|
||||
string name, value;
|
||||
|
||||
p = find_if(start, body.end(), p_whitespaceeqgt);
|
||||
|
||||
name = body.substr(start - body.begin(), p - start);
|
||||
|
||||
size_t name_len = p - start;
|
||||
if (name_len == 1) {
|
||||
if (*start == '/' && p < body.end() && *p == '>') {
|
||||
// E.g. <tag foo="bar" />
|
||||
start = p;
|
||||
empty_element = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
name.assign(body, start - body.begin(), name_len);
|
||||
|
||||
p = find_if(p, body.end(), p_notwhitespace);
|
||||
|
||||
|
||||
start = p;
|
||||
if (start != body.end() && *start == '=') {
|
||||
int quote;
|
||||
|
||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
||||
|
||||
p = body.end();
|
||||
|
||||
quote = *start;
|
||||
|
||||
int quote = *start;
|
||||
if (quote == '"' || quote == '\'') {
|
||||
start++;
|
||||
p = find(start, body.end(), quote);
|
||||
}
|
||||
|
||||
|
||||
if (p == body.end()) {
|
||||
// unquoted or no closing quote
|
||||
p = find_if(start, body.end(), p_whitespacegt);
|
||||
|
||||
value = body.substr(start - body.begin(), p - start);
|
||||
|
||||
start = find_if(p, body.end(), p_notwhitespace);
|
||||
} else {
|
||||
value = body.substr(start - body.begin(), p - start);
|
||||
}
|
||||
|
||||
if (name.size()) {
|
||||
value.assign(body, start - body.begin(), p - start);
|
||||
start = find_if(p, body.end(), p_notwhitespace);
|
||||
|
||||
if (!name.empty()) {
|
||||
// convert parameter name to lowercase
|
||||
string::iterator i;
|
||||
for (i = name.begin(); i != name.end(); ++i)
|
||||
*i = tolower(static_cast<unsigned char>(*i));
|
||||
lowercase_string(name);
|
||||
// in case of multiple entries, use the first
|
||||
// (as Netscape does)
|
||||
if (Param.find(name) == Param.end())
|
||||
Param[name] = value;
|
||||
parameters.insert(make_pair(name, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
opening_tag(tag, Param);
|
||||
Param.clear();
|
||||
#if 0
|
||||
cout << "<" << tag;
|
||||
map<string, string>::const_iterator x;
|
||||
for (x = parameters.begin(); x != parameters.end(); x++) {
|
||||
cout << " " << x->first << "=\"" << x->second << "\"";
|
||||
}
|
||||
cout << ">\n";
|
||||
#endif
|
||||
if (!opening_tag(tag))
|
||||
return;
|
||||
parameters.clear();
|
||||
|
||||
if (empty_element) {
|
||||
if (!closing_tag(tag))
|
||||
return;
|
||||
}
|
||||
|
||||
// In <script> tags we ignore opening tags to avoid problems
|
||||
// with "a<b".
|
||||
|
||||
@ -31,16 +31,18 @@ using std::string;
|
||||
using std::map;
|
||||
|
||||
class HtmlParser {
|
||||
map<string, string> parameters;
|
||||
protected:
|
||||
virtual void decode_entities(string &s);
|
||||
bool in_script;
|
||||
string charset;
|
||||
static map<string, unsigned int> named_ents;
|
||||
|
||||
bool get_parameter(const string & param, string & value) const;
|
||||
public:
|
||||
virtual void process_text(const string &/*text*/) { }
|
||||
virtual void opening_tag(const string &/*tag*/,
|
||||
const map<string,string> &/*p*/) { }
|
||||
virtual void closing_tag(const string &/*tag*/) { }
|
||||
virtual bool opening_tag(const string &/*tag*/) { return true; }
|
||||
virtual bool closing_tag(const string &/*tag*/) { return true; }
|
||||
virtual void parse_html(const string &text);
|
||||
virtual void do_eof() {}
|
||||
HtmlParser();
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* This file was copied from omega-0.8.5 and modified */
|
||||
/* This file was copied from omega-0.8.5->1.2.6 and modified */
|
||||
|
||||
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
|
||||
*
|
||||
@ -287,8 +287,8 @@ MyHtmlParser::process_text(const string &text)
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
bool
|
||||
MyHtmlParser::opening_tag(const string &tag)
|
||||
{
|
||||
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
|
||||
#if 0
|
||||
@ -298,14 +298,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||
}
|
||||
#endif
|
||||
if (tag.empty()) return;
|
||||
if (tag.empty()) return true;
|
||||
switch (tag[0]) {
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
if (tag == "body") {
|
||||
dump = "";
|
||||
dump.resize(0);
|
||||
in_body_tag = true;
|
||||
break;
|
||||
}
|
||||
@ -351,21 +351,20 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "meta") {
|
||||
map<string, string>::const_iterator i, j;
|
||||
if ((i = p.find("content")) != p.end()) {
|
||||
if ((j = p.find("name")) != p.end()) {
|
||||
string name = j->second;
|
||||
string content;
|
||||
if (get_parameter("content", content)) {
|
||||
string name;
|
||||
if (get_parameter("name", name)) {
|
||||
lowercase_term(name);
|
||||
if (name == "date") {
|
||||
// Yes this doesnt exist. It's output by filters
|
||||
// And the format isn't even standard http/html
|
||||
// FIXME
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
decode_entities(content);
|
||||
struct tm tm;
|
||||
if (strptime(tmp.c_str(),
|
||||
if (strptime(content.c_str(),
|
||||
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
||||
strptime(tmp.c_str(),
|
||||
strptime(content.c_str(),
|
||||
"%Y-%m-%dT%H:%M:%S", &tm)
|
||||
) {
|
||||
char ascuxtime[100];
|
||||
@ -376,17 +375,16 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
} else {
|
||||
if (!meta[name].empty())
|
||||
meta[name] += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
meta[name] += tmp;
|
||||
decode_entities(content);
|
||||
meta[name] += content;
|
||||
}
|
||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||
string hequiv = j->second;
|
||||
lowercase_term(hequiv);
|
||||
if (hequiv == "content-type") {
|
||||
string value = i->second;
|
||||
}
|
||||
string hdr;
|
||||
if (get_parameter("http-equiv", hdr)) {
|
||||
lowercase_term(hdr);
|
||||
if (hdr == "content-type") {
|
||||
MimeHeaderValue p;
|
||||
parseMimeHeaderValue(value, p);
|
||||
parseMimeHeaderValue(content, p);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find("charset")) !=
|
||||
p.params.end()) {
|
||||
@ -445,13 +443,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
bool
|
||||
MyHtmlParser::closing_tag(const string &tag)
|
||||
{
|
||||
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
|
||||
if (tag.empty()) return;
|
||||
if (tag.empty()) return true;
|
||||
switch (tag[0]) {
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
@ -460,7 +459,7 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||
if (tag == "body") {
|
||||
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
||||
in_body_tag = false;
|
||||
throw true;
|
||||
return false;
|
||||
}
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
break;
|
||||
@ -532,6 +531,7 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// This gets called when hitting eof.
|
||||
|
||||
@ -55,8 +55,8 @@ class MyHtmlParser : public HtmlParser {
|
||||
bool indexing_allowed;
|
||||
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag, const map<string,string> &p);
|
||||
void closing_tag(const string &tag);
|
||||
bool opening_tag(const string &tag);
|
||||
bool closing_tag(const string &tag);
|
||||
void do_eof();
|
||||
void decode_entities(string &s);
|
||||
void reset_charsets() {fromcharset = tocharset = "";}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user