From c7a241d26e2650debb92892db05d4a2df89bb241 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Fri, 24 Jun 2011 10:41:54 +0200
Subject: [PATCH] htmlparse: merged some updates from xapian 1.2.6

---
 src/internfile/htmlparse.cpp   | 103 ++++++++++++++++++++++-----------
 src/internfile/htmlparse.h     |   8 ++-
 src/internfile/myhtmlparse.cpp |  50 ++++++++--------
 src/internfile/myhtmlparse.h   |   4 +-
 4 files changed, 101 insertions(+), 64 deletions(-)
diff --git a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp
index 7d1fadb3..5e4e8247 100644
--- a/src/internfile/htmlparse.cpp
+++ b/src/internfile/htmlparse.cpp
@@ -1,10 +1,10 @@
-/* This file was copied/updated from xapian-omega-1.0.1 and modified */
+/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
 
 /* htmlparse.cc: simple HTML parser for omega indexer
  *
  * Copyright 1999,2000,2001 BrightStation PLC
  * Copyright 2001 Ananova Ltd
- * Copyright 2002,2006 Olly Betts
+ * Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -30,6 +30,14 @@ using std::find_if;
 #include <ctype.h>
 #include <cstring>
 
+// Lowercase str in place, byte by byte; static so the name stays file-local.
+static inline void
+lowercase_string(string &str)
+{
+    for (string::iterator i = str.begin(); i != str.end(); ++i)
+	*i = static_cast<char>(tolower(static_cast<unsigned char>(*i)));
+}
+
 map<string, unsigned int> HtmlParser::named_ents;
 
 inline static bool
@@ -75,6 +83,15 @@ p_whitespaceeqgt(char c)
     return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
 }
 
+// Copy the stored parameter named param into value; false (value kept) if absent.
+bool
+HtmlParser::get_parameter(const string & param, string & value) const
+{
+    const map<string, string>::const_iterator hit = parameters.find(param);
+    if (hit != parameters.end()) value = hit->second;
+    return hit != parameters.end();
+}
+
 HtmlParser::HtmlParser()
 {
     // RECOLL: no need to initialize these entities, we use those from
@@ -151,12 +168,12 @@ HtmlParser::parse_html(const string &body)
 {
     in_script = false;
 
-    map<string,string> Param;
+    parameters.clear();
     string::const_iterator start = body.begin();
 
     while (true) {
 	// Skip through until we find an HTML tag, a comment, or the end of
-	// document.  Ignore isolated occurences of `<' which don't start
+	// document.  Ignore isolated occurrences of `<' which don't start
 	// a tag or comment.	
 	string::const_iterator p = start;
 	while (true) {
@@ -166,6 +183,7 @@ HtmlParser::parse_html(const string &body)
 
 	    // Tag, closing tag, or comment (or SGML declaration).
 	    if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
+
 	    if (ch == '?') {
 		// PHP code or XML declaration.
 		// XML declaration is only valid at the start of the first line.
@@ -181,7 +199,7 @@ HtmlParser::parse_html(const string &body)
 		if (decl_end == body.end()) break;
 
 		// Default charset for XML is UTF-8.
-		charset = "UTF-8";
+		charset = "utf-8";
 
 		string decl(p + 6, decl_end);
 		size_t enc = decl.find("encoding");
@@ -205,7 +223,7 @@ HtmlParser::parse_html(const string &body)
 
 		break;
 	    }
-	    p++; 
+	    p++;
 	}
 
 	// Process text up to start of tag.
@@ -286,66 +304,83 @@ HtmlParser::parse_html(const string &body)
 	    start = find_if(start, body.end(), p_nottag);
 	    string tag = body.substr(p - body.begin(), start - p);
 	    // convert tagname to lowercase
-	    for (string::iterator i = tag.begin(); i != tag.end(); ++i)
-		*i = tolower(static_cast<unsigned char>(*i));
-	       
+	    lowercase_string(tag);
+
 	    if (closing) {
-		closing_tag(tag);
+		if (!closing_tag(tag))
+		    return;
 		if (in_script && tag == "script") in_script = false;
-		   
+
 		/* ignore any bogus parameters on closing tags */
 		p = find(start, body.end(), '>');
 		if (p == body.end()) break;
 		start = p + 1;
 	    } else {
+		bool empty_element = false;
+		// FIXME: parse parameters lazily.
 		while (start < body.end() && *start != '>') {
 		    string name, value;
 
 		    p = find_if(start, body.end(), p_whitespaceeqgt);
 
-		    name = body.substr(start - body.begin(), p - start);
-		       
+		    size_t name_len = p - start;
+		    if (name_len == 1) {
+			if (*start == '/' && p < body.end() && *p == '>') {
+			    // E.g. <tag foo="bar" />
+			    start = p;
+			    empty_element = true;
+			    break;
+			}
+		    }
+
+		    name.assign(body, start - body.begin(), name_len);
+
 		    p = find_if(p, body.end(), p_notwhitespace);
-		      
+
 		    start = p;
 		    if (start != body.end() && *start == '=') {
-			int quote;
-		       
 			start = find_if(start + 1, body.end(), p_notwhitespace);
 
 			p = body.end();
-			   
-			quote = *start;
+
+			int quote = *start;
 			if (quote == '"' || quote == '\'') {
 			    start++;
 			    p = find(start, body.end(), quote);
 			}
-			   
+
 			if (p == body.end()) {
 			    // unquoted or no closing quote
 			    p = find_if(start, body.end(), p_whitespacegt);
-			    
-			    value = body.substr(start - body.begin(), p - start);
-
-			    start = find_if(p, body.end(), p_notwhitespace);
-			} else {
-			    value = body.substr(start - body.begin(), p - start);
 			}
-		       
-			if (name.size()) {
+			value.assign(body, start - body.begin(), p - start);
+			start = find_if(p, body.end(), p_notwhitespace);
+
+			if (!name.empty()) {
 			    // convert parameter name to lowercase
-			    string::iterator i;
-			    for (i = name.begin(); i != name.end(); ++i)
-				*i = tolower(static_cast<unsigned char>(*i));
+			    lowercase_string(name);
 			    // in case of multiple entries, use the first
 			    // (as Netscape does)
-			    if (Param.find(name) == Param.end())
-				Param[name] = value;
+			    parameters.insert(make_pair(name, value));
 			}
 		    }
 		}
-		opening_tag(tag, Param);
-		Param.clear();
+#if 0
+		cout << "<" << tag;
+		map<string, string>::const_iterator x;
+		for (x = parameters.begin(); x != parameters.end(); x++) {
+		    cout << " " << x->first << "=\"" << x->second << "\"";
+		}
+		cout << ">\n";
+#endif
+		if (!opening_tag(tag))
+		    return;
+		parameters.clear();
+
+		if (empty_element) {
+		    if (!closing_tag(tag))
+			return;
+		}
 
 		// In <script> tags we ignore opening tags to avoid problems
 		// with "a<b".
diff --git a/src/internfile/htmlparse.h b/src/internfile/htmlparse.h
index 0c6a1309..3d71847c 100644
--- a/src/internfile/htmlparse.h
+++ b/src/internfile/htmlparse.h
@@ -31,16 +31,18 @@ using std::string;
 using std::map;
 
 class HtmlParser {
+	map<string, string> parameters;
     protected:
         virtual void decode_entities(string &s);
         bool in_script;
         string charset;
 	static map<string, unsigned int> named_ents;
+
+	bool get_parameter(const string & param, string & value) const;
     public:
 	virtual void process_text(const string &/*text*/) { }
-	virtual void opening_tag(const string &/*tag*/,
-				 const map<string,string> &/*p*/) { }
-	virtual void closing_tag(const string &/*tag*/) { }
+	virtual bool opening_tag(const string &/*tag*/) { return true; }
+        virtual bool closing_tag(const string &/*tag*/) { return true; }
 	virtual void parse_html(const string &text);
 	virtual void do_eof() {}
 	HtmlParser();
diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index d000627a..54cb40ed 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -1,4 +1,4 @@
-/* This file was copied from omega-0.8.5 and modified */
+/* This file was copied from omega-0.8.5->1.2.6 and modified */
 
 /* myhtmlparse.cc: subclass of HtmlParser for extracting text
  *
@@ -287,8 +287,8 @@ MyHtmlParser::process_text(const string &text)
     }
 }
 
-void
-MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
+bool
+MyHtmlParser::opening_tag(const string &tag)
 {
     LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
 #if 0
@@ -298,14 +298,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 	cout << "  " << x->first << " -> '" << x->second << "'" << endl;
     }
 #endif
-    if (tag.empty()) return;
+    if (tag.empty()) return true;
     switch (tag[0]) {
 	case 'a':
 	    if (tag == "address") pending_space = true;
 	    break;
 	case 'b':
 	    if (tag == "body") {
-		dump = "";
+		dump.resize(0);
 		in_body_tag = true;
 		break;
 	    }
@@ -351,21 +351,20 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 	    break;
 	case 'm':
 	    if (tag == "meta") {
-		map<string, string>::const_iterator i, j;
-		if ((i = p.find("content")) != p.end()) {
-		    if ((j = p.find("name")) != p.end()) {
-			string name = j->second;
+		string content;
+		if (get_parameter("content", content)) {
+		    string name;
+		    if (get_parameter("name", name)) {
 			lowercase_term(name);
 			if (name == "date") {
 			    // Yes this doesnt exist. It's output by filters
 			    // And the format isn't even standard http/html
 			    // FIXME
-			    string tmp = i->second;
-			    decode_entities(tmp);
+			    decode_entities(content);
 			    struct tm tm;
-			    if (strptime(tmp.c_str(), 
+			    if (strptime(content.c_str(), 
 					 " %Y-%m-%d %H:%M:%S ", &tm) ||
-				strptime(tmp.c_str(), 
+				strptime(content.c_str(), 
 					 "%Y-%m-%dT%H:%M:%S", &tm)
 				) {
 				char ascuxtime[100];
@@ -376,17 +375,16 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 			} else {
 			    if (!meta[name].empty())
 				meta[name] += ' ';
-			    string tmp = i->second;
-			    decode_entities(tmp);
-			    meta[name] += tmp;
+			    decode_entities(content);
+			    meta[name] += content;
 			}
-		    } else if ((j = p.find("http-equiv")) != p.end()) {
-			string hequiv = j->second;
-			lowercase_term(hequiv);
-			if (hequiv == "content-type") {
-			    string value = i->second;
+		    } 
+		    string hdr;
+		    if (get_parameter("http-equiv", hdr)) {
+			lowercase_term(hdr);
+			if (hdr == "content-type") {
 			    MimeHeaderValue p;
-			    parseMimeHeaderValue(value, p);
+			    parseMimeHeaderValue(content, p);
 			    map<string, string>::const_iterator k;
 			    if ((k = p.params.find("charset")) != 
 				p.params.end()) {
@@ -445,13 +443,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 	    if (tag == "xmp") pending_space = true;
 	    break;
     }
+    return true;
 }
 
-void
+bool
 MyHtmlParser::closing_tag(const string &tag)
 {
     LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
-    if (tag.empty()) return;
+    if (tag.empty()) return true;
     switch (tag[0]) {
 	case 'a':
 	    if (tag == "address") pending_space = true;
@@ -460,7 +459,7 @@ MyHtmlParser::closing_tag(const string &tag)
 	    if (tag == "body") {
 		LOGDEB1(("Myhtmlparse: body close tag found\n"));
 		in_body_tag = false;
-		throw true;
+		return false;
 	    }
 	    if (tag == "blockquote" || tag == "br") pending_space = true;
 	    break;
@@ -532,6 +531,7 @@ MyHtmlParser::closing_tag(const string &tag)
 	    if (tag == "xmp") pending_space = true;
 	    break;
     }
+    return true;
 }
 
 // This gets called when hitting eof. 
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index 82830eca..6bad0637 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -55,8 +55,8 @@ class MyHtmlParser : public HtmlParser {
     bool indexing_allowed;
 
     void process_text(const string &text);
-    void opening_tag(const string &tag, const map<string,string> &p);
-    void closing_tag(const string &tag);
+    bool opening_tag(const string &tag);
+    bool closing_tag(const string &tag);
     void do_eof();
     void decode_entities(string &s);
     void reset_charsets() {fromcharset = tocharset = "";}