From 3ca33b4eb490dba501b2200ff7ea3e5986ed91bc Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 28 Jan 2005 08:41:40 +0000 Subject: [PATCH] xapian 0.8.3 --- src/internfile/htmlparse.cpp | 389 +++++++++++++++++++++++++++++++++++ src/internfile/htmlparse.h | 42 ++++ 2 files changed, 431 insertions(+) create mode 100644 src/internfile/htmlparse.cpp create mode 100644 src/internfile/htmlparse.h diff --git a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp new file mode 100644 index 00000000..d2fab8e4 --- /dev/null +++ b/src/internfile/htmlparse.cpp @@ -0,0 +1,389 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * ----START-LICENCE---- + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 Ananova Ltd + * Copyright 2002 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * -----END-LICENCE----- + */ + +#include + +#include +using std::find; +using std::find_if; +#include "htmlparse.h" +#include +#include + +map HtmlParser::named_ents; + +inline static bool +p_alpha(char c) +{ + return isalpha(c); +} + +inline static bool +p_notdigit(char c) +{ + return !isdigit(c); +} + +inline static bool +p_notxdigit(char c) +{ + return !isxdigit(c); +} + +inline static bool +p_notalnum(char c) +{ + return !isalnum(c); +} + +inline static bool +p_notwhitespace(char c) +{ + return !isspace(c); +} + +inline static bool +p_nottag(char c) +{ + return !isalnum(c) && c != '.' && c != '-'; +} + +inline static bool +p_whitespacegt(char c) +{ + return isspace(c) || c == '>'; +} + +inline static bool +p_whitespaceeqgt(char c) +{ + return isspace(c) || c == '=' || c == '>'; +} + +HtmlParser::HtmlParser() +{ + static struct ent { const char *n; unsigned int v; } ents[] = { + { "quot", 34 }, + { "amp", 38 }, + { "lt", 60 }, + { "gt", 62 }, + { "AElig", 198 }, + { "Aacute", 193 }, + { "Acirc", 194 }, + { "Agrave", 192 }, + { "Aring", 197 }, + { "Atilde", 195 }, + { "Auml", 196 }, + { "Ccedil", 199 }, + { "ETH", 208 }, + { "Eacute", 201 }, + { "Ecirc", 202 }, + { "Egrave", 200 }, + { "Euml", 203 }, + { "Iacute", 205 }, + { "Icirc", 206 }, + { "Igrave", 204 }, + { "Iuml", 207 }, + { "Ntilde", 209 }, + { "Oacute", 211 }, + { "Ocirc", 212 }, + { "Ograve", 210 }, + { "Oslash", 216 }, + { "Otilde", 213 }, + { "Ouml", 214 }, + { "THORN", 222 }, + { "Uacute", 218 }, + { "Ucirc", 219 }, + { "Ugrave", 217 }, + { "Uuml", 220 }, + { "Yacute", 221 }, + { "aacute", 225 }, + { "acirc", 226 }, + { "acute", 180 }, + { "aelig", 230 }, + { "agrave", 224 }, + { "aring", 229 }, + { "atilde", 227 }, + { "auml", 228 }, + { "brvbar", 166 }, + { "ccedil", 231 }, + { "cedil", 184 }, + { "cent", 162 }, + { "copy", 169 }, + { "curren", 164 }, + { "deg", 176 }, + { "divide", 247 }, + { "eacute", 233 }, + { "ecirc", 234 }, + { "egrave", 232 }, + { "eth", 240 }, + { "euml", 235 }, + { "frac12", 189 }, + { "frac14", 188 }, + { "frac34", 190 }, + { "iacute", 237 }, + { "icirc", 238 }, + { "iexcl", 161 }, + { "igrave", 236 }, + { "iquest", 191 }, + { "iuml", 239 }, + { "laquo", 171 }, + { "macr", 175 }, + { "micro", 181 }, + { "middot", 183 }, + { "nbsp", 160 }, + { "not", 172 }, + { "ntilde", 241 }, + { "oacute", 243 }, + { "ocirc", 244 }, + { "ograve", 242 }, + { "ordf", 170 }, + { "ordm", 186 }, + { "oslash", 248 }, + { "otilde", 245 }, + { "ouml", 246 }, + { "para", 182 }, + { "plusmn", 177 }, + { "pound", 163 }, + { "raquo", 187 }, + { "reg", 174 }, + { "sect", 167 }, + { "shy", 173 }, + { "sup1", 185 }, + { "sup2", 178 }, + { "sup3", 179 }, + { "szlig", 223 }, + { "thorn", 254 }, + { "times", 215 }, + { "uacute", 250 }, + { "ucirc", 251 }, + { "ugrave", 249 }, + { "uml", 168 }, + { "uuml", 252 }, + { "yacute", 253 }, + { "yen", 165 }, + { "yuml", 255 }, +// iso8859-1 only for now { "OElig", 338 }, +// ditto { "oelig", 339 }, + { NULL, 0 } + }; + if (named_ents.empty()) { + struct ent *i = ents; + while (i->n) { + named_ents[string(i->n)] = i->v; + ++i; + } + } +} + +void +HtmlParser::decode_entities(string &s) +{ + // We need a const_iterator version of s.end() - otherwise the + // find() and find_if() templates don't work... + string::const_iterator amp = s.begin(), s_end = s.end(); + while ((amp = find(amp, s_end, '&')) != s_end) { + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && tolower(*p) == 'x') { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } + } else { + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map::const_iterator i; + i = named_ents.find(code); + if (i != named_ents.end()) val = i->second; + } + if (end < s_end && *end == ';') end++; + if (val) { + string::size_type amp_pos = amp - s.begin(); + s.replace(amp_pos, end - amp, 1u, char(val)); + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + 1; + } else { + amp = end; + } + } +} + +void +HtmlParser::parse_html(const string &body) +{ + map Param; + string::const_iterator start = body.begin(); + + while (1) { + // Skip through until we find an HTML tag, a comment, or the end of + // document. Ignore isolated occurences of `<' which don't start + // a tag or comment + string::const_iterator p = start; + while (1) { + p = find(p, body.end(), '<'); + if (p == body.end()) break; + char ch = *(p + 1); + // tag, closing tag, comment (or SGML declaration), or PHP + if (isalpha(ch) || ch == '/' || ch == '!' || ch == '?') break; + p++; + } + + + // process text up to start of tag + if (p > start) { + string text = body.substr(start - body.begin(), p - start); + decode_entities(text); + process_text(text); + } + + if (p == body.end()) break; + + start = p + 1; + + if (start == body.end()) break; + + if (*start == '!') { + if (++start == body.end()) break; + if (++start == body.end()) break; + // comment or SGML declaration + if (*(start - 1) == '-' && *start == '-') { + start = find(start + 1, body.end(), '>'); + // unterminated comment swallows rest of document + // (like NS, but unlike MSIE iirc) + if (start == body.end()) break; + + p = start; + // look for --> + while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) + p = find(p + 1, body.end(), '>'); + + // If we found --> skip to there, otherwise + // skip to the first > we found (as Netscape does) + if (p != body.end()) start = p; + } else { + // just an SGML declaration, perhaps giving the DTD - ignore it + start = find(start - 1, body.end(), '>'); + if (start == body.end()) break; + } + ++start; + } else if (*start == '?') { + if (++start == body.end()) break; + // PHP - swallow until ?> or EOF + start = find(start + 1, body.end(), '>'); + + // look for ?> + while (start != body.end() && *(start - 1) != '?') + start = find(start + 1, body.end(), '>'); + + // unterminated PHP swallows rest of document (rather arbitrarily + // but it avoids polluting the database when things go wrong) + if (start != body.end()) ++start; + } else { + // opening or closing tag + int closing = 0; + + if (*start == '/') { + closing = 1; + start = find_if(start + 1, body.end(), p_notwhitespace); + } + + p = start; + start = find_if(start, body.end(), p_nottag); + string tag = body.substr(p - body.begin(), start - p); + // convert tagname to lowercase + for (string::iterator i = tag.begin(); i != tag.end(); i++) + *i = tolower(*i); + + if (closing) { + closing_tag(tag); + + /* ignore any bogus parameters on closing tags */ + p = find(start, body.end(), '>'); + if (p == body.end()) break; + start = p + 1; + } else { + while (start < body.end() && *start != '>') { + string name, value; + + p = find_if(start, body.end(), p_whitespaceeqgt); + + name = body.substr(start - body.begin(), p - start); + + p = find_if(p, body.end(), p_notwhitespace); + + start = p; + if (start != body.end() && *start == '=') { + int quote; + + start = find_if(start + 1, body.end(), p_notwhitespace); + + p = body.end(); + + quote = *start; + if (quote == '"' || quote == '\'') { + start++; + p = find(start, body.end(), quote); + } + + if (p == body.end()) { + // unquoted or no closing quote + p = find_if(start, body.end(), p_whitespacegt); + + value = body.substr(start - body.begin(), p - start); + + start = find_if(p, body.end(), p_notwhitespace); + } else { + value = body.substr(start - body.begin(), p - start); + } + + if (name.size()) { + // convert parameter name to lowercase + string::iterator i; + for (i = name.begin(); i != name.end(); i++) + *i = tolower(*i); + // in case of multiple entries, use the first + // (as Netscape does) + if (Param.find(name) == Param.end()) + Param[name] = value; + } + } + } + opening_tag(tag, Param); + Param.clear(); + + if (start != body.end() && *start == '>') ++start; + } + } + } +} diff --git a/src/internfile/htmlparse.h b/src/internfile/htmlparse.h new file mode 100644 index 00000000..66a1a69f --- /dev/null +++ b/src/internfile/htmlparse.h @@ -0,0 +1,42 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * ----START-LICENCE---- + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2002 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * -----END-LICENCE----- + */ + +#include +#include + +using std::string; +using std::map; + +class HtmlParser { + protected: + void decode_entities(string &s); + static map named_ents; + public: + virtual void process_text(const string &/*text*/) { } + virtual void opening_tag(const string &/*tag*/, + const map &/*p*/) { } + virtual void closing_tag(const string &/*tag*/) { } + virtual void parse_html(const string &text); + HtmlParser(); + virtual ~HtmlParser() { } +};