recoll/src/utils/picoxml.h
Jean-Francois Dockes 9b0ec434b4 Small win warning fix
2020-08-22 09:28:46 +01:00

443 lines
14 KiB
C++

/* Copyright (C) 2016 J.F.Dockes
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* (1) Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* (2) Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* (3)The name of the author may not be used to
* endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**********************************************************/
#ifndef _PICOXML_H_INCLUDED_
#define _PICOXML_H_INCLUDED_
/**
* PicoXMLParser: a single include file parser for an XML-like, but
* restricted language, adequate for config files, not for arbitrary
* externally generated data.
*
* - The code depends on nothing but the C++ standard library
* - The input to the parser is a single c++ string. Does not deal with
* input in several pieces or files.
* - SAX mode only. You have access to the tag stack. I've always
* found DOM mode less usable.
* - Checks for proper tag nesting and not much else.
* - ! No CDATA
* - ! Attribute values must not contain XML special chars: they are not
* entity-decoded, and a '>' inside a value would end the tag early.
*
* A typical input would be like the following (you can add XML
* declarations, whitespace and newlines to taste).
*
* <top>top chrs1<sub attr="attrval">sub chrs</sub>top chrs2 <emptyelt /></top>
*
* Usage: subclass PicoXMLParser, overriding the methods in the
* "protected:" section (look there for more details), call the
* constructor with your input, then call parse().
*/
#include <string>
#include <vector>
#include <map>
#include <sstream>
#include <iostream>
#include <algorithm>
// Expat compat
typedef char XML_Char;

/**
 * Minimal SAX-style parser for a restricted XML-like language.
 *
 * Derive from this class, override the handlers in the "protected:"
 * section, construct the object with the document text, then call
 * parse(). On failure, getReason() returns a description of the problem.
 */
class PicoXMLParser {
public:
    /**
     * @param input the document text. Only a reference is stored: the
     *   string must stay alive (and unchanged) until parsing is done. Do
     *   not pass a temporary.
     */
    PicoXMLParser(const std::string& input)
        : m_in(input), m_pos(0) {}

    virtual ~PicoXMLParser() {}

    /** Parse the whole input. @return true on success, false on error. */
    virtual bool parse() {
        return _parse();
    }

    /** Expatmm-style spelling of parse(). */
    virtual bool Parse() {
        return _parse();
    }

    /** @return the error text accumulated while parsing. */
    virtual std::string getReason() {
        return m_reason.str();
    }

protected:
    /* Methods to be overridden */

    /**
     * Tag open handler.
     * @param tagname the tag name
     * @param attrs a map of attribute name/value pairs
     */
    virtual void startElement(
        const std::string& /* nm */,
        const std::map<std::string, std::string>& /* attrs */) {}

    /** Expatmm compat. We don't support attributes with this at the moment */
    virtual void StartElement(const XML_Char *, const XML_Char **) {}

    /**
     * Tag close handler.
     * You should probably have been accumulating text and stuff since
     * the tag opening.
     * @param tagname the tag name.
     */
    virtual void endElement(const std::string& /* nm */) {}

    /** Expatmm compat */
    virtual void EndElement(const XML_Char * /* nm */) {}

    /**
     * Non-tag data handler.
     * @param data the data, with the five predefined entities decoded.
     */
    virtual void characterData(const std::string& /*data*/) {}

    /** Expatmm compat */
    virtual void CharacterData(const XML_Char *, int) {}

    /**
     * Return current tag name stack. Deprecated, use m_path.
     * This does not include the current (bottom) tag.
     * Attributes are not kept in there, you'll have to do this yourself.
     * @return a const ref to a vector of tag names.
     */
    virtual const std::vector<std::string>& tagStack() {
        return m_tagstack;
    }

    /**
     * Current element stack, including the bottom one
     * Each entry includes the attributes and the starting character offset.
     * The stack includes the last element (the one open is called for).
     */
    class StackEl {
    public:
        StackEl(const std::string& nm) : name(nm) {}
        std::string name;
        // Input offset just after the '>' of the opening tag. Initialized
        // so that a StackEl built outside the parser never holds garbage.
        std::string::size_type start_index{0};
        std::map<std::string,std::string> attributes;
        std::string data; // Derived class usage
    };
    std::vector<StackEl> m_path;

private:
    const std::string& m_in;          // Input text (not owned)
    std::string::size_type m_pos{0};  // Current offset in m_in, npos at EOF
    std::stringstream m_reason;       // Error message accumulator
    std::vector<std::string> m_tagstack;

    /**
     * Record an element opening and call the open handlers. For an empty
     * element (<x/>) the close handlers are called right away.
     */
    void _startelem(const std::string& tagname,
                    const std::map<std::string, std::string>& attrs,
                    bool empty)
    {
        m_path.push_back(StackEl(tagname));
        StackEl& lastelt = m_path.back();
        lastelt.start_index = m_pos;
        lastelt.attributes = attrs;
        startElement(tagname, attrs);
        StartElement(tagname.c_str(), nullptr);
        // Pushed after the handlers run: tagStack() excludes the current tag
        m_tagstack.push_back(tagname); // Compat
        if (empty) {
            _endelem(tagname);
        }
    }

    /** Call the close handlers and pop the element from both stacks. */
    void _endelem(const std::string& tagname)
    {
        m_tagstack.pop_back();
        endElement(tagname);
        EndElement(tagname.c_str());
        m_path.pop_back();
    }

    /** Main loop: alternates between one tag and the text following it. */
    bool _parse() {
        // skip initial whitespace and XML decl. On success, returns with
        // current pos on first tag '<'
        if (!skipDecl()) {
            return false;
        }
        if (nomore()) {
            // empty file
            return true;
        }

        for (;;) {
            // Current char is '<' and the next char is not '?'
            // skipComment also processes the text following the comments
            if (!skipComment()) {
                return false;
            }
            if (nomore()) {
                if (!m_tagstack.empty()) {
                    m_reason << "EOF hit inside open element";
                    return false;
                }
                return true;
            }
            m_pos++;
            if (nomore()) {
                m_reason << "EOF within tag";
                return false;
            }
            // spos: first char after '<'
            const std::string::size_type spos = m_pos;
            const int isendtag = m_in[m_pos] == '/' ? 1 : 0;

            // Move m_pos just past the closing '>'
            skipStr(">");
            if (m_pos == std::string::npos || m_pos <= spos + 1) {
                m_reason << "Empty tag or EOF inside tag. pos " << spos;
                return false;
            }

            const int emptyel = m_in[m_pos-2] == '/' ? 1 : 0;
            if (emptyel && isendtag) {
                m_reason << "Bad tag </xx/> at cpos " << spos;
                return false;
            }

            // Tag text without the surrounding "<", "</", ">", "/>"
            std::string tag =
                m_in.substr(spos + isendtag,
                            m_pos - (spos + 1 + isendtag + emptyel));
            trimtag(tag);
            std::map<std::string, std::string> attrs;
            if (!parseattrs(tag, attrs)) {
                return false;
            }
            if (isendtag) {
                if (m_tagstack.empty() || tag.compare(m_tagstack.back())) {
                    m_reason << "Closing not open tag " << tag <<
                        " at cpos " << m_pos;
                    return false;
                }
                _endelem(tag);
            } else {
                _startelem(tag, attrs, emptyel != 0);
            }
            if (!_chardata()) {
                return false;
            }
        }
        return false;
    }

    /**
     * Advance to the next '<', handing any text in between to the
     * character data handlers. Text after the last tag is not reported.
     * @return always true.
     */
    bool _chardata() {
        const std::string::size_type spos = m_pos;
        m_pos = m_in.find('<', m_pos);
        if (nomore()) {
            return true;
        }
        if (m_pos != spos) {
            std::string data{unQuote(m_in.substr(spos, m_pos - spos))};
            characterData(data);
            CharacterData(data.c_str(), static_cast<int>(data.size()));
        }
        return true;
    }

    /**
     * @return true if fewer than sz+1 characters remain at the current
     *   position (or if the position is npos).
     */
    bool nomore(int sz = 0) const {
        if (m_pos == std::string::npos) {
            return true;
        }
        const std::string::size_type need =
            static_cast<std::string::size_type>(sz);
        // Test the size first: the unsigned subtraction would wrap around
        // for an input shorter than sz.
        return m_in.size() <= need || m_pos >= m_in.size() - need;
    }

    /**
     * Move pos to the next non-whitespace character of in.
     * @return false if there is none (pos is then npos).
     */
    bool skipWS(const std::string& in, std::string::size_type& pos) {
        if (pos == std::string::npos)
            return false;
        pos = in.find_first_not_of(" \t\n\r", pos);
        return pos != std::string::npos;
    }

    /**
     * Move the current position just past the next occurrence of str.
     * @return false if str is not found (position is then npos).
     */
    bool skipStr(const std::string& str) {
        if (m_pos == std::string::npos)
            return false;
        m_pos = m_in.find(str, m_pos);
        if (m_pos != std::string::npos)
            m_pos += str.size();
        return m_pos != std::string::npos;
    }

    /**
     * Look at the character sz+1 positions after the current one.
     * @return the character, or -1 if it would be past the end of input.
     */
    int peek(int sz = 0) const {
        if (nomore(sz))
            return -1;
        const std::string::size_type idx = m_pos + 1 + sz;
        if (idx >= m_in.size())
            return -1;
        return m_in[idx];
    }

    /** Remove trailing whitespace (an all-whitespace string is kept). */
    void trimtag(std::string& tagname) {
        std::string::size_type trimpos = tagname.find_last_not_of(" \t\n\r");
        if (trimpos != std::string::npos) {
            tagname = tagname.substr(0, trimpos+1);
        }
    }

    /**
     * Skip leading whitespace and any "<? ... ?>" declarations.
     * @return false if the first non-space character is not '<', or if a
     *   declaration is not terminated.
     */
    bool skipDecl() {
        for (;;) {
            if (!skipWS(m_in, m_pos)) {
                // Nothing but whitespace: not an error for the caller
                m_reason << "EOF during initial ws skip";
                return true;
            }
            if (m_in[m_pos] != '<') {
                m_reason << "EOF file does not begin with decl/tag: m_pos " <<
                    m_pos << " char [" << m_in[m_pos] << "]\n";
                return false;
            }
            if (peek() == '?') {
                if (!skipStr("?>")) {
                    m_reason << "EOF while looking for end of xml decl";
                    return false;
                }
            } else {
                break;
            }
        }
        return true;
    }

    /**
     * Skip any run of comments at the current position, reporting the
     * text found after each one. Must be called at EOF or on a '<'.
     * On return the position is at EOF or on the '<' of a non-comment tag.
     * @return false on unterminated comment or bad start position.
     */
    bool skipComment() {
        // Loop so that adjacent comments are all consumed: a single pass
        // would leave the second one to be misparsed as an element.
        while (!nomore()) {
            if (m_in[m_pos] != '<') {
                m_reason << "Internal error: skipComment called with wrong "
                    "start: m_pos " <<
                    m_pos << " char [" << m_in[m_pos] << "]\n";
                return false;
            }
            if (!(peek() == '!' && peek(1) == '-' && peek(2) == '-')) {
                // Regular tag: let the caller process it
                return true;
            }
            if (!skipStr("-->")) {
                m_reason << "EOF while looking for end of XML comment";
                return false;
            }
            // Process possible characters until next tag
            if (!_chardata()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Split the text of a tag into its name and attributes.
     * @param[in,out] tag on input the trimmed tag text, on successful
     *   output the tag name only (unchanged if there are no attributes).
     * @param[out] attrs the attribute name/value pairs. Values are taken
     *   verbatim between the quotes (no entity decoding).
     * @return false on syntax error, with the reason recorded.
     */
    bool parseattrs(std::string& tag,
                    std::map<std::string, std::string>& attrs) {
        attrs.clear();
        std::string::size_type spos = tag.find_first_of(" \t\n\r");
        if (spos == std::string::npos)
            return true;
        std::string tagname = tag.substr(0, spos);
        skipWS(tag, spos);

        for (;;) {
            // Attribute name: up to whitespace or '='
            std::string::size_type epos = tag.find_first_of(" \t\n\r=", spos);
            if (epos == std::string::npos) {
                m_reason << "Bad attributes syntax at cpos " << m_pos + epos;
                return false;
            }
            std::string attrnm = tag.substr(spos, epos - spos);
            if (attrnm.empty()) {
                m_reason << "Empty attribute name ?? at cpos " << m_pos + epos;
                return false;
            }

            // Equal sign, which can't be the last character
            skipWS(tag, epos);
            if (epos == std::string::npos || epos == tag.size() - 1 ||
                tag[epos] != '=') {
                m_reason <<"Missing equal sign or value at cpos " << m_pos+epos;
                return false;
            }
            epos++;

            // Opening quote, single or double
            skipWS(tag, epos);
            if (epos == std::string::npos || epos == tag.size() - 1 ||
                (tag[epos] != '"' && tag[epos] != '\'')) {
                m_reason << "Missing quote or value at cpos " << m_pos+epos;
                return false;
            }
            const char qc = tag[epos];

            // Value: up to the matching quote
            spos = epos + 1;
            epos = tag.find_first_of(qc, spos);
            if (epos == std::string::npos) {
                m_reason << "Missing closing quote at cpos " << m_pos+spos;
                return false;
            }
            attrs[attrnm] = tag.substr(spos, epos - spos);

            if (epos == tag.size() - 1) {
                break;
            }
            epos++;
            skipWS(tag, epos);
            // Historical behaviour: a single stray character left at the
            // very end of the tag is silently ignored.
            if (epos == std::string::npos || epos == tag.size() - 1) {
                break;
            }
            spos = epos;
        }
        tag = tagname;
        return true;
    }

    /**
     * Decode the five predefined entities (quot, amp, apos, lt, gt).
     * Unknown entities are dropped. An '&' with no following ';' ends
     * the decoding: the rest of the input is dropped.
     */
    std::string unQuote(const std::string &s) {
        static const std::string e_quot{"quot"};
        static const std::string e_amp{"amp"};
        static const std::string e_apos{"apos"};
        static const std::string e_lt{"lt"};
        static const std::string e_gt{"gt"};
        std::string out;
        out.reserve(s.size());
        std::string::const_iterator it = s.begin();
        while (it != s.end()) {
            if (*it != '&') {
                out += *it;
                ++it;
                continue;
            }
            // Skip the '&' and collect the entity name up to ';'
            ++it;
            std::string code;
            while (it != s.end() && *it != ';') {
                code += *it;
                ++it;
            }
            if (it == s.end()) {
                // Unterminated entity
                break;
            }
            ++it;
            if (code == e_quot) {
                out += '"';
            } else if (code == e_amp) {
                out += '&';
            } else if (code == e_apos) {
                out += '\'';
            } else if (code == e_lt) {
                out += '<';
            } else if (code == e_gt) {
                out += '>';
            }
        }
        return out;
    }
};
#endif /* _PICOXML_H_INCLUDED_ */