/* Copyright (C) 2004 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_

#ifdef UTF8ITER_CHECK
#include "assert.h"
#endif
#include <string>

/**
 * A small helper class to iterate over UTF-8 strings. This is not an
 * STL iterator and it does not do much error checking. It is designed
 * purely for recoll usage, where the UTF-8 string comes out of iconv in
 * most cases and is assumed legal. We just try to catch the cases where
 * there would be a risk of crash.
 *
 * Validation is structural only: the lead byte must announce a length of
 * 1 to 4 bytes, the sequence must fit inside the string, and every
 * trailing byte must have the 10xxxxxx form. Overlong encodings,
 * surrogate values and values above U+10FFFF are not rejected.
 *
 * The iterator stores a reference to the input string, not a copy: the
 * string must stay alive and unmodified for as long as the iterator is
 * used.
 *
 * If UTF8ITER_CHECK is defined at compile time, the error conditions
 * which are normally reported through return values trigger assert()
 * instead.
 */
class Utf8Iter {
public:
    /** Start iterating at the first character of @p in. */
    Utf8Iter(const std::string &in)
        : m_s(in), m_cl(0), m_pos(0), m_charpos(0) {
        update_cl();
    }

    /** Return the string being iterated over. */
    const std::string& buffer() const {
        return m_s;
    }

    /** Go back to the first character of the string. */
    void rewind() {
        m_cl = 0;
        m_pos = 0;
        m_charpos = 0;
        update_cl();
    }

    /** "Direct" access. Awfully inefficient as we skip from the start or
     *  from the current position at best. This can only be useful for a
     *  lookahead from the current position.
     *
     *  @param charpos character (not byte) index into the string.
     *  @return the character value, or (unsigned int)-1 if @p charpos is
     *     past the end or if a malformed sequence is met on the way.
     */
    unsigned int operator[](unsigned int charpos) const {
        std::string::size_type bytepos = 0;
        unsigned int curchar = 0;
        // Walking forward from the current position is cheaper than
        // restarting from the beginning, when the target allows it.
        if (charpos >= m_charpos) {
            bytepos = m_pos;
            curchar = m_charpos;
        }

        while (bytepos < m_s.length() && curchar != charpos) {
            const int len = get_cl(bytepos);
            if (len <= 0 || !poslok(bytepos, len) ||
                !checkvalidat(bytepos, len)) {
                return static_cast<unsigned int>(-1);
            }
            bytepos += len;
            ++curchar;
        }

        if (bytepos < m_s.length() && curchar == charpos) {
            const int len = get_cl(bytepos);
            if (poslok(bytepos, len) && checkvalidat(bytepos, len)) {
                return getvalueat(bytepos, len);
            }
        }
        return static_cast<unsigned int>(-1);
    }

    /** Increment current position to next utf-8 char.
     *
     *  @return the new byte offset, or std::string::npos, without moving,
     *     if there is no valid current character (error or end of string).
     */
    std::string::size_type operator++(int) {
        // Note: m_cl may be zero at eof if user's test not right
        // this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        if (m_cl == 0) {
            return std::string::npos;
        }

        m_pos += m_cl;
        m_charpos++;
        update_cl();
        return m_pos;
    }

    /** operator* returns the ucs4 value as a machine integer, or
     *  (unsigned int)-1 if there is no valid current character. */
    unsigned int operator*() const {
#ifdef UTF8ITER_CHECK
        assert(m_cl > 0);
#endif
        if (m_cl == 0) {
            return static_cast<unsigned int>(-1);
        }
        return getvalueat(m_pos, m_cl);
    }

    /** Append current utf-8 possibly multi-byte character to string param.
     *  This needs to be fast. No error checking.
     *
     *  @return the number of bytes appended (0 if there is no valid
     *     current character).
     */
    unsigned int appendchartostring(std::string &out) const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        out.append(&m_s[m_pos], m_cl);
        return m_cl;
    }

    /** Return current character as string (empty if there is no valid
     *  current character). */
    operator std::string() const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        if (m_cl == 0) {
            return std::string();
        }
        return m_s.substr(m_pos, m_cl);
    }

    /** True when the current position is the end of the string. */
    bool eof() const {
        return m_pos == m_s.length();
    }

    /** True when there is no valid character at the current position.
     *  The current length is also zero at the end of the string, so this
     *  is true at eof too: test eof() first to tell the two apart. */
    bool error() const {
        return m_cl == 0;
    }

    /** Return current byte offset in input string */
    std::string::size_type getBpos() const {
        return m_pos;
    }

    /** Return current character length */
    std::string::size_type getBlen() const {
        return m_cl;
    }

    /** Return current unicode character offset in input string */
    std::string::size_type getCpos() const {
        return m_charpos;
    }

private:
    // String we're working with
    const std::string& m_s;
    // Character length at current position. A value of zero indicates
    // an error.
    unsigned int m_cl;
    // Current byte offset in string.
    std::string::size_type m_pos;
    // Current character position
    unsigned int m_charpos;

    // Byte at offset p, as an unsigned value so that the bit tests and
    // the arithmetic below do not depend on the signedness of char.
    unsigned char byteAt(std::string::size_type p) const {
        return static_cast<unsigned char>(m_s[p]);
    }

    // True if byte at offset p has the 10xxxxxx continuation byte form.
    bool iscontat(std::string::size_type p) const {
        return (byteAt(p) & 192) == 128;
    }

    // Check position and cl against string length
    bool poslok(std::string::size_type p, int l) const {
#ifdef UTF8ITER_CHECK
        assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
#endif
        return p != std::string::npos && l > 0 && p + l <= m_s.length();
    }

    // Update current char length in object state, check
    // for errors
    void update_cl() {
        m_cl = 0;
        if (m_pos >= m_s.length()) {
            return;
        }
        m_cl = get_cl(m_pos);
        if (!poslok(m_pos, m_cl)) {
            // Used to set eof here for safety, but this is bad because it
            // basically prevents the caller to discriminate error and eof.
            //  m_pos = m_s.length();
            m_cl = 0;
            return;
        }
        if (!checkvalidat(m_pos, m_cl)) {
            m_cl = 0;
        }
    }

    // Check that the l bytes at offset p look like one utf-8 character:
    // lead byte consistent with l, all other bytes continuation bytes.
    // The caller must have checked that p + l fits in the string.
    bool checkvalidat(std::string::size_type p, int l) const {
        switch (l) {
        case 1:
            return byteAt(p) < 128;
        case 2:
            return (byteAt(p) & 224) == 192 && iscontat(p + 1);
        case 3:
            return (byteAt(p) & 240) == 224 && iscontat(p + 1) &&
                iscontat(p + 2);
        case 4:
            return (byteAt(p) & 248) == 240 && iscontat(p + 1) &&
                iscontat(p + 2) && iscontat(p + 3);
        default:
            return false;
        }
    }

    // Get character byte length at specified position. Returns 0 for error.
    // Only the lead byte is looked at.
    int get_cl(std::string::size_type p) const {
        const unsigned int z = byteAt(p);
        if (z <= 127) {
            return 1;
        } else if ((z & 224) == 192) {
            return 2;
        } else if ((z & 240) == 224) {
            return 3;
        } else if ((z & 248) == 240) {
            return 4;
        }
#ifdef UTF8ITER_CHECK
        assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
               (z & 248) == 240);
#endif
        return 0;
    }

    // Compute value at given position. No error checking: the sequence
    // must have been validated with poslok() and checkvalidat().
    // Returns (unsigned int)-1 if l is not in the 1..4 range.
    unsigned int getvalueat(std::string::size_type p, int l) const {
        switch (l) {
        case 1:
#ifdef UTF8ITER_CHECK
            assert(byteAt(p) < 128);
#endif
            return byteAt(p);
        case 2:
#ifdef UTF8ITER_CHECK
            assert((byteAt(p) & 224) == 192 && iscontat(p + 1));
#endif
            return (byteAt(p) - 192) * 64 +
                byteAt(p + 1) - 128;
        case 3:
#ifdef UTF8ITER_CHECK
            assert((byteAt(p) & 240) == 224 && iscontat(p + 1) &&
                   iscontat(p + 2));
#endif
            return (byteAt(p) - 224) * 4096 +
                (byteAt(p + 1) - 128) * 64 +
                byteAt(p + 2) - 128;
        case 4:
#ifdef UTF8ITER_CHECK
            assert((byteAt(p) & 248) == 240 && iscontat(p + 1) &&
                   iscontat(p + 2) && iscontat(p + 3));
#endif
            return (byteAt(p) - 240) * 262144 +
                (byteAt(p + 1) - 128) * 4096 +
                (byteAt(p + 2) - 128) * 64 +
                byteAt(p + 3) - 128;
        default:
#ifdef UTF8ITER_CHECK
            assert(l <= 4);
#endif
            return static_cast<unsigned int>(-1);
        }
    }
};

#endif /* _UTF8ITER_H_INCLUDED_ */