/* Copyright (C) 2004 J.F.Dockes
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
|
|
#ifndef _UTF8ITER_H_INCLUDED_
|
|
#define _UTF8ITER_H_INCLUDED_
|
|
|
|
#ifdef UTF8ITER_CHECK
|
|
#include "assert.h"
|
|
#endif
|
|
#include <string>
|
|
|
|
/**
 * A small helper class to iterate over UTF-8 strings. This is not an
 * STL iterator and it does not do much error checking. It is designed purely
 * for recoll usage, where the UTF-8 string comes out of iconv in most cases
 * and is assumed to be legal. We just try to catch the cases where there
 * would be a risk of a crash.
 */
|
|
// Forward cursor over the UTF-8 characters of a std::string.
//
// Only a reference to the string is stored, and the byte length of the
// current character is cached and not re-validated on each access: the
// string must outlive the iterator and should not be modified while the
// iterator is in use.
//
// Validation is structural only (lead byte pattern, continuation byte
// patterns, no read past the end of the string). Overlong forms, UTF-16
// surrogates and values above U+10FFFF are NOT rejected.
//
// Defining UTF8ITER_CHECK turns the internal consistency checks into
// assert() calls.
class Utf8Iter {
public:
    /** Bind to 'in' (not copied) and position on its first character. */
    Utf8Iter(const std::string &in)
        : m_s(in), m_cl(0), m_pos(0), m_charpos(0)
    {
        update_cl();
    }

    /** The string being iterated over. */
    const std::string& buffer() const {return m_s;}

    /** Go back to the first character of the string. */
    void rewind()
    {
        m_cl = 0;
        m_pos = 0;
        m_charpos = 0;
        update_cl();
    }

    /** "Direct" access. Awfully inefficient as we skip from start or current
     *  position at best. This can only be useful for a lookahead from the
     *  current position.
     *  @param charpos character (not byte) index of the wanted character.
     *  @return the code point value, or (unsigned int)-1 if charpos is past
     *    the end of the string or if an invalid sequence is met on the way.
     *    The iterator position is not changed. */
    unsigned int operator[](unsigned int charpos) const
    {
        // Walk from the current position when looking ahead, else restart
        // from the beginning of the string.
        std::string::size_type mypos = 0;
        unsigned int mycp = 0;
        if (charpos >= m_charpos) {
            mypos = m_pos;
            mycp = m_charpos;
        }

        int l;
        while (mypos < m_s.length() && mycp != charpos) {
            l = get_cl(mypos);
            if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
                return static_cast<unsigned int>(-1);
            mypos += l;
            ++mycp;
        }

        if (mypos < m_s.length() && mycp == charpos) {
            l = get_cl(mypos);
            if (poslok(mypos, l) && checkvalidat(mypos, l))
                return getvalueat(mypos, l);
        }
        return static_cast<unsigned int>(-1);
    }

    /** Increment current position to next utf-8 char.
     *  @return the new byte offset, or std::string::npos (position
     *    unchanged) if there is no valid current character, which is the
     *    case both at end of string and after an error. */
    std::string::size_type operator++(int)
    {
        // Note: m_cl may be zero at eof if user's test not right
        // this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        if (m_cl == 0)
            return std::string::npos;

        m_pos += m_cl;
        m_charpos++;
        update_cl();
        return m_pos;
    }

    /** operator* returns the ucs4 value as a machine integer, or
     *  (unsigned int)-1 if there is no valid current character.
     *  Const: it only reads the iterator state. */
    unsigned int operator*() const
    {
#ifdef UTF8ITER_CHECK
        assert(m_cl > 0);
#endif
        return m_cl == 0 ? static_cast<unsigned int>(-1) :
            getvalueat(m_pos, m_cl);
    }

    /** Append current utf-8 possibly multi-byte character to string param.
     *  This needs to be fast. No error checking.
     *  @return the number of bytes appended (0 if there is no valid
     *    current character). */
    unsigned int appendchartostring(std::string &out) const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        out.append(&m_s[m_pos], m_cl);
        return m_cl;
    }

    /** Return current character as string (empty if there is no valid
     *  current character). Const: it only reads the iterator state. */
    operator std::string() const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
    }

    /** True when the current byte offset is the end of the string. */
    bool eof() const {
        return m_pos == m_s.length();
    }

    /** True when there is no valid current character. Note that this is
     *  also the case at end of string: test eof() to tell them apart. */
    bool error() const {
        return m_cl == 0;
    }

    /** Return current byte offset in input string */
    std::string::size_type getBpos() const {
        return m_pos;
    }

    /** Return current character length in bytes (0: no valid character) */
    std::string::size_type getBlen() const {
        return m_cl;
    }

    /** Return current unicode character offset in input string */
    std::string::size_type getCpos() const {
        return m_charpos;
    }

private:
    // String we're working with
    const std::string& m_s;
    // Character length at current position. A value of zero indicates
    // an error (or end of string).
    unsigned int m_cl;
    // Current byte offset in string.
    std::string::size_type m_pos;
    // Current character position
    unsigned int m_charpos;

    // Byte at offset p, as an unsigned value (plain char may be signed).
    unsigned int byteat(std::string::size_type p) const {
        return static_cast<unsigned char>(m_s[p]);
    }

    // Check position and cl against string length: the l bytes starting
    // at p must all be inside the string.
    bool poslok(std::string::size_type p, int l) const {
#ifdef UTF8ITER_CHECK
        assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
#endif
        return p != std::string::npos && l > 0 && p + l <= m_s.length();
    }

    // Update current char length in object state, check
    // for errors. Leaves m_cl at 0 at end of string or on error.
    void update_cl()
    {
        m_cl = 0;
        if (m_pos >= m_s.length())
            return;
        m_cl = get_cl(m_pos);
        if (!poslok(m_pos, m_cl)) {
            // Used to set eof here for safety, but this is bad because it
            // basically prevents the caller to discriminate error and eof.
            // m_pos = m_s.length();
            m_cl = 0;
            return;
        }
        if (!checkvalidat(m_pos, m_cl)) {
            m_cl = 0;
        }
    }

    // Check that the l bytes at p have the bit patterns of an l-byte
    // sequence: lead byte matching the length, then bytes of the form
    // 10xxxxxx. The caller must have checked that p+l is inside the string.
    bool checkvalidat(std::string::size_type p, int l) const
    {
        switch (l) {
        case 1:
            return byteat(p) < 0x80;
        case 2:
            return (byteat(p) & 0xe0) == 0xc0
                && (byteat(p+1) & 0xc0) == 0x80;
        case 3:
            return (byteat(p) & 0xf0) == 0xe0
                && (byteat(p+1) & 0xc0) == 0x80
                && (byteat(p+2) & 0xc0) == 0x80;
        case 4:
            return (byteat(p) & 0xf8) == 0xf0
                && (byteat(p+1) & 0xc0) == 0x80
                && (byteat(p+2) & 0xc0) == 0x80
                && (byteat(p+3) & 0xc0) == 0x80;
        default:
            return false;
        }
    }

    // Get character byte length at specified position, from the lead byte
    // only. Returns 0 for error (byte which can't start a sequence).
    int get_cl(std::string::size_type p) const
    {
        const unsigned int z = byteat(p);
        if (z <= 127) {
            return 1;
        } else if ((z & 0xe0) == 0xc0) {
            return 2;
        } else if ((z & 0xf0) == 0xe0) {
            return 3;
        } else if ((z & 0xf8) == 0xf0) {
            return 4;
        }
#ifdef UTF8ITER_CHECK
        // Only reached for an invalid lead byte: this always fires.
        assert(z <= 127 || (z & 0xe0) == 0xc0 || (z & 0xf0) == 0xe0 ||
               (z & 0xf8) == 0xf0);
#endif
        return 0;
    }

    // Compute value at given position. No error checking: the caller must
    // have validated the sequence with poslok() and checkvalidat().
    // Returns (unsigned int)-1 if l is not in 1..4.
    unsigned int getvalueat(std::string::size_type p, int l) const
    {
#ifdef UTF8ITER_CHECK
        assert(l <= 4);
        assert(l < 1 || checkvalidat(p, l));
#endif
        // Keep the payload bits of each byte and concatenate them.
        switch (l) {
        case 1:
            return byteat(p);
        case 2:
            return ((byteat(p) & 0x1f) << 6)
                | (byteat(p+1) & 0x3f);
        case 3:
            return ((byteat(p) & 0x0f) << 12)
                | ((byteat(p+1) & 0x3f) << 6)
                | (byteat(p+2) & 0x3f);
        case 4:
            return ((byteat(p) & 0x07) << 18)
                | ((byteat(p+1) & 0x3f) << 12)
                | ((byteat(p+2) & 0x3f) << 6)
                | (byteat(p+3) & 0x3f);
        default:
            return static_cast<unsigned int>(-1);
        }
    }
};
|
|
|
|
|
|
#endif /* _UTF8ITER_H_INCLUDED_ */
|