utf8iter: store a pointer to the source string instead of a reference, so that Utf8Iter becomes copy-assignable

This commit is contained in:
Jean-Francois Dockes 2017-04-21 12:03:21 +02:00
parent 8cf5f8963a
commit 71b4be883c

View File

@ -32,12 +32,12 @@
class Utf8Iter { class Utf8Iter {
public: public:
// Construct an iterator over `in`, with m_pos and m_charpos both starting at 0.
// Neither side of the diff copies the string: the old code binds the
// reference member m_s, the new code stores the address of `in` in m_sp, so
// `in` has to outlive the iterator.
Utf8Iter(const std::string &in) Utf8Iter(const std::string &in)
: m_s(in), m_cl(0), m_pos(0), m_charpos(0) : m_sp(&in), m_cl(0), m_pos(0), m_charpos(0)
{ {
// Compute m_cl (character length at the current position) for position 0.
update_cl(); update_cl();
} }
// Return the string this iterator was constructed over (m_s before the
// change, *m_sp after it). A const reference is returned; no copy is made.
const std::string& buffer() const {return m_s;} const std::string& buffer() const {return (*m_sp);}
void rewind() void rewind()
{ {
@ -59,14 +59,14 @@ public:
mycp = m_charpos; mycp = m_charpos;
} }
int l; int l;
while (mypos < m_s.length() && mycp != charpos) { while (mypos < m_sp->length() && mycp != charpos) {
l = get_cl(mypos); l = get_cl(mypos);
if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l)) if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
return (unsigned int)-1; return (unsigned int)-1;
mypos += l; mypos += l;
++mycp; ++mycp;
} }
if (mypos < m_s.length() && mycp == charpos) { if (mypos < m_sp->length() && mycp == charpos) {
l = get_cl(mypos); l = get_cl(mypos);
if (poslok(mypos, l) && checkvalidat(mypos, l)) if (poslok(mypos, l) && checkvalidat(mypos, l))
return getvalueat(mypos, l); return getvalueat(mypos, l);
@ -106,7 +106,7 @@ public:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(m_cl != 0); assert(m_cl != 0);
#endif #endif
out.append(&m_s[m_pos], m_cl); out.append(&(*m_sp)[m_pos], m_cl);
return m_cl; return m_cl;
} }
@ -115,11 +115,11 @@ public:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(m_cl != 0); assert(m_cl != 0);
#endif #endif
return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string(); return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
} }
// True when the current position m_pos equals the length of the string.
// Only m_pos is compared here; m_cl (the error indicator) is not consulted.
bool eof() const { bool eof() const {
return m_pos == m_s.length(); return m_pos == m_sp->length();
} }
bool error() const { bool error() const {
@ -143,7 +143,7 @@ public:
private: private:
// String we're working with // String we're working with
const std::string& m_s; const std::string* m_sp;
// Character length at current position. A value of zero indicates // Character length at current position. A value of zero indicates
// an error. // an error.
unsigned int m_cl; unsigned int m_cl;
@ -155,9 +155,9 @@ private:
// Check position and cl against string length // Check position and cl against string length
// Returns true when p is not npos, l is positive and the l bytes starting at
// p lie entirely inside the string (p + l <= length). When UTF8ITER_CHECK is
// defined the same condition is asserted first, so a failing check trips the
// assert (unless assert itself is compiled out) instead of returning false.
bool poslok(std::string::size_type p, int l) const { bool poslok(std::string::size_type p, int l) const {
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(p != std::string::npos && l > 0 && p + l <= m_s.length()); assert(p != std::string::npos && l > 0 && p + l <= m_sp->length());
#endif #endif
return p != std::string::npos && l > 0 && p + l <= m_s.length(); return p != std::string::npos && l > 0 && p + l <= m_sp->length();
} }
// Update current char length in object state, check // Update current char length in object state, check
@ -165,13 +165,13 @@ private:
inline void update_cl() inline void update_cl()
{ {
m_cl = 0; m_cl = 0;
if (m_pos >= m_s.length()) if (m_pos >= m_sp->length())
return; return;
m_cl = get_cl(m_pos); m_cl = get_cl(m_pos);
if (!poslok(m_pos, m_cl)) { if (!poslok(m_pos, m_cl)) {
// Used to set eof here for safety, but this is bad because it // Used to set eof here for safety, but this is bad because it
// basically prevents the caller to discriminate error and eof. // basically prevents the caller to discriminate error and eof.
// m_pos = m_s.length(); // m_pos = m_sp->length();
m_cl = 0; m_cl = 0;
return; return;
} }
@ -184,20 +184,20 @@ private:
{ {
switch (l) { switch (l) {
case 1: case 1:
return (unsigned char)m_s[p] < 128; return (unsigned char)(*m_sp)[p] < 128;
case 2: case 2:
return (((unsigned char)m_s[p]) & 224) == 192 return (((unsigned char)(*m_sp)[p]) & 224) == 192
&& (((unsigned char)m_s[p+1]) & 192) == 128; && (((unsigned char)(*m_sp)[p+1]) & 192) == 128;
case 3: case 3:
return (((unsigned char)m_s[p]) & 240) == 224 return (((unsigned char)(*m_sp)[p]) & 240) == 224
&& (((unsigned char)m_s[p+1]) & 192) == 128 && (((unsigned char)(*m_sp)[p+1]) & 192) == 128
&& (((unsigned char)m_s[p+2]) & 192) == 128 && (((unsigned char)(*m_sp)[p+2]) & 192) == 128
; ;
case 4: case 4:
return (((unsigned char)m_s[p]) & 248) == 240 return (((unsigned char)(*m_sp)[p]) & 248) == 240
&& (((unsigned char)m_s[p+1]) & 192) == 128 && (((unsigned char)(*m_sp)[p+1]) & 192) == 128
&& (((unsigned char)m_s[p+2]) & 192) == 128 && (((unsigned char)(*m_sp)[p+2]) & 192) == 128
&& (((unsigned char)m_s[p+3]) & 192) == 128 && (((unsigned char)(*m_sp)[p+3]) & 192) == 128
; ;
default: default:
return false; return false;
@ -207,7 +207,7 @@ private:
// Get character byte length at specified position. Returns 0 for error. // Get character byte length at specified position. Returns 0 for error.
inline int get_cl(std::string::size_type p) const inline int get_cl(std::string::size_type p) const
{ {
unsigned int z = (unsigned char)m_s[p]; unsigned int z = (unsigned char)(*m_sp)[p];
if (z <= 127) { if (z <= 127) {
return 1; return 1;
} else if ((z & 224) == 192) { } else if ((z & 224) == 192) {
@ -230,44 +230,44 @@ private:
switch (l) { switch (l) {
case 1: case 1:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert((unsigned char)m_s[p] < 128); assert((unsigned char)(*m_sp)[p] < 128);
#endif #endif
return (unsigned char)m_s[p]; return (unsigned char)(*m_sp)[p];
case 2: case 2:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert( assert(
((unsigned char)m_s[p] & 224) == 192 ((unsigned char)(*m_sp)[p] & 224) == 192
&& ((unsigned char)m_s[p+1] & 192) == 128 && ((unsigned char)(*m_sp)[p+1] & 192) == 128
); );
#endif #endif
return ((unsigned char)m_s[p] - 192) * 64 + return ((unsigned char)(*m_sp)[p] - 192) * 64 +
(unsigned char)m_s[p+1] - 128 ; (unsigned char)(*m_sp)[p+1] - 128 ;
case 3: case 3:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert( assert(
(((unsigned char)m_s[p]) & 240) == 224 (((unsigned char)(*m_sp)[p]) & 240) == 224
&& (((unsigned char)m_s[p+1]) & 192) == 128 && (((unsigned char)(*m_sp)[p+1]) & 192) == 128
&& (((unsigned char)m_s[p+2]) & 192) == 128 && (((unsigned char)(*m_sp)[p+2]) & 192) == 128
); );
#endif #endif
return ((unsigned char)m_s[p] - 224) * 4096 + return ((unsigned char)(*m_sp)[p] - 224) * 4096 +
((unsigned char)m_s[p+1] - 128) * 64 + ((unsigned char)(*m_sp)[p+1] - 128) * 64 +
(unsigned char)m_s[p+2] - 128; (unsigned char)(*m_sp)[p+2] - 128;
case 4: case 4:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert( assert(
(((unsigned char)m_s[p]) & 248) == 240 (((unsigned char)(*m_sp)[p]) & 248) == 240
&& (((unsigned char)m_s[p+1]) & 192) == 128 && (((unsigned char)(*m_sp)[p+1]) & 192) == 128
&& (((unsigned char)m_s[p+2]) & 192) == 128 && (((unsigned char)(*m_sp)[p+2]) & 192) == 128
&& (((unsigned char)m_s[p+3]) & 192) == 128 && (((unsigned char)(*m_sp)[p+3]) & 192) == 128
); );
#endif #endif
return ((unsigned char)m_s[p]-240)*262144 + return ((unsigned char)(*m_sp)[p]-240)*262144 +
((unsigned char)m_s[p+1]-128)*4096 + ((unsigned char)(*m_sp)[p+1]-128)*4096 +
((unsigned char)m_s[p+2]-128)*64 + ((unsigned char)(*m_sp)[p+2]-128)*64 +
(unsigned char)m_s[p+3]-128; (unsigned char)(*m_sp)[p+3]-128;
default: default:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK