From a573fbd1a9c8ad1a27a81aa6eb2eeab866db5082 Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 20 Nov 2006 11:16:54 +0000 Subject: [PATCH] cleaned and speeded up utf8iter --- src/utils/Makefile | 4 +- src/utils/utf8iter.cpp | 54 +++++++- src/utils/utf8iter.h | 302 +++++++++++++++++++++++++---------------- 3 files changed, 237 insertions(+), 123 deletions(-) diff --git a/src/utils/Makefile b/src/utils/Makefile index ebe089c4..7307aef8 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -65,9 +65,9 @@ trwipedir.o : wipedir.cpp $(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \ wipedir.cpp -UTF8ITER_OBJS= trutf8iter.o $(BIGLIB) +UTF8ITER_OBJS= trutf8iter.o utf8iter : $(UTF8ITER_OBJS) - $(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV) + $(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(BIGLIB) $(LIBICONV) trutf8iter.o : utf8iter.cpp utf8iter.h $(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \ utf8iter.cpp diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp index 0ba6a672..9efb3390 100644 --- a/src/utils/utf8iter.cpp +++ b/src/utils/utf8iter.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.5 2006-11-20 11:16:54 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -22,16 +22,19 @@ static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp #include #include #include + + #include "debuglog.h" +#include "transcode.h" + #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ +#define UTF8ITER_CHECK #include "utf8iter.h" #include "readfile.h" - - int main(int argc, char **argv) { if (argc != 3) { @@ -54,6 +57,7 @@ int main(int argc, char **argv) fprintf(stderr, "cant create %s\n", outfile); exit(1); } + int nchars = 0; for (;!it.eof(); it++) { unsigned int value = *it; @@ -61,15 +65,24 @@ int main(int argc, char **argv) fprintf(stderr, "Conversion error occurred\n"); exit(1); } + // UTF-32LE or BE array ucsout1.push_back(value); + // UTF-32LE or BE file fwrite(&value, 4, 1, fp); + + // Reconstructed utf8 strings (2 methods) if (!it.appendchartostring(out)) break; + // conversion to string out1 += it; + + // fprintf(stderr, "%s", string(it).c_str()); nchars++; } - fprintf(stderr, "nchars1 %d\n", nchars); - if (in != out) { + fclose(fp); + + fprintf(stderr, "nchars %d\n", nchars); + if (in.compare(out)) { fprintf(stderr, "error: out != in\n"); exit(1); } @@ -78,6 +91,7 @@ int main(int argc, char **argv) exit(1); } + // Rewind and do it a second time vectorucsout2; it.rewind(); for (int i = 0; ; i++) { @@ -95,7 +109,35 @@ int main(int argc, char **argv) exit(1); } - fclose(fp); + ucsout2.clear(); + int ercnt; + const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine + string ucs, ucs1; + for (vector::iterator it = ucsout1.begin(); + it != ucsout1.end(); it++) { + unsigned int i = *it; + ucs.append((const char *)&i, 4); + } + if (!transcode(ucs, ucs1, + encoding, encoding, &ercnt) || ercnt) { + fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt); + exit(1); + } + if (ucs.compare(ucs1)) { + fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n"); + exit(1); + } + + if (!transcode(ucs, ucs1, + encoding, "UTF-8", &ercnt) || ercnt) { + fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n", + ercnt); + exit(1); + } + if (ucs1.compare(in)) { + fprintf(stderr, "Transcode back to utf-8 compare to in failed\n"); + exit(1); + } exit(0); } diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index b1674467..1935c2e5 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -16,158 +16,230 @@ */ #ifndef _UTF8ITER_H_INCLUDED_ #define _UTF8ITER_H_INCLUDED_ -/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */ /** * A small helper class to iterate over utf8 strings. This is not an - * STL iterator and this is not well designed, just convenient for - some specific uses + * STL iterator and does not much error checking. It is designed purely + * for recoll usage, where the utf-8 string comes out of iconv in most cases + * and is assumed legal. We just try to catch cases where there would be + * a risk of crash. */ class Utf8Iter { - unsigned int cl; // Char length at current position if known - const string &s; // String we're working with - string::size_type pos; // Current position in string - unsigned int m_charpos; // Current character posiiton - - // Get character byte length at specified position - inline int get_cl(string::size_type p) const { - unsigned int z = (unsigned char)s[p]; - if (z <= 127) { - return 1; - } else if (z>=192 && z <= 223) { - return 2; - } else if (z >= 224 && z <= 239) { - return 3; - } else if (z >= 240 && z <= 247) { - return 4; - } else if (z >= 248 && z <= 251) { - return 5; - } else if (z >= 252 && z <= 253) { - return 6; - } - return -1; - } - // Check position and cl against string length - bool poslok(string::size_type p, int l) const { - return p != string::npos && l > 0 && p + l <= s.length(); - } - // Update current char length in object state. Assumes pos is inside string - inline int compute_cl() { - cl = 0; - cl = get_cl(pos); - if (!poslok(pos, cl)) { - pos = s.length(); - cl = 0; - return -1; - } - return 0; - } - // Compute value at given position - inline unsigned int getvalueat(string::size_type p, int l) const { - switch (l) { - case 1: return (unsigned char)s[p]; - case 2: return ((unsigned char)s[p] - 192) * 64 + - (unsigned char)s[p+1] - 128 ; - case 3: return ((unsigned char)s[p]-224)*4096 + - ((unsigned char)s[p+1]-128)*64 + - (unsigned char)s[p+2]-128; - case 4: return ((unsigned char)s[p]-240)*262144 + - ((unsigned char)s[p+1]-128)*4096 + - ((unsigned char)s[p+2]-128)*64 + - (unsigned char)s[p+3]-128; - case 5: return ((unsigned char)s[p]-248)*16777216 + - ((unsigned char)s[p+1]-128)*262144 + - ((unsigned char)s[p+2]-128)*4096 + - ((unsigned char)s[p+3]-128)*64 + - (unsigned char)s[p+4]-128; - case 6: return ((unsigned char)s[p]-252)*1073741824 + - ((unsigned char)s[p+1]-128)*16777216 + - ((unsigned char)s[p+2]-128)*262144 + - ((unsigned char)s[p+3]-128)*4096 + - ((unsigned char)s[p+4]-128)*64 + - (unsigned char)s[p+5]-128; - default: - return (unsigned int)-1; - } - } - public: +public: Utf8Iter(const string &in) - : cl(0), s(in), pos(0), m_charpos(0) - { - // Ensure state is ok if appendchartostring is called at once - compute_cl(); - } + : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false) + { + compute_cl(); + } - void rewind() { - cl=0; pos=0; m_charpos=0; - } - /** operator* returns the ucs4 value as a machine integer*/ - unsigned int operator*() { - if (!cl && compute_cl() < 0) - return (unsigned int)-1; - unsigned int val = getvalueat(pos, cl); - if (val == (unsigned int)-1) { - pos = s.length(); - cl = 0; - } - return val; + void rewind() + { + m_cl = 0; + m_pos = 0; + m_charpos = 0; + m_error = false; + compute_cl(); } + /** "Direct" access. Awfully inefficient as we skip from start or current * position at best. This can only be useful for a lookahead from the * current position */ - unsigned int operator[](unsigned int charpos) const { + unsigned int operator[](unsigned int charpos) const + { string::size_type mypos = 0; - unsigned int mycp = 0;; + unsigned int mycp = 0; if (charpos >= m_charpos) { - mypos = pos; + mypos = m_pos; mycp = m_charpos; } - while (mypos < s.length() && mycp != charpos) { - mypos += get_cl(mypos); + int l; + while (mypos < m_s.length() && mycp != charpos) { + l = get_cl(mypos); + if (l < 0) + return (unsigned int)-1; + mypos += l; ++mycp; } - if (mypos < s.length() && mycp == charpos) { - int l = get_cl(mypos); + if (mypos < m_s.length() && mycp == charpos) { + l = get_cl(mypos); if (poslok(mypos, l)) return getvalueat(mypos, get_cl(mypos)); } return (unsigned int)-1; } - /** Set current position before next utf-8 character */ - string::size_type operator++(int) { - if (!cl && compute_cl() < 0) { - return pos = string::npos; - } - pos += cl; + /** Increment current position to next utf-8 char */ + string::size_type operator++(int) + { + // Note: m_cl may be zero at eof if user's test not right + // this shouldn't crash the program until actual data access +#ifdef UTF8ITER_CHECK + assert(m_cl != 0); +#endif + if (m_cl == 0) + return string::npos; + + m_pos += m_cl; m_charpos++; - cl = 0; - return pos; + compute_cl(); + return m_pos; } - /** This needs to be fast. No error checking. */ - void appendchartostring(string &out) { - out.append(&s[pos], cl); + + /** operator* returns the ucs4 value as a machine integer*/ + unsigned int operator*() + { +#ifdef UTF8ITER_CHECK + assert(m_cl != 0); +#endif + return getvalueat(m_pos, m_cl); } + + /** Append current utf-8 possibly multi-byte character to string param. + This needs to be fast. No error checking. */ + unsigned int appendchartostring(string &out) { +#ifdef UTF8ITER_CHECK + assert(m_cl != 0); +#endif + out.append(&m_s[m_pos], m_cl); + return m_cl; + } + + /** Return current character as string */ operator string() { - if (!cl && compute_cl() < 0) { - return std::string(""); - } - return s.substr(pos, cl); +#ifdef UTF8ITER_CHECK + assert(m_cl != 0); +#endif + return m_s.substr(m_pos, m_cl); } + bool eof() { - // Note: we always ensure that pos == s.length() when setting bad to - // true - return pos == s.length(); + return m_pos == m_s.length(); } + bool error() { - return compute_cl() < 0; + return m_error; } + string::size_type getBpos() const { - return pos; + return m_pos; } + string::size_type getCpos() const { return m_charpos; } + +private: + // String we're working with + const string& m_s; + // Character length at current position. A value of zero indicates + // unknown or error. + unsigned int m_cl; + // Current byte offset in string. + string::size_type m_pos; + // Current character position + unsigned int m_charpos; + mutable bool m_error; + + // Check position and cl against string length + bool poslok(string::size_type p, int l) const { +#ifdef UTF8ITER_CHECK + assert(p != string::npos && l > 0 && p + l <= m_s.length()); +#endif + return p != string::npos && l > 0 && p + l <= m_s.length(); + } + + // Update current char length in object state, minimum checking for + // errors + inline int compute_cl() + { + m_cl = 0; + if (m_pos == m_s.length()) + return -1; + m_cl = get_cl(m_pos); + if (!poslok(m_pos, m_cl)) { + m_pos = m_s.length(); + m_cl = 0; + m_error = true; + return -1; + } + return 0; + } + + // Get character byte length at specified position + inline int get_cl(string::size_type p) const + { + unsigned int z = (unsigned char)m_s[p]; + if (z <= 127) { + return 1; + } else if ((z & 224) == 192) { + return 2; + } else if ((z & 240) == 224) { + return 3; + } else if ((z & 248) == 240) { + return 4; + } +#ifdef UTF8ITER_CHECK + assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 || + (z & 248) == 240); +#endif + return -1; + } + + // Compute value at given position. No error checking. + inline unsigned int getvalueat(string::size_type p, int l) const + { + switch (l) { + case 1: +#ifdef UTF8ITER_CHECK + assert((unsigned char)m_s[p] < 128); +#endif + return (unsigned char)m_s[p]; + case 2: +#ifdef UTF8ITER_CHECK + assert( + ((unsigned char)m_s[p] & 224) == 192 + && ((unsigned char)m_s[p+1] & 192) == 128 + ); +#endif + return ((unsigned char)m_s[p] - 192) * 64 + + (unsigned char)m_s[p+1] - 128 ; + case 3: +#ifdef UTF8ITER_CHECK + assert( + (((unsigned char)m_s[p]) & 240) == 224 + && (((unsigned char)m_s[p+1]) & 192) == 128 + && (((unsigned char)m_s[p+2]) & 192) == 128 + ); +#endif + + return ((unsigned char)m_s[p] - 224) * 4096 + + ((unsigned char)m_s[p+1] - 128) * 64 + + (unsigned char)m_s[p+2] - 128; + case 4: +#ifdef UTF8ITER_CHECK + assert( + (((unsigned char)m_s[p]) & 248) == 240 + && (((unsigned char)m_s[p+1]) & 192) == 128 + && (((unsigned char)m_s[p+2]) & 192) == 128 + && (((unsigned char)m_s[p+3]) & 192) == 128 + ); +#endif + + return ((unsigned char)m_s[p]-240)*262144 + + ((unsigned char)m_s[p+1]-128)*4096 + + ((unsigned char)m_s[p+2]-128)*64 + + (unsigned char)m_s[p+3]-128; + + default: +#ifdef UTF8ITER_CHECK + assert(l <= 4); +#endif + m_error = true; + return (unsigned int)-1; + } + } + };