cleaned and speeded up utf8iter

2006-11-20 11:16:54 +00:00 · 2006-11-20 11:16:54 +00:00 · a573fbd1a9
commit a573fbd1a9
parent b3ab39522b
3 changed files with 237 additions and 123 deletions
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@ -65,9 +65,9 @@ trwipedir.o : wipedir.cpp
 	$(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
 	       wipedir.cpp
-UTF8ITER_OBJS= trutf8iter.o  $(BIGLIB)
+UTF8ITER_OBJS= trutf8iter.o 
 utf8iter : $(UTF8ITER_OBJS)
-	$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
+	$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(BIGLIB) $(LIBICONV) 
 trutf8iter.o : utf8iter.cpp utf8iter.h
 	$(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
 	       utf8iter.cpp
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.5 2006-11-20 11:16:54 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -22,16 +22,19 @@ static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp
 #include <iostream>
 #include <list>
 #include <vector>
 #include "debuglog.h"
 #include "transcode.h"
 #ifndef NO_NAMESPACES
 using namespace std;
 #endif /* NO_NAMESPACES */
 #define UTF8ITER_CHECK
 #include "utf8iter.h"
 #include "readfile.h"
 int main(int argc, char **argv)
 {
    if (argc != 3) {
@ -54,6 +57,7 @@ int main(int argc, char **argv)
 	fprintf(stderr, "cant create %s\n", outfile);
 	exit(1);
    }
    int nchars = 0;
    for (;!it.eof(); it++) {
 	unsigned int value = *it;
@ -61,15 +65,24 @@ int main(int argc, char **argv)
 	    fprintf(stderr, "Conversion error occurred\n");
 	    exit(1);
 	}
 	// UTF-32LE or BE array
 	ucsout1.push_back(value);
 	// UTF-32LE or BE file
 	fwrite(&value, 4, 1, fp);
 	// Reconstructed utf8 strings (2 methods)
 	if (!it.appendchartostring(out))
 	    break;
 	// conversion to string
 	out1 += it;
 	// fprintf(stderr, "%s", string(it).c_str());
 	nchars++;
    }
-    fprintf(stderr, "nchars1 %d\n", nchars);
+    fclose(fp);
-    if (in != out) {
+
    fprintf(stderr, "nchars %d\n", nchars);
    if (in.compare(out)) {
 	fprintf(stderr, "error: out != in\n");
 	exit(1);
    }
@ -78,6 +91,7 @@ int main(int argc, char **argv)
 	exit(1);
    }
    // Rewind and do it a second time
    vector<unsigned int>ucsout2;
    it.rewind();
    for (int i = 0; ; i++) {
@ -95,7 +109,35 @@ int main(int argc, char **argv)
 	exit(1);
    }
-    fclose(fp);
+    ucsout2.clear();
    int ercnt;
    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
    string ucs, ucs1;
    for (vector<unsigned int>::iterator it = ucsout1.begin(); 
 	 it != ucsout1.end(); it++) {
 	unsigned int i = *it;
 	ucs.append((const char *)&i, 4);
    }
    if (!transcode(ucs, ucs1, 
 		   encoding, encoding, &ercnt) || ercnt) {
 	fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
 	exit(1);
    }
    if (ucs.compare(ucs1)) {
 	fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
 	exit(1);
    }
    if (!transcode(ucs, ucs1, 
 		   encoding, "UTF-8", &ercnt) || ercnt) {
 	fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
 		ercnt);
 	exit(1);
    }
    if (ucs1.compare(in)) {
 	fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
 	exit(1);
    }
    exit(0);
 }
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -16,158 +16,230 @@
 */
 #ifndef _UTF8ITER_H_INCLUDED_
 #define _UTF8ITER_H_INCLUDED_
-/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $  (C) 2004 J.F.Dockes */
 /** 
 * A small helper class to iterate over utf8 strings. This is not an
- * STL iterator and this is not well designed, just convenient for
+ * STL iterator and does not do much error checking. It is designed purely
-   some specific uses
+ * for recoll usage, where the utf-8 string comes out of iconv in most cases
 * and is assumed legal. We just try to catch cases where there would be 
 * a risk of crash.
 */
 class Utf8Iter {
-    unsigned int cl; // Char length at current position if known
+public:
    const string &s; // String we're working with
    string::size_type pos; // Current position in string
    unsigned int m_charpos; // Current character position
    // Look at the byte stored at offset p and return the length (1 to 6)
    // of the sequence it introduces, or -1 if that byte value cannot
    // start a sequence.
    inline int get_cl(string::size_type p) const {
 	const unsigned int lead = (unsigned char)s[p];
 	if (lead < 128)
 	    return 1;
 	if (lead < 192)
 	    return -1;	// 128..191 is not accepted as a first byte
 	if (lead < 224)
 	    return 2;
 	if (lead < 240)
 	    return 3;
 	if (lead < 248)
 	    return 4;
 	if (lead < 252)
 	    return 5;
 	if (lead < 254)
 	    return 6;
 	// 254 and 255
 	return -1;
    }
    // Tell whether the l bytes starting at offset p all lie inside the
    // string. A npos offset or a non-positive length is rejected.
    bool poslok(string::size_type p, int l) const {
 	if (p == string::npos)
 	    return false;
 	if (l <= 0)
 	    return false;
 	return p + l <= s.length();
    }
    // Refresh cl with the byte length of the character at pos.
    // Returns 0 on success. On failure returns -1 after moving pos to the
    // end of the string and resetting cl to 0.
    inline int compute_cl() {
 	cl = get_cl(pos);
 	if (poslok(pos, cl))
 	    return 0;
 	// get_cl() rejected the byte, or the sequence would run past the
 	// end of the string
 	pos = s.length();
 	cl = 0;
 	return -1;
    }
    // Decode the l-byte sequence starting at offset p into its integer
    // value. Returns (unsigned int)-1 when l is not in 1..6. The bytes
    // themselves are not validated.
    inline unsigned int getvalueat(string::size_type p, int l) const {
 	// Amount subtracted from the first byte, indexed by sequence length
 	static const int leadbase[] = {0, 0, 192, 224, 240, 248, 252};
 	if (l < 1 || l > 6)
 	    return (unsigned int)-1;
 	int value = (unsigned char)s[p] - leadbase[l];
 	// Each following byte contributes (byte - 128), weighted by a
 	// power of 64 according to its rank
 	for (int i = 1; i < l; i++)
 	    value = value * 64 + ((unsigned char)s[p + i] - 128);
 	return value;
    }
 public:
    Utf8Iter(const string &in) 
-	: cl(0), s(in), pos(0), m_charpos(0) 
+	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
-	{
+    {
-	    // Ensure state is ok if appendchartostring is called at once
+	compute_cl();
-	    compute_cl();
+    }
 	}
-    void rewind() {
+    void rewind() 
-	cl=0; pos=0; m_charpos=0;
+    {
-    }
+	m_cl = 0; 
-    /** operator* returns the ucs4 value as a machine integer*/
+	m_pos = 0; 
-    unsigned int operator*() {
+	m_charpos = 0; 
-	if (!cl && compute_cl() < 0)
+	m_error = false;
-	    return (unsigned int)-1;
+	compute_cl();
 	unsigned int val = getvalueat(pos, cl);
 	if (val == (unsigned int)-1) {
 	    pos = s.length();
 	    cl = 0;
 	}
 	return val;
    }
    /** "Direct" access. Awfully inefficient as we skip from start or current
     * position at best. This can only be useful for a lookahead from the
     * current position */
-    unsigned int operator[](unsigned int charpos) const {
+    unsigned int operator[](unsigned int charpos) const 
    {
 	string::size_type mypos = 0;
-	unsigned int mycp = 0;;
+	unsigned int mycp = 0;
 	if (charpos >= m_charpos) {
-	    mypos = pos;
+	    mypos = m_pos;
 	    mycp = m_charpos;
 	}
-	while (mypos < s.length() && mycp != charpos) {
+	int l;
-	    mypos += get_cl(mypos);
+	while (mypos < m_s.length() && mycp != charpos) {
 	    l = get_cl(mypos);
 	    if (l < 0)
 		return (unsigned int)-1;
 	    mypos += l;
 	    ++mycp;
 	}
-	if (mypos < s.length() && mycp == charpos) {
+	if (mypos < m_s.length() && mycp == charpos) {
-	    int l = get_cl(mypos);
+	    l = get_cl(mypos);
 	    if (poslok(mypos, l))
 		return getvalueat(mypos, get_cl(mypos));
 	}
 	return (unsigned int)-1;
    }
-    /** Set current position before next utf-8 character */
+    /** Increment current position to next utf-8 char */
-    string::size_type operator++(int) {
+    string::size_type operator++(int) 
-	if (!cl && compute_cl() < 0) {
+    {
-	    return pos = string::npos;
+	// Note: m_cl may be zero at eof if user's test not right
-	}
+	// this shouldn't crash the program until actual data access
-	pos += cl;
+#ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
 	if (m_cl == 0) 
 	    return string::npos;
 	m_pos += m_cl;
 	m_charpos++;
-	cl = 0;
+	compute_cl();
-	return pos;
+	return m_pos;
    }
-    /** This needs to be fast. No error checking. */
+
-    void appendchartostring(string &out) {
+    /** operator* returns the ucs4 value as a machine integer*/
-	out.append(&s[pos], cl);
+    unsigned int operator*() 
    {
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
 	return getvalueat(m_pos, m_cl);
    }
    /** Copy the m_cl bytes of the current character to the end of out and
 	return that byte count. Kept minimal for speed: the only check is
 	the assert compiled in under UTF8ITER_CHECK. */
    unsigned int appendchartostring(string &out) {
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
 	const char *first = m_s.data() + m_pos;
 	out.append(first, m_cl);
 	return m_cl;
    }
    /** Return current character as string */
    operator string() {
-	if (!cl && compute_cl() < 0) {
+#ifdef UTF8ITER_CHECK
-	    return std::string("");
+	assert(m_cl != 0);
-	}
+#endif
-	return s.substr(pos, cl);
+	return m_s.substr(m_pos, m_cl);
    }
    bool eof() {
-	// Note: we always ensure that pos == s.length() when setting bad to 
+	return m_pos == m_s.length();
 	// true
 	return pos == s.length();
    }
    bool error() {
-	return compute_cl() < 0;
+	return m_error;
    }
    string::size_type getBpos() const {
-	return pos;
+	return m_pos;
    }
    /** Return the current position counted in characters (m_charpos) */
    string::size_type getCpos() const {
 	return m_charpos;
    }
 private:
    // String we're working with
    const string&     m_s; 
    // Character length at current position. A value of zero indicates
    // unknown or error.
    unsigned int      m_cl; 
    // Current byte offset in string.
    string::size_type m_pos; 
    // Current character position
    unsigned int      m_charpos; 
    mutable bool      m_error;
    // Tell whether the l bytes starting at offset p all lie inside m_s.
    // When UTF8ITER_CHECK is defined the same condition is asserted first.
    bool poslok(string::size_type p, int l) const {
 #ifdef UTF8ITER_CHECK
 	assert(p != string::npos && l > 0 && p + l <= m_s.length());
 #endif
 	if (p == string::npos)
 	    return false;
 	if (l <= 0)
 	    return false;
 	return p + l <= m_s.length();
    }
    // Refresh m_cl for the character at m_pos. Returns 0 when m_cl was
    // set to a usable length, -1 otherwise (m_cl is then 0).
    inline int compute_cl() 
    {
 	if (m_pos == m_s.length()) {
 	    // End of string: nothing to measure, m_error is left alone
 	    m_cl = 0;
 	    return -1;
 	}
 	m_cl = get_cl(m_pos);
 	if (poslok(m_pos, m_cl))
 	    return 0;
 	// Rejected first byte, or sequence running past the end of the
 	// string: record the error and park the iterator at the end
 	m_error = true;
 	m_cl = 0;
 	m_pos = m_s.length();
 	return -1;
    }
    // Classify the byte at offset p by its high bits and return the
    // length (1 to 4) of the sequence it introduces, or -1 if the byte
    // matches none of the accepted patterns.
    inline int get_cl(string::size_type p) const 
    {
 	unsigned int z = (unsigned char)m_s[p];
 	if (z <= 127)
 	    return 1;
 	if ((z & 224) == 192)
 	    return 2;
 	if ((z & 240) == 224)
 	    return 3;
 	if ((z & 248) == 240)
 	    return 4;
 	// Only reached for a byte that matched nothing above, so in
 	// checking builds the assert below stops the program here
 #ifdef UTF8ITER_CHECK
 	assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
 	       (z & 248) == 240);
 #endif
 	return -1;
    }
    // Compute the integer value of the l-byte sequence starting at byte
    // offset p. No error checking in normal builds: the byte patterns are
    // only verified by the asserts compiled in under UTF8ITER_CHECK.
    // For any l other than 1..4, sets m_error and returns (unsigned int)-1.
    inline unsigned int getvalueat(string::size_type p, int l) const
    {
 	switch (l) {
 	case 1: 
 #ifdef UTF8ITER_CHECK
 	    assert((unsigned char)m_s[p] < 128);
 #endif
 	    // Single byte: the value is the byte itself
 	    return (unsigned char)m_s[p];
 	case 2: 
 #ifdef UTF8ITER_CHECK
 	    assert(
 		   ((unsigned char)m_s[p] & 224) == 192
 		   && ((unsigned char)m_s[p+1] & 192) ==  128
 		   );
 #endif
 	    // First byte minus 192, times 64, plus second byte minus 128
 	    return ((unsigned char)m_s[p] - 192) * 64 + 
 		(unsigned char)m_s[p+1] - 128 ;
 	case 3: 
 #ifdef UTF8ITER_CHECK
 	    assert(
 		   (((unsigned char)m_s[p]) & 240) == 224
 		   && (((unsigned char)m_s[p+1]) & 192) ==  128
 		   && (((unsigned char)m_s[p+2]) & 192) ==  128
 		   );
 #endif
 	    // Same scheme with weights 4096, 64 and 1
 	    return ((unsigned char)m_s[p] - 224) * 4096 + 
 		((unsigned char)m_s[p+1] - 128) * 64 + 
 		(unsigned char)m_s[p+2] - 128;
 	case 4: 
 #ifdef UTF8ITER_CHECK
 	    assert(
 		   (((unsigned char)m_s[p]) & 248) == 240
 		   && (((unsigned char)m_s[p+1]) & 192) ==  128
 		   && (((unsigned char)m_s[p+2]) & 192) ==  128
 		   && (((unsigned char)m_s[p+3]) & 192) ==  128
 		   );
 #endif
 	    // Same scheme with weights 262144, 4096, 64 and 1
 	    return ((unsigned char)m_s[p]-240)*262144 + 
 		((unsigned char)m_s[p+1]-128)*4096 + 
 		((unsigned char)m_s[p+2]-128)*64 + 
 		(unsigned char)m_s[p+3]-128;
 	default:
 #ifdef UTF8ITER_CHECK
 	    assert(l <= 4);
 #endif
 	    // m_error is declared mutable, which is what allows setting it
 	    // from this const method
 	    m_error = true;
 	    return (unsigned int)-1;
 	}
    }
 };