cleaned and speeded up utf8iter

2006-11-20 11:16:54 +00:00 · 2006-11-20 11:16:54 +00:00 · a573fbd1a9
commit a573fbd1a9
parent b3ab39522b
3 changed files with 237 additions and 123 deletions
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@ -65,9 +65,9 @@ trwipedir.o : wipedir.cpp
 	$(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
 	       wipedir.cpp

-UTF8ITER_OBJS= trutf8iter.o  $(BIGLIB)
+UTF8ITER_OBJS= trutf8iter.o 
 utf8iter : $(UTF8ITER_OBJS)
-	$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
+	$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(BIGLIB) $(LIBICONV) 
 trutf8iter.o : utf8iter.cpp utf8iter.h
 	$(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
 	       utf8iter.cpp
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.5 2006-11-20 11:16:54 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -22,16 +22,19 @@ static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp
 #include <iostream>
 #include <list>
 #include <vector>
+
+
 #include "debuglog.h"
+#include "transcode.h"
+
 #ifndef NO_NAMESPACES
 using namespace std;
 #endif /* NO_NAMESPACES */

+#define UTF8ITER_CHECK
 #include "utf8iter.h"
 #include "readfile.h"

-
-
 int main(int argc, char **argv)
 {
    if (argc != 3) {
@ -54,6 +57,7 @@ int main(int argc, char **argv)
 	fprintf(stderr, "cant create %s\n", outfile);
 	exit(1);
    }
+
    int nchars = 0;
    for (;!it.eof(); it++) {
 	unsigned int value = *it;
@ -61,15 +65,24 @@ int main(int argc, char **argv)
 	    fprintf(stderr, "Conversion error occurred\n");
 	    exit(1);
 	}
+	// UTF-32LE or BE array
 	ucsout1.push_back(value);
+	// UTF-32LE or BE file
 	fwrite(&value, 4, 1, fp);
+
+	// Reconstructed utf8 strings (2 methods)
 	if (!it.appendchartostring(out))
 	    break;
+	// conversion to string
 	out1 += it;
+	
+	// fprintf(stderr, "%s", string(it).c_str());
 	nchars++;
    }
-    fprintf(stderr, "nchars1 %d\n", nchars);
-    if (in != out) {
+    fclose(fp);
+
+    fprintf(stderr, "nchars %d\n", nchars);
+    if (in.compare(out)) {
 	fprintf(stderr, "error: out != in\n");
 	exit(1);
    }
@ -78,6 +91,7 @@ int main(int argc, char **argv)
 	exit(1);
    }

+    // Rewind and do it a second time
    vector<unsigned int>ucsout2;
    it.rewind();
    for (int i = 0; ; i++) {
@ -95,7 +109,35 @@ int main(int argc, char **argv)
 	exit(1);
    }

-    fclose(fp);
+    ucsout2.clear();
+    int ercnt;
+    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
+    string ucs, ucs1;
+    for (vector<unsigned int>::iterator it = ucsout1.begin(); 
+	 it != ucsout1.end(); it++) {
+	unsigned int i = *it;
+	ucs.append((const char *)&i, 4);
+    }
+    if (!transcode(ucs, ucs1, 
+		   encoding, encoding, &ercnt) || ercnt) {
+	fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
+	exit(1);
+    }
+    if (ucs.compare(ucs1)) {
+	fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
+	exit(1);
+    }
+
+    if (!transcode(ucs, ucs1, 
+		   encoding, "UTF-8", &ercnt) || ercnt) {
+	fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
+		ercnt);
+	exit(1);
+    }
+    if (ucs1.compare(in)) {
+	fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
+	exit(1);
+    }
    exit(0);
 }

--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -16,158 +16,230 @@
 */
 #ifndef _UTF8ITER_H_INCLUDED_
 #define _UTF8ITER_H_INCLUDED_
-/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $  (C) 2004 J.F.Dockes */

 /** 
 * A small helper class to iterate over utf8 strings. This is not an
- * STL iterator and this is not well designed, just convenient for
-   some specific uses
+ * STL iterator and does not much error checking. It is designed purely
+ * for recoll usage, where the utf-8 string comes out of iconv in most cases
+ * and is assumed legal. We just try to catch cases where there would be 
+ * a risk of crash.
 */
 class Utf8Iter {
-    unsigned int cl; // Char length at current position if known
-    const string &s; // String we're working with
-    string::size_type pos; // Current position in string
-    unsigned int m_charpos; // Current character posiiton
-
-    // Get character byte length at specified position
-    inline int get_cl(string::size_type p) const {
-	unsigned int z = (unsigned char)s[p];
-	if (z <= 127) {
-	    return 1;
-	} else if (z>=192 && z <= 223) {
-	    return 2;
-	} else if (z >= 224 && z <= 239) {
-	    return 3;
-	} else if (z >= 240 && z <= 247) {
-	    return 4;
-	} else if (z >= 248 && z <= 251) {
-	    return 5;
-	} else if (z >= 252 && z <= 253) {
-	    return 6;
-	} 
-	return -1;
-    }
-    // Check position and cl against string length
-    bool poslok(string::size_type p, int l) const {
-	return p != string::npos && l > 0 && p + l <= s.length();
-    }
-    // Update current char length in object state. Assumes pos is inside string
-    inline int compute_cl() {
-	cl = 0;
-	cl = get_cl(pos);
-	if (!poslok(pos, cl)) {
-	    pos = s.length();
-	    cl = 0;
-	    return -1;
-	}
-	return 0;
-    }
-    // Compute value at given position
-    inline unsigned int getvalueat(string::size_type p, int l) const {
-	switch (l) {
-	case 1: return (unsigned char)s[p];
-	case 2: return ((unsigned char)s[p] - 192) * 64 + 
-		(unsigned char)s[p+1] - 128 ;
-	case 3: return ((unsigned char)s[p]-224)*4096 + 
-		((unsigned char)s[p+1]-128)*64 + 
-		(unsigned char)s[p+2]-128;
-	case 4: return ((unsigned char)s[p]-240)*262144 + 
-		((unsigned char)s[p+1]-128)*4096 + 
-		((unsigned char)s[p+2]-128)*64 + 
-		(unsigned char)s[p+3]-128;
-	case 5: return ((unsigned char)s[p]-248)*16777216 + 
-		((unsigned char)s[p+1]-128)*262144 + 
-		((unsigned char)s[p+2]-128)*4096 + 
-		((unsigned char)s[p+3]-128)*64 + 
-		(unsigned char)s[p+4]-128;
-	case 6: return  ((unsigned char)s[p]-252)*1073741824 + 
-		((unsigned char)s[p+1]-128)*16777216 + 
-		((unsigned char)s[p+2]-128)*262144 + 
-		((unsigned char)s[p+3]-128)*4096 + 
-		((unsigned char)s[p+4]-128)*64 + 
-		(unsigned char)s[p+5]-128;
-	default:
-	    return (unsigned int)-1;
-	}
-    }
- public:
+public:
    Utf8Iter(const string &in) 
-	: cl(0), s(in), pos(0), m_charpos(0) 
-	{
-	    // Ensure state is ok if appendchartostring is called at once
-	    compute_cl();
-	}
+	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
+    {
+	compute_cl();
+    }

-    void rewind() {
-	cl=0; pos=0; m_charpos=0;
-    }
-    /** operator* returns the ucs4 value as a machine integer*/
-    unsigned int operator*() {
-	if (!cl && compute_cl() < 0)
-	    return (unsigned int)-1;
-	unsigned int val = getvalueat(pos, cl);
-	if (val == (unsigned int)-1) {
-	    pos = s.length();
-	    cl = 0;
-	}
-	return val;
+    void rewind() 
+    {
+	m_cl = 0; 
+	m_pos = 0; 
+	m_charpos = 0; 
+	m_error = false;
+	compute_cl();
    }
+
    /** "Direct" access. Awfully inefficient as we skip from start or current
     * position at best. This can only be useful for a lookahead from the
     * current position */
-    unsigned int operator[](unsigned int charpos) const {
+    unsigned int operator[](unsigned int charpos) const 
+    {
 	string::size_type mypos = 0;
-	unsigned int mycp = 0;;
+	unsigned int mycp = 0;
 	if (charpos >= m_charpos) {
-	    mypos = pos;
+	    mypos = m_pos;
 	    mycp = m_charpos;
 	}
-	while (mypos < s.length() && mycp != charpos) {
-	    mypos += get_cl(mypos);
+	int l;
+	while (mypos < m_s.length() && mycp != charpos) {
+	    l = get_cl(mypos);
+	    if (l < 0)
+		return (unsigned int)-1;
+	    mypos += l;
 	    ++mycp;
 	}
-	if (mypos < s.length() && mycp == charpos) {
-	    int l = get_cl(mypos);
+	if (mypos < m_s.length() && mycp == charpos) {
+	    l = get_cl(mypos);
 	    if (poslok(mypos, l))
 		return getvalueat(mypos, get_cl(mypos));
 	}
 	return (unsigned int)-1;
    }

-    /** Set current position before next utf-8 character */
-    string::size_type operator++(int) {
-	if (!cl && compute_cl() < 0) {
-	    return pos = string::npos;
-	}
-	pos += cl;
+    /** Increment current position to next utf-8 char */
+    string::size_type operator++(int) 
+    {
+	// Note: m_cl may be zero at eof if user's test not right
+	// this shouldn't crash the program until actual data access
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	if (m_cl == 0) 
+	    return string::npos;
+
+	m_pos += m_cl;
 	m_charpos++;
-	cl = 0;
-	return pos;
+	compute_cl();
+	return m_pos;
    }
-    /** This needs to be fast. No error checking. */
-    void appendchartostring(string &out) {
-	out.append(&s[pos], cl);
+
+    /** operator* returns the ucs4 value as a machine integer*/
+    unsigned int operator*() 
+    {
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	return getvalueat(m_pos, m_cl);
    }
+
+    /** Append current utf-8 possibly multi-byte character to string param.
+	This needs to be fast. No error checking. */
+    unsigned int appendchartostring(string &out) {
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	out.append(&m_s[m_pos], m_cl);
+	return m_cl;
+    }
+
+    /** Return current character as string */
    operator string() {
-	if (!cl && compute_cl() < 0) {
-	    return std::string("");
-	}
-	return s.substr(pos, cl);
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	return m_s.substr(m_pos, m_cl);
    }
+
    bool eof() {
-	// Note: we always ensure that pos == s.length() when setting bad to 
-	// true
-	return pos == s.length();
+	return m_pos == m_s.length();
    }
+
    bool error() {
-	return compute_cl() < 0;
+	return m_error;
    }
+
    string::size_type getBpos() const {
-	return pos;
+	return m_pos;
    }
+
    string::size_type getCpos() const {
 	return m_charpos;
    }
+
+private:
+    // String we're working with
+    const string&     m_s; 
+    // Character length at current position. A value of zero indicates
+    // unknown or error.
+    unsigned int      m_cl; 
+    // Current byte offset in string.
+    string::size_type m_pos; 
+    // Current character position
+    unsigned int      m_charpos; 
+    mutable bool      m_error;
+
+    // Check position and cl against string length
+    bool poslok(string::size_type p, int l) const {
+#ifdef UTF8ITER_CHECK
+	assert(p != string::npos && l > 0 && p + l <= m_s.length());
+#endif
+	return p != string::npos && l > 0 && p + l <= m_s.length();
+    }
+
+    // Update current char length in object state, minimum checking for 
+    // errors
+    inline int compute_cl() 
+    {
+	m_cl = 0;
+	if (m_pos == m_s.length())
+	    return -1;
+	m_cl = get_cl(m_pos);
+	if (!poslok(m_pos, m_cl)) {
+	    m_pos = m_s.length();
+	    m_cl = 0;
+	    m_error = true;
+	    return -1;
+	}
+	return 0;
+    }
+
+    // Get character byte length at specified position
+    inline int get_cl(string::size_type p) const 
+    {
+	unsigned int z = (unsigned char)m_s[p];
+	if (z <= 127) {
+	    return 1;
+	} else if ((z & 224) == 192) {
+	    return 2;
+	} else if ((z & 240) == 224) {
+	    return 3;
+	} else if ((z & 248) == 240) {
+	    return 4;
+	}
+#ifdef UTF8ITER_CHECK
+	assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
+	       (z & 248) == 240);
+#endif
+	return -1;
+    }
+
+    // Compute value at given position. No error checking.
+    inline unsigned int getvalueat(string::size_type p, int l) const
+    {
+	switch (l) {
+	case 1: 
+#ifdef UTF8ITER_CHECK
+	    assert((unsigned char)m_s[p] < 128);
+#endif
+	    return (unsigned char)m_s[p];
+	case 2: 
+#ifdef UTF8ITER_CHECK
+	    assert(
+		   ((unsigned char)m_s[p] & 224) == 192
+		   && ((unsigned char)m_s[p+1] & 192) ==  128
+		   );
+#endif
+	    return ((unsigned char)m_s[p] - 192) * 64 + 
+		(unsigned char)m_s[p+1] - 128 ;
+	case 3: 
+#ifdef UTF8ITER_CHECK
+	    assert(
+		   (((unsigned char)m_s[p]) & 240) == 224
+		   && (((unsigned char)m_s[p+1]) & 192) ==  128
+		   && (((unsigned char)m_s[p+2]) & 192) ==  128
+		   );
+#endif
+
+	    return ((unsigned char)m_s[p] - 224) * 4096 + 
+		((unsigned char)m_s[p+1] - 128) * 64 + 
+		(unsigned char)m_s[p+2] - 128;
+	case 4: 
+#ifdef UTF8ITER_CHECK
+	    assert(
+		   (((unsigned char)m_s[p]) & 248) == 240
+		   && (((unsigned char)m_s[p+1]) & 192) ==  128
+		   && (((unsigned char)m_s[p+2]) & 192) ==  128
+		   && (((unsigned char)m_s[p+3]) & 192) ==  128
+		   );
+#endif
+
+	    return ((unsigned char)m_s[p]-240)*262144 + 
+		((unsigned char)m_s[p+1]-128)*4096 + 
+		((unsigned char)m_s[p+2]-128)*64 + 
+		(unsigned char)m_s[p+3]-128;
+
+	default:
+#ifdef UTF8ITER_CHECK
+	    assert(l <= 4);
+#endif
+	    m_error = true;
+	    return (unsigned int)-1;
+	}
+    }
+
 };