cleaned and speeded up utf8iter

This commit is contained in:
dockes 2006-11-20 11:16:54 +00:00
parent b3ab39522b
commit a573fbd1a9
3 changed files with 237 additions and 123 deletions

View File

@ -65,9 +65,9 @@ trwipedir.o : wipedir.cpp
$(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \ $(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
wipedir.cpp wipedir.cpp
UTF8ITER_OBJS= trutf8iter.o $(BIGLIB) UTF8ITER_OBJS= trutf8iter.o
utf8iter : $(UTF8ITER_OBJS) utf8iter : $(UTF8ITER_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV) $(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(BIGLIB) $(LIBICONV)
trutf8iter.o : utf8iter.cpp utf8iter.h trutf8iter.o : utf8iter.cpp utf8iter.h
$(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \ $(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
utf8iter.cpp utf8iter.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.5 2006-11-20 11:16:54 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -22,16 +22,19 @@ static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp
#include <iostream> #include <iostream>
#include <list> #include <list>
#include <vector> #include <vector>
#include "debuglog.h" #include "debuglog.h"
#include "transcode.h"
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
using namespace std; using namespace std;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
#define UTF8ITER_CHECK
#include "utf8iter.h" #include "utf8iter.h"
#include "readfile.h" #include "readfile.h"
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
if (argc != 3) { if (argc != 3) {
@ -54,6 +57,7 @@ int main(int argc, char **argv)
fprintf(stderr, "cant create %s\n", outfile); fprintf(stderr, "cant create %s\n", outfile);
exit(1); exit(1);
} }
int nchars = 0; int nchars = 0;
for (;!it.eof(); it++) { for (;!it.eof(); it++) {
unsigned int value = *it; unsigned int value = *it;
@ -61,15 +65,24 @@ int main(int argc, char **argv)
fprintf(stderr, "Conversion error occurred\n"); fprintf(stderr, "Conversion error occurred\n");
exit(1); exit(1);
} }
// UTF-32LE or BE array
ucsout1.push_back(value); ucsout1.push_back(value);
// UTF-32LE or BE file
fwrite(&value, 4, 1, fp); fwrite(&value, 4, 1, fp);
// Reconstructed utf8 strings (2 methods)
if (!it.appendchartostring(out)) if (!it.appendchartostring(out))
break; break;
// conversion to string
out1 += it; out1 += it;
// fprintf(stderr, "%s", string(it).c_str());
nchars++; nchars++;
} }
fprintf(stderr, "nchars1 %d\n", nchars); fclose(fp);
if (in != out) {
fprintf(stderr, "nchars %d\n", nchars);
if (in.compare(out)) {
fprintf(stderr, "error: out != in\n"); fprintf(stderr, "error: out != in\n");
exit(1); exit(1);
} }
@ -78,6 +91,7 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
// Rewind and do it a second time
vector<unsigned int>ucsout2; vector<unsigned int>ucsout2;
it.rewind(); it.rewind();
for (int i = 0; ; i++) { for (int i = 0; ; i++) {
@ -95,7 +109,35 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
fclose(fp); ucsout2.clear();
// Cross-check the code points collected in ucsout1 against transcode().
// ercnt receives the error count reported by transcode().
int ercnt;
// Encoding name handed to transcode() for the raw 4-byte values built below.
const char *encoding = "UTF-32LE"; // note: use UTF-32BE on a big-endian machine
string ucs, ucs1;
// Serialize every collected value into ucs as 4 raw bytes in host memory
// order. Note that this loop-local 'it' shadows the Utf8Iter 'it' used
// earlier in main.
for (vector<unsigned int>::iterator it = ucsout1.begin();
it != ucsout1.end(); it++) {
unsigned int i = *it;
ucs.append((const char *)&i, 4);
}
// Same-encoding pass: the output is expected to be identical to the input.
// NOTE(review): presumably transcode(in, out, from, to, &errcount) returns
// false on failure -- confirm against transcode.h.
if (!transcode(ucs, ucs1,
encoding, encoding, &ercnt) || ercnt) {
fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
exit(1);
}
// NOTE(review): the message below names ucsout1/ucsout2, but the values
// actually compared are ucs and ucs1.
if (ucs.compare(ucs1)) {
fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
exit(1);
}
// Convert the 4-byte values back to UTF-8; the result must match the
// original input string 'in'.
if (!transcode(ucs, ucs1,
encoding, "UTF-8", &ercnt) || ercnt) {
fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
ercnt);
exit(1);
}
if (ucs1.compare(in)) {
fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
exit(1);
}
exit(0); exit(0);
} }

View File

@ -16,158 +16,230 @@
*/ */
#ifndef _UTF8ITER_H_INCLUDED_ #ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_ #define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */
/** /**
* A small helper class to iterate over utf8 strings. This is not an * A small helper class to iterate over utf8 strings. This is not an
* STL iterator and this is not well designed, just convenient for * STL iterator and does not much error checking. It is designed purely
some specific uses * for recoll usage, where the utf-8 string comes out of iconv in most cases
* and is assumed legal. We just try to catch cases where there would be
* a risk of crash.
*/ */
class Utf8Iter { class Utf8Iter {
unsigned int cl; // Char length at current position if known public:
const string &s; // String being iterated over (held by reference, not copied)
string::size_type pos; // Current byte offset in the string
unsigned int m_charpos; // Current character position (count of characters stepped over)
// Get character byte length at specified position
inline int get_cl(string::size_type p) const {
    // Classify the byte at offset p by range, lowest range first, so each
    // test only needs an upper bound.
    const unsigned int lead = (unsigned char)s[p];
    if (lead < 128)
        return 1;
    if (lead < 192)
        return -1;      // 128..191 cannot start a sequence
    if (lead < 224)
        return 2;
    if (lead < 240)
        return 3;
    if (lead < 248)
        return 4;
    if (lead < 252)
        return 5;
    if (lead < 254)
        return 6;
    return -1;          // 254 and 255 are never valid
}
// Check position and cl against string length
bool poslok(string::size_type p, int l) const {
    // Reject an invalid offset or a non-positive length first, then make
    // sure the l bytes starting at p all lie inside the string.
    if (p == string::npos)
        return false;
    if (l <= 0)
        return false;
    return p + l <= s.length();
}
// Update current char length in object state. Assumes pos is inside string
inline int compute_cl() {
    // Cache the length of the character at the current offset.
    // Returns 0 on success and -1 on failure.
    cl = get_cl(pos);
    if (poslok(pos, cl))
        return 0;
    // Bad lead byte or truncated sequence: park the iterator at the end
    // of the string so that eof() becomes true.
    pos = s.length();
    cl = 0;
    return -1;
}
// Compute value at given position
inline unsigned int getvalueat(string::size_type p, int l) const {
    // Only sequence lengths 1..6 can be decoded; anything else yields
    // (unsigned int)-1.
    if (l < 1 || l > 6)
        return (unsigned int)-1;
    // Value subtracted from the lead byte for each sequence length
    // (index 0 is unused, length 1 subtracts nothing).
    static const int lead_offset[] = {0, 0, 192, 224, 240, 248, 252};
    // Accumulate most-significant part first: every following byte
    // contributes (byte - 128) and scales the running value by 64.
    int acc = (unsigned char)s[p] - lead_offset[l];
    for (int k = 1; k < l; k++)
        acc = acc * 64 + ((unsigned char)s[p + k] - 128);
    return acc;
}
public:
Utf8Iter(const string &in) Utf8Iter(const string &in)
: cl(0), s(in), pos(0), m_charpos(0) : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
{ {
// Ensure state is ok if appendchartostring is called at once compute_cl();
compute_cl(); }
}
void rewind() { void rewind()
cl=0; pos=0; m_charpos=0; {
} m_cl = 0;
/** operator* returns the ucs4 value as a machine integer*/ m_pos = 0;
unsigned int operator*() { m_charpos = 0;
if (!cl && compute_cl() < 0) m_error = false;
return (unsigned int)-1; compute_cl();
unsigned int val = getvalueat(pos, cl);
if (val == (unsigned int)-1) {
pos = s.length();
cl = 0;
}
return val;
} }
/** "Direct" access. Awfully inefficient as we skip from start or current /** "Direct" access. Awfully inefficient as we skip from start or current
* position at best. This can only be useful for a lookahead from the * position at best. This can only be useful for a lookahead from the
* current position */ * current position */
unsigned int operator[](unsigned int charpos) const { unsigned int operator[](unsigned int charpos) const
{
string::size_type mypos = 0; string::size_type mypos = 0;
unsigned int mycp = 0;; unsigned int mycp = 0;
if (charpos >= m_charpos) { if (charpos >= m_charpos) {
mypos = pos; mypos = m_pos;
mycp = m_charpos; mycp = m_charpos;
} }
while (mypos < s.length() && mycp != charpos) { int l;
mypos += get_cl(mypos); while (mypos < m_s.length() && mycp != charpos) {
l = get_cl(mypos);
if (l < 0)
return (unsigned int)-1;
mypos += l;
++mycp; ++mycp;
} }
if (mypos < s.length() && mycp == charpos) { if (mypos < m_s.length() && mycp == charpos) {
int l = get_cl(mypos); l = get_cl(mypos);
if (poslok(mypos, l)) if (poslok(mypos, l))
return getvalueat(mypos, get_cl(mypos)); return getvalueat(mypos, get_cl(mypos));
} }
return (unsigned int)-1; return (unsigned int)-1;
} }
/** Set current position before next utf-8 character */ /** Increment current position to next utf-8 char */
string::size_type operator++(int) { string::size_type operator++(int)
if (!cl && compute_cl() < 0) { {
return pos = string::npos; // Note: m_cl may be zero at eof if user's test not right
} // this shouldn't crash the program until actual data access
pos += cl; #ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
if (m_cl == 0)
return string::npos;
m_pos += m_cl;
m_charpos++; m_charpos++;
cl = 0; compute_cl();
return pos; return m_pos;
} }
/** This needs to be fast. No error checking. */
void appendchartostring(string &out) { /** operator* returns the ucs4 value as a machine integer*/
out.append(&s[pos], cl); unsigned int operator*()
{
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
return getvalueat(m_pos, m_cl);
} }
/** Append current utf-8 possibly multi-byte character to string param.
This needs to be fast. No error checking. */
unsigned int appendchartostring(string &out) {
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
out.append(&m_s[m_pos], m_cl);
return m_cl;
}
/** Return current character as string */
operator string() { operator string() {
if (!cl && compute_cl() < 0) { #ifdef UTF8ITER_CHECK
return std::string(""); assert(m_cl != 0);
} #endif
return s.substr(pos, cl); return m_s.substr(m_pos, m_cl);
} }
bool eof() { bool eof() {
// Note: we always ensure that pos == s.length() when setting bad to return m_pos == m_s.length();
// true
return pos == s.length();
} }
bool error() { bool error() {
return compute_cl() < 0; return m_error;
} }
string::size_type getBpos() const { string::size_type getBpos() const {
return pos; return m_pos;
} }
string::size_type getCpos() const { string::size_type getCpos() const {
return m_charpos; return m_charpos;
} }
private:
// String being iterated over. Held by reference, not copied.
const string& m_s;
// Byte length of the character at the current position. A value of zero
// indicates unknown or error.
unsigned int m_cl;
// Current byte offset in string.
string::size_type m_pos;
// Current character position (incremented once per operator++ step)
unsigned int m_charpos;
// Set when compute_cl() rejects the current position or getvalueat() is
// given an unsupported length; cleared by rewind() and reported by error().
// Declared mutable so that the const getvalueat() can set it.
mutable bool m_error;
// Check position and cl against string length
bool poslok(string::size_type p, int l) const {
    // True when p is a real offset, l is positive and the l bytes starting
    // at p all lie inside the string.
    const bool inside = p != string::npos && l > 0 && p + l <= m_s.length();
#ifdef UTF8ITER_CHECK
    // Checking builds abort instead of reporting the failure to the caller.
    assert(p != string::npos && l > 0 && p + l <= m_s.length());
#endif
    return inside;
}
// Update current char length in object state, minimum checking for
// errors
inline int compute_cl()
{
m_cl = 0;
if (m_pos == m_s.length())
return -1;
m_cl = get_cl(m_pos);
if (!poslok(m_pos, m_cl)) {
m_pos = m_s.length();
m_cl = 0;
m_error = true;
return -1;
}
return 0;
}
// Get character byte length at specified position
inline int get_cl(string::size_type p) const
{
    // Derive the sequence length (1 to 4) from the high bits of the byte
    // at offset p. Any other bit pattern yields -1.
    const unsigned int z = (unsigned char)m_s[p];
    if (z < 0x80)
        return 1;
    if ((z & 0xE0) == 0xC0)
        return 2;
    if ((z & 0xF0) == 0xE0)
        return 3;
    if ((z & 0xF8) == 0xF0)
        return 4;
#ifdef UTF8ITER_CHECK
    // Only reached for a byte that matched none of the patterns above, so
    // in checking builds this aborts on an invalid lead byte.
    assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
           (z & 248) == 240);
#endif
    return -1;
}
// Compute value at given position. No error checking.
inline unsigned int getvalueat(string::size_type p, int l) const
{
    // Decode the l-byte sequence starting at offset p into its integer
    // value. Byte patterns are only verified when UTF8ITER_CHECK is
    // defined. An unsupported l sets m_error and yields (unsigned int)-1.
    switch (l) {
    case 1: {
#ifdef UTF8ITER_CHECK
        assert((unsigned char)m_s[p] < 128);
#endif
        return (unsigned char)m_s[p];
    }
    case 2: {
#ifdef UTF8ITER_CHECK
        assert(
            ((unsigned char)m_s[p] & 224) == 192
            && ((unsigned char)m_s[p+1] & 192) == 128
            );
#endif
        const int b0 = (unsigned char)m_s[p] - 192;
        const int b1 = (unsigned char)m_s[p+1] - 128;
        return b0 * 64 + b1;
    }
    case 3: {
#ifdef UTF8ITER_CHECK
        assert(
            (((unsigned char)m_s[p]) & 240) == 224
            && (((unsigned char)m_s[p+1]) & 192) == 128
            && (((unsigned char)m_s[p+2]) & 192) == 128
            );
#endif
        const int b0 = (unsigned char)m_s[p] - 224;
        const int b1 = (unsigned char)m_s[p+1] - 128;
        const int b2 = (unsigned char)m_s[p+2] - 128;
        return b0 * 4096 + b1 * 64 + b2;
    }
    case 4: {
#ifdef UTF8ITER_CHECK
        assert(
            (((unsigned char)m_s[p]) & 248) == 240
            && (((unsigned char)m_s[p+1]) & 192) == 128
            && (((unsigned char)m_s[p+2]) & 192) == 128
            && (((unsigned char)m_s[p+3]) & 192) == 128
            );
#endif
        const int b0 = (unsigned char)m_s[p] - 240;
        const int b1 = (unsigned char)m_s[p+1] - 128;
        const int b2 = (unsigned char)m_s[p+2] - 128;
        const int b3 = (unsigned char)m_s[p+3] - 128;
        return b0 * 262144 + b1 * 4096 + b2 * 64 + b3;
    }
    default:
#ifdef UTF8ITER_CHECK
        assert(l <= 4);
#endif
        // m_error is mutable, so it can be flagged from this const method.
        m_error = true;
        return (unsigned int)-1;
    }
}
}; };