cleaned and speeded up utf8iter

This commit is contained in:
dockes 2006-11-20 11:16:54 +00:00
parent b3ab39522b
commit a573fbd1a9
3 changed files with 237 additions and 123 deletions

View File

@ -65,9 +65,9 @@ trwipedir.o : wipedir.cpp
$(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
wipedir.cpp
UTF8ITER_OBJS= trutf8iter.o $(BIGLIB)
UTF8ITER_OBJS= trutf8iter.o
utf8iter : $(UTF8ITER_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(BIGLIB) $(LIBICONV)
trutf8iter.o : utf8iter.cpp utf8iter.h
$(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
utf8iter.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.5 2006-11-20 11:16:54 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -22,16 +22,19 @@ static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp
#include <iostream>
#include <list>
#include <vector>
#include "debuglog.h"
#include "transcode.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */
#define UTF8ITER_CHECK
#include "utf8iter.h"
#include "readfile.h"
int main(int argc, char **argv)
{
if (argc != 3) {
@ -54,6 +57,7 @@ int main(int argc, char **argv)
fprintf(stderr, "cant create %s\n", outfile);
exit(1);
}
int nchars = 0;
for (;!it.eof(); it++) {
unsigned int value = *it;
@ -61,15 +65,24 @@ int main(int argc, char **argv)
fprintf(stderr, "Conversion error occurred\n");
exit(1);
}
// UTF-32LE or BE array
ucsout1.push_back(value);
// UTF-32LE or BE file
fwrite(&value, 4, 1, fp);
// Reconstructed utf8 strings (2 methods)
if (!it.appendchartostring(out))
break;
// conversion to string
out1 += it;
// fprintf(stderr, "%s", string(it).c_str());
nchars++;
}
fprintf(stderr, "nchars1 %d\n", nchars);
if (in != out) {
fclose(fp);
fprintf(stderr, "nchars %d\n", nchars);
if (in.compare(out)) {
fprintf(stderr, "error: out != in\n");
exit(1);
}
@ -78,6 +91,7 @@ int main(int argc, char **argv)
exit(1);
}
// Rewind and do it a second time
vector<unsigned int>ucsout2;
it.rewind();
for (int i = 0; ; i++) {
@ -95,7 +109,35 @@ int main(int argc, char **argv)
exit(1);
}
fclose(fp);
ucsout2.clear();
int ercnt;
const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
string ucs, ucs1;
for (vector<unsigned int>::iterator it = ucsout1.begin();
it != ucsout1.end(); it++) {
unsigned int i = *it;
ucs.append((const char *)&i, 4);
}
if (!transcode(ucs, ucs1,
encoding, encoding, &ercnt) || ercnt) {
fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
exit(1);
}
if (ucs.compare(ucs1)) {
fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
exit(1);
}
if (!transcode(ucs, ucs1,
encoding, "UTF-8", &ercnt) || ercnt) {
fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
ercnt);
exit(1);
}
if (ucs1.compare(in)) {
fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
exit(1);
}
exit(0);
}

View File

@ -16,158 +16,230 @@
*/
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* A small helper class to iterate over utf8 strings. This is not an
* STL iterator and this is not well designed, just convenient for
some specific uses
* STL iterator and does not much error checking. It is designed purely
* for recoll usage, where the utf-8 string comes out of iconv in most cases
* and is assumed legal. We just try to catch cases where there would be
* a risk of crash.
*/
class Utf8Iter {
unsigned int cl; // Char length at current position if known
const string &s; // String we're working with
string::size_type pos; // Current position in string
unsigned int m_charpos; // Current character posiiton
// Get character byte length at specified position
inline int get_cl(string::size_type p) const {
unsigned int z = (unsigned char)s[p];
if (z <= 127) {
return 1;
} else if (z>=192 && z <= 223) {
return 2;
} else if (z >= 224 && z <= 239) {
return 3;
} else if (z >= 240 && z <= 247) {
return 4;
} else if (z >= 248 && z <= 251) {
return 5;
} else if (z >= 252 && z <= 253) {
return 6;
}
return -1;
}
// Check position and cl against string length
bool poslok(string::size_type p, int l) const {
return p != string::npos && l > 0 && p + l <= s.length();
}
// Update current char length in object state. Assumes pos is inside string
inline int compute_cl() {
cl = 0;
cl = get_cl(pos);
if (!poslok(pos, cl)) {
pos = s.length();
cl = 0;
return -1;
}
return 0;
}
// Compute value at given position
inline unsigned int getvalueat(string::size_type p, int l) const {
switch (l) {
case 1: return (unsigned char)s[p];
case 2: return ((unsigned char)s[p] - 192) * 64 +
(unsigned char)s[p+1] - 128 ;
case 3: return ((unsigned char)s[p]-224)*4096 +
((unsigned char)s[p+1]-128)*64 +
(unsigned char)s[p+2]-128;
case 4: return ((unsigned char)s[p]-240)*262144 +
((unsigned char)s[p+1]-128)*4096 +
((unsigned char)s[p+2]-128)*64 +
(unsigned char)s[p+3]-128;
case 5: return ((unsigned char)s[p]-248)*16777216 +
((unsigned char)s[p+1]-128)*262144 +
((unsigned char)s[p+2]-128)*4096 +
((unsigned char)s[p+3]-128)*64 +
(unsigned char)s[p+4]-128;
case 6: return ((unsigned char)s[p]-252)*1073741824 +
((unsigned char)s[p+1]-128)*16777216 +
((unsigned char)s[p+2]-128)*262144 +
((unsigned char)s[p+3]-128)*4096 +
((unsigned char)s[p+4]-128)*64 +
(unsigned char)s[p+5]-128;
default:
return (unsigned int)-1;
}
}
public:
public:
Utf8Iter(const string &in)
: cl(0), s(in), pos(0), m_charpos(0)
{
// Ensure state is ok if appendchartostring is called at once
compute_cl();
}
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
{
compute_cl();
}
void rewind() {
cl=0; pos=0; m_charpos=0;
}
/** operator* returns the ucs4 value as a machine integer*/
unsigned int operator*() {
if (!cl && compute_cl() < 0)
return (unsigned int)-1;
unsigned int val = getvalueat(pos, cl);
if (val == (unsigned int)-1) {
pos = s.length();
cl = 0;
}
return val;
void rewind()
{
m_cl = 0;
m_pos = 0;
m_charpos = 0;
m_error = false;
compute_cl();
}
/** "Direct" access. Awfully inefficient as we skip from start or current
* position at best. This can only be useful for a lookahead from the
* current position */
unsigned int operator[](unsigned int charpos) const {
unsigned int operator[](unsigned int charpos) const
{
string::size_type mypos = 0;
unsigned int mycp = 0;;
unsigned int mycp = 0;
if (charpos >= m_charpos) {
mypos = pos;
mypos = m_pos;
mycp = m_charpos;
}
while (mypos < s.length() && mycp != charpos) {
mypos += get_cl(mypos);
int l;
while (mypos < m_s.length() && mycp != charpos) {
l = get_cl(mypos);
if (l < 0)
return (unsigned int)-1;
mypos += l;
++mycp;
}
if (mypos < s.length() && mycp == charpos) {
int l = get_cl(mypos);
if (mypos < m_s.length() && mycp == charpos) {
l = get_cl(mypos);
if (poslok(mypos, l))
return getvalueat(mypos, get_cl(mypos));
}
return (unsigned int)-1;
}
/** Set current position before next utf-8 character */
string::size_type operator++(int) {
if (!cl && compute_cl() < 0) {
return pos = string::npos;
}
pos += cl;
/** Increment current position to next utf-8 char */
string::size_type operator++(int)
{
// Note: m_cl may be zero at eof if user's test not right
// this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
if (m_cl == 0)
return string::npos;
m_pos += m_cl;
m_charpos++;
cl = 0;
return pos;
compute_cl();
return m_pos;
}
/** This needs to be fast. No error checking. */
void appendchartostring(string &out) {
out.append(&s[pos], cl);
/** operator* returns the ucs4 value as a machine integer*/
unsigned int operator*()
{
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
return getvalueat(m_pos, m_cl);
}
/** Append current utf-8 possibly multi-byte character to string param.
This needs to be fast. No error checking. */
unsigned int appendchartostring(string &out) {
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
out.append(&m_s[m_pos], m_cl);
return m_cl;
}
/** Return current character as string */
operator string() {
if (!cl && compute_cl() < 0) {
return std::string("");
}
return s.substr(pos, cl);
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif
return m_s.substr(m_pos, m_cl);
}
bool eof() {
// Note: we always ensure that pos == s.length() when setting bad to
// true
return pos == s.length();
return m_pos == m_s.length();
}
bool error() {
return compute_cl() < 0;
return m_error;
}
string::size_type getBpos() const {
return pos;
return m_pos;
}
string::size_type getCpos() const {
return m_charpos;
}
private:
// String we're working with
const string& m_s;
// Character length at current position. A value of zero indicates
// unknown or error.
unsigned int m_cl;
// Current byte offset in string.
string::size_type m_pos;
// Current character position
unsigned int m_charpos;
mutable bool m_error;
// Check position and cl against string length
bool poslok(string::size_type p, int l) const {
#ifdef UTF8ITER_CHECK
assert(p != string::npos && l > 0 && p + l <= m_s.length());
#endif
return p != string::npos && l > 0 && p + l <= m_s.length();
}
// Update current char length in object state, minimum checking for
// errors
inline int compute_cl()
{
m_cl = 0;
if (m_pos == m_s.length())
return -1;
m_cl = get_cl(m_pos);
if (!poslok(m_pos, m_cl)) {
m_pos = m_s.length();
m_cl = 0;
m_error = true;
return -1;
}
return 0;
}
// Get character byte length at specified position
inline int get_cl(string::size_type p) const
{
unsigned int z = (unsigned char)m_s[p];
if (z <= 127) {
return 1;
} else if ((z & 224) == 192) {
return 2;
} else if ((z & 240) == 224) {
return 3;
} else if ((z & 248) == 240) {
return 4;
}
#ifdef UTF8ITER_CHECK
assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
(z & 248) == 240);
#endif
return -1;
}
// Compute value at given position. No error checking.
inline unsigned int getvalueat(string::size_type p, int l) const
{
switch (l) {
case 1:
#ifdef UTF8ITER_CHECK
assert((unsigned char)m_s[p] < 128);
#endif
return (unsigned char)m_s[p];
case 2:
#ifdef UTF8ITER_CHECK
assert(
((unsigned char)m_s[p] & 224) == 192
&& ((unsigned char)m_s[p+1] & 192) == 128
);
#endif
return ((unsigned char)m_s[p] - 192) * 64 +
(unsigned char)m_s[p+1] - 128 ;
case 3:
#ifdef UTF8ITER_CHECK
assert(
(((unsigned char)m_s[p]) & 240) == 224
&& (((unsigned char)m_s[p+1]) & 192) == 128
&& (((unsigned char)m_s[p+2]) & 192) == 128
);
#endif
return ((unsigned char)m_s[p] - 224) * 4096 +
((unsigned char)m_s[p+1] - 128) * 64 +
(unsigned char)m_s[p+2] - 128;
case 4:
#ifdef UTF8ITER_CHECK
assert(
(((unsigned char)m_s[p]) & 248) == 240
&& (((unsigned char)m_s[p+1]) & 192) == 128
&& (((unsigned char)m_s[p+2]) & 192) == 128
&& (((unsigned char)m_s[p+3]) & 192) == 128
);
#endif
return ((unsigned char)m_s[p]-240)*262144 +
((unsigned char)m_s[p+1]-128)*4096 +
((unsigned char)m_s[p+2]-128)*64 +
(unsigned char)m_s[p+3]-128;
default:
#ifdef UTF8ITER_CHECK
assert(l <= 4);
#endif
m_error = true;
return (unsigned int)-1;
}
}
};