cleaned and speeded up utf8iter
This commit is contained in:
parent
b3ab39522b
commit
a573fbd1a9
@ -65,9 +65,9 @@ trwipedir.o : wipedir.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
|
$(CXX) $(ALL_CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
|
||||||
wipedir.cpp
|
wipedir.cpp
|
||||||
|
|
||||||
UTF8ITER_OBJS= trutf8iter.o $(BIGLIB)
|
UTF8ITER_OBJS= trutf8iter.o
|
||||||
utf8iter : $(UTF8ITER_OBJS)
|
utf8iter : $(UTF8ITER_OBJS)
|
||||||
$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(LIBICONV)
|
$(CXX) $(ALL_CXXFLAGS) -o utf8iter $(UTF8ITER_OBJS) $(BIGLIB) $(LIBICONV)
|
||||||
trutf8iter.o : utf8iter.cpp utf8iter.h
|
trutf8iter.o : utf8iter.cpp utf8iter.h
|
||||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
|
$(CXX) $(ALL_CXXFLAGS) -DTEST_UTF8ITER -c -o trutf8iter.o \
|
||||||
utf8iter.cpp
|
utf8iter.cpp
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.5 2006-11-20 11:16:54 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -22,16 +22,19 @@ static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.4 2006-01-23 13:32:28 dockes Exp
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "transcode.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using namespace std;
|
using namespace std;
|
||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
|
||||||
|
#define UTF8ITER_CHECK
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
if (argc != 3) {
|
if (argc != 3) {
|
||||||
@ -54,6 +57,7 @@ int main(int argc, char **argv)
|
|||||||
fprintf(stderr, "cant create %s\n", outfile);
|
fprintf(stderr, "cant create %s\n", outfile);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int nchars = 0;
|
int nchars = 0;
|
||||||
for (;!it.eof(); it++) {
|
for (;!it.eof(); it++) {
|
||||||
unsigned int value = *it;
|
unsigned int value = *it;
|
||||||
@ -61,15 +65,24 @@ int main(int argc, char **argv)
|
|||||||
fprintf(stderr, "Conversion error occurred\n");
|
fprintf(stderr, "Conversion error occurred\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
// UTF-32LE or BE array
|
||||||
ucsout1.push_back(value);
|
ucsout1.push_back(value);
|
||||||
|
// UTF-32LE or BE file
|
||||||
fwrite(&value, 4, 1, fp);
|
fwrite(&value, 4, 1, fp);
|
||||||
|
|
||||||
|
// Reconstructed utf8 strings (2 methods)
|
||||||
if (!it.appendchartostring(out))
|
if (!it.appendchartostring(out))
|
||||||
break;
|
break;
|
||||||
|
// conversion to string
|
||||||
out1 += it;
|
out1 += it;
|
||||||
|
|
||||||
|
// fprintf(stderr, "%s", string(it).c_str());
|
||||||
nchars++;
|
nchars++;
|
||||||
}
|
}
|
||||||
fprintf(stderr, "nchars1 %d\n", nchars);
|
fclose(fp);
|
||||||
if (in != out) {
|
|
||||||
|
fprintf(stderr, "nchars %d\n", nchars);
|
||||||
|
if (in.compare(out)) {
|
||||||
fprintf(stderr, "error: out != in\n");
|
fprintf(stderr, "error: out != in\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -78,6 +91,7 @@ int main(int argc, char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rewind and do it a second time
|
||||||
vector<unsigned int>ucsout2;
|
vector<unsigned int>ucsout2;
|
||||||
it.rewind();
|
it.rewind();
|
||||||
for (int i = 0; ; i++) {
|
for (int i = 0; ; i++) {
|
||||||
@ -95,7 +109,35 @@ int main(int argc, char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
fclose(fp);
|
ucsout2.clear();
|
||||||
|
int ercnt;
|
||||||
|
const char *encoding = "UTF-32LE"; // note: use UTF-32BE on a big-endian machine
|
||||||
|
string ucs, ucs1;
|
||||||
|
for (vector<unsigned int>::iterator it = ucsout1.begin();
|
||||||
|
it != ucsout1.end(); it++) {
|
||||||
|
unsigned int i = *it;
|
||||||
|
ucs.append((const char *)&i, 4);
|
||||||
|
}
|
||||||
|
if (!transcode(ucs, ucs1,
|
||||||
|
encoding, encoding, &ercnt) || ercnt) {
|
||||||
|
fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ucs.compare(ucs1)) {
|
||||||
|
fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!transcode(ucs, ucs1,
|
||||||
|
encoding, "UTF-8", &ercnt) || ercnt) {
|
||||||
|
fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
|
||||||
|
ercnt);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ucs1.compare(in)) {
|
||||||
|
fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -16,158 +16,230 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _UTF8ITER_H_INCLUDED_
|
#ifndef _UTF8ITER_H_INCLUDED_
|
||||||
#define _UTF8ITER_H_INCLUDED_
|
#define _UTF8ITER_H_INCLUDED_
|
||||||
/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A small helper class to iterate over utf8 strings. This is not an
|
* A small helper class to iterate over utf8 strings. This is not an
|
||||||
* STL iterator and this is not well designed, just convenient for
|
* STL iterator and does not do much error checking. It is designed purely
|
||||||
some specific uses
|
* for recoll usage, where the utf-8 string comes out of iconv in most cases
|
||||||
|
* and is assumed legal. We just try to catch cases where there would be
|
||||||
|
* a risk of crash.
|
||||||
*/
|
*/
|
||||||
class Utf8Iter {
|
class Utf8Iter {
|
||||||
unsigned int cl; // Char length at current position if known
|
public:
|
||||||
const string &s; // String we're working with
|
|
||||||
string::size_type pos; // Current position in string
|
|
||||||
unsigned int m_charpos; // Current character position
|
|
||||||
|
|
||||||
// Get character byte length at specified position
|
|
||||||
inline int get_cl(string::size_type p) const {
|
|
||||||
unsigned int z = (unsigned char)s[p];
|
|
||||||
if (z <= 127) {
|
|
||||||
return 1;
|
|
||||||
} else if (z>=192 && z <= 223) {
|
|
||||||
return 2;
|
|
||||||
} else if (z >= 224 && z <= 239) {
|
|
||||||
return 3;
|
|
||||||
} else if (z >= 240 && z <= 247) {
|
|
||||||
return 4;
|
|
||||||
} else if (z >= 248 && z <= 251) {
|
|
||||||
return 5;
|
|
||||||
} else if (z >= 252 && z <= 253) {
|
|
||||||
return 6;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
// Check position and cl against string length
|
|
||||||
bool poslok(string::size_type p, int l) const {
|
|
||||||
return p != string::npos && l > 0 && p + l <= s.length();
|
|
||||||
}
|
|
||||||
// Update current char length in object state. Assumes pos is inside string
|
|
||||||
inline int compute_cl() {
|
|
||||||
cl = 0;
|
|
||||||
cl = get_cl(pos);
|
|
||||||
if (!poslok(pos, cl)) {
|
|
||||||
pos = s.length();
|
|
||||||
cl = 0;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
// Compute value at given position
|
|
||||||
inline unsigned int getvalueat(string::size_type p, int l) const {
|
|
||||||
switch (l) {
|
|
||||||
case 1: return (unsigned char)s[p];
|
|
||||||
case 2: return ((unsigned char)s[p] - 192) * 64 +
|
|
||||||
(unsigned char)s[p+1] - 128 ;
|
|
||||||
case 3: return ((unsigned char)s[p]-224)*4096 +
|
|
||||||
((unsigned char)s[p+1]-128)*64 +
|
|
||||||
(unsigned char)s[p+2]-128;
|
|
||||||
case 4: return ((unsigned char)s[p]-240)*262144 +
|
|
||||||
((unsigned char)s[p+1]-128)*4096 +
|
|
||||||
((unsigned char)s[p+2]-128)*64 +
|
|
||||||
(unsigned char)s[p+3]-128;
|
|
||||||
case 5: return ((unsigned char)s[p]-248)*16777216 +
|
|
||||||
((unsigned char)s[p+1]-128)*262144 +
|
|
||||||
((unsigned char)s[p+2]-128)*4096 +
|
|
||||||
((unsigned char)s[p+3]-128)*64 +
|
|
||||||
(unsigned char)s[p+4]-128;
|
|
||||||
case 6: return ((unsigned char)s[p]-252)*1073741824 +
|
|
||||||
((unsigned char)s[p+1]-128)*16777216 +
|
|
||||||
((unsigned char)s[p+2]-128)*262144 +
|
|
||||||
((unsigned char)s[p+3]-128)*4096 +
|
|
||||||
((unsigned char)s[p+4]-128)*64 +
|
|
||||||
(unsigned char)s[p+5]-128;
|
|
||||||
default:
|
|
||||||
return (unsigned int)-1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
public:
|
|
||||||
Utf8Iter(const string &in)
|
Utf8Iter(const string &in)
|
||||||
: cl(0), s(in), pos(0), m_charpos(0)
|
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
||||||
{
|
{
|
||||||
// Ensure state is ok if appendchartostring is called at once
|
compute_cl();
|
||||||
compute_cl();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
void rewind() {
|
void rewind()
|
||||||
cl=0; pos=0; m_charpos=0;
|
{
|
||||||
}
|
m_cl = 0;
|
||||||
/** operator* returns the ucs4 value as a machine integer*/
|
m_pos = 0;
|
||||||
unsigned int operator*() {
|
m_charpos = 0;
|
||||||
if (!cl && compute_cl() < 0)
|
m_error = false;
|
||||||
return (unsigned int)-1;
|
compute_cl();
|
||||||
unsigned int val = getvalueat(pos, cl);
|
|
||||||
if (val == (unsigned int)-1) {
|
|
||||||
pos = s.length();
|
|
||||||
cl = 0;
|
|
||||||
}
|
|
||||||
return val;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** "Direct" access. Awfully inefficient as we skip from start or current
|
/** "Direct" access. Awfully inefficient as we skip from start or current
|
||||||
* position at best. This can only be useful for a lookahead from the
|
* position at best. This can only be useful for a lookahead from the
|
||||||
* current position */
|
* current position */
|
||||||
unsigned int operator[](unsigned int charpos) const {
|
unsigned int operator[](unsigned int charpos) const
|
||||||
|
{
|
||||||
string::size_type mypos = 0;
|
string::size_type mypos = 0;
|
||||||
unsigned int mycp = 0;;
|
unsigned int mycp = 0;
|
||||||
if (charpos >= m_charpos) {
|
if (charpos >= m_charpos) {
|
||||||
mypos = pos;
|
mypos = m_pos;
|
||||||
mycp = m_charpos;
|
mycp = m_charpos;
|
||||||
}
|
}
|
||||||
while (mypos < s.length() && mycp != charpos) {
|
int l;
|
||||||
mypos += get_cl(mypos);
|
while (mypos < m_s.length() && mycp != charpos) {
|
||||||
|
l = get_cl(mypos);
|
||||||
|
if (l < 0)
|
||||||
|
return (unsigned int)-1;
|
||||||
|
mypos += l;
|
||||||
++mycp;
|
++mycp;
|
||||||
}
|
}
|
||||||
if (mypos < s.length() && mycp == charpos) {
|
if (mypos < m_s.length() && mycp == charpos) {
|
||||||
int l = get_cl(mypos);
|
l = get_cl(mypos);
|
||||||
if (poslok(mypos, l))
|
if (poslok(mypos, l))
|
||||||
return getvalueat(mypos, get_cl(mypos));
|
return getvalueat(mypos, get_cl(mypos));
|
||||||
}
|
}
|
||||||
return (unsigned int)-1;
|
return (unsigned int)-1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Set current position before next utf-8 character */
|
/** Increment current position to next utf-8 char */
|
||||||
string::size_type operator++(int) {
|
string::size_type operator++(int)
|
||||||
if (!cl && compute_cl() < 0) {
|
{
|
||||||
return pos = string::npos;
|
// Note: m_cl may be zero at eof if the caller's eof() test is wrong;
|
||||||
}
|
// this shouldn't crash the program until actual data access
|
||||||
pos += cl;
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(m_cl != 0);
|
||||||
|
#endif
|
||||||
|
if (m_cl == 0)
|
||||||
|
return string::npos;
|
||||||
|
|
||||||
|
m_pos += m_cl;
|
||||||
m_charpos++;
|
m_charpos++;
|
||||||
cl = 0;
|
compute_cl();
|
||||||
return pos;
|
return m_pos;
|
||||||
}
|
}
|
||||||
/** This needs to be fast. No error checking. */
|
|
||||||
void appendchartostring(string &out) {
|
/** operator* returns the ucs4 value as a machine integer*/
|
||||||
out.append(&s[pos], cl);
|
unsigned int operator*()
|
||||||
|
{
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(m_cl != 0);
|
||||||
|
#endif
|
||||||
|
return getvalueat(m_pos, m_cl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Append current utf-8 possibly multi-byte character to string param.
|
||||||
|
This needs to be fast. No error checking. */
|
||||||
|
unsigned int appendchartostring(string &out) {
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(m_cl != 0);
|
||||||
|
#endif
|
||||||
|
out.append(&m_s[m_pos], m_cl);
|
||||||
|
return m_cl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return current character as string */
|
||||||
operator string() {
|
operator string() {
|
||||||
if (!cl && compute_cl() < 0) {
|
#ifdef UTF8ITER_CHECK
|
||||||
return std::string("");
|
assert(m_cl != 0);
|
||||||
}
|
#endif
|
||||||
return s.substr(pos, cl);
|
return m_s.substr(m_pos, m_cl);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool eof() {
|
bool eof() {
|
||||||
// Note: we always ensure that pos == s.length() when setting bad to
|
return m_pos == m_s.length();
|
||||||
// true
|
|
||||||
return pos == s.length();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool error() {
|
bool error() {
|
||||||
return compute_cl() < 0;
|
return m_error;
|
||||||
}
|
}
|
||||||
|
|
||||||
string::size_type getBpos() const {
|
string::size_type getBpos() const {
|
||||||
return pos;
|
return m_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
string::size_type getCpos() const {
|
string::size_type getCpos() const {
|
||||||
return m_charpos;
|
return m_charpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// String we're working with
|
||||||
|
const string& m_s;
|
||||||
|
// Character length at current position. A value of zero indicates
|
||||||
|
// unknown or error.
|
||||||
|
unsigned int m_cl;
|
||||||
|
// Current byte offset in string.
|
||||||
|
string::size_type m_pos;
|
||||||
|
// Current character position
|
||||||
|
unsigned int m_charpos;
|
||||||
|
mutable bool m_error;
|
||||||
|
|
||||||
|
// Check position and cl against string length
|
||||||
|
bool poslok(string::size_type p, int l) const {
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(p != string::npos && l > 0 && p + l <= m_s.length());
|
||||||
|
#endif
|
||||||
|
return p != string::npos && l > 0 && p + l <= m_s.length();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update current char length in object state, minimum checking for
|
||||||
|
// errors
|
||||||
|
inline int compute_cl()
|
||||||
|
{
|
||||||
|
m_cl = 0;
|
||||||
|
if (m_pos == m_s.length())
|
||||||
|
return -1;
|
||||||
|
m_cl = get_cl(m_pos);
|
||||||
|
if (!poslok(m_pos, m_cl)) {
|
||||||
|
m_pos = m_s.length();
|
||||||
|
m_cl = 0;
|
||||||
|
m_error = true;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get character byte length at specified position
|
||||||
|
inline int get_cl(string::size_type p) const
|
||||||
|
{
|
||||||
|
unsigned int z = (unsigned char)m_s[p];
|
||||||
|
if (z <= 127) {
|
||||||
|
return 1;
|
||||||
|
} else if ((z & 224) == 192) {
|
||||||
|
return 2;
|
||||||
|
} else if ((z & 240) == 224) {
|
||||||
|
return 3;
|
||||||
|
} else if ((z & 248) == 240) {
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
|
||||||
|
(z & 248) == 240);
|
||||||
|
#endif
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute value at given position. No error checking.
|
||||||
|
inline unsigned int getvalueat(string::size_type p, int l) const
|
||||||
|
{
|
||||||
|
switch (l) {
|
||||||
|
case 1:
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert((unsigned char)m_s[p] < 128);
|
||||||
|
#endif
|
||||||
|
return (unsigned char)m_s[p];
|
||||||
|
case 2:
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(
|
||||||
|
((unsigned char)m_s[p] & 224) == 192
|
||||||
|
&& ((unsigned char)m_s[p+1] & 192) == 128
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
return ((unsigned char)m_s[p] - 192) * 64 +
|
||||||
|
(unsigned char)m_s[p+1] - 128 ;
|
||||||
|
case 3:
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(
|
||||||
|
(((unsigned char)m_s[p]) & 240) == 224
|
||||||
|
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
||||||
|
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ((unsigned char)m_s[p] - 224) * 4096 +
|
||||||
|
((unsigned char)m_s[p+1] - 128) * 64 +
|
||||||
|
(unsigned char)m_s[p+2] - 128;
|
||||||
|
case 4:
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(
|
||||||
|
(((unsigned char)m_s[p]) & 248) == 240
|
||||||
|
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
||||||
|
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
||||||
|
&& (((unsigned char)m_s[p+3]) & 192) == 128
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ((unsigned char)m_s[p]-240)*262144 +
|
||||||
|
((unsigned char)m_s[p+1]-128)*4096 +
|
||||||
|
((unsigned char)m_s[p+2]-128)*64 +
|
||||||
|
(unsigned char)m_s[p+3]-128;
|
||||||
|
|
||||||
|
default:
|
||||||
|
#ifdef UTF8ITER_CHECK
|
||||||
|
assert(l <= 4);
|
||||||
|
#endif
|
||||||
|
m_error = true;
|
||||||
|
return (unsigned int)-1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user