improved word extraction a bit (unicode punctuation)

This commit is contained in:
dockes 2005-02-11 11:20:02 +00:00
parent 40a5905b15
commit d42db8b65d
6 changed files with 583 additions and 76 deletions

View File

@ -1,13 +1,15 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_TEXTSPLIT
#include <iostream>
#include <string>
#include <set>
#include "textsplit.h"
#include "debuglog.h"
#include "utf8iter.h"
#include "uproplist.h"
using namespace std;
@ -37,6 +39,8 @@ using namespace std;
// once.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
static int charclasses[256];
static set<unsigned int> unicign;
static void setcharclasses()
{
static int init = 0;
@ -67,6 +71,8 @@ static void setcharclasses()
init = 1;
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
// Load the Unicode separator table into the lookup set. uniign is an array
// of unsigned int, so the element count is sizeof(array)/sizeof(element);
// using sizeof(uniign) alone would iterate over the byte count and read
// past the end of the table.
for (i = 0; i < sizeof(uniign) / sizeof(uniign[0]); i++)
    unicign.insert(uniign[i]);
}
// Do some cleanup (the kind which is simpler to do here than in the main loop,
@ -152,6 +158,22 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
return true;
}
// Return the character class used by the splitter for code point c.
// Values up to 127 are looked up in the charclasses table. Above that, the
// (unsigned int)-1 marker and any value present in unicign are SPACE;
// everything else is LETTER.
static inline int whatcc(unsigned int c)
{
    if (c <= 127)
        return charclasses[c];
    const bool separator =
        c == (unsigned int)-1 || unicign.find(c) != unicign.end();
    return separator ? SPACE : LETTER;
}
/**
* Splitting a text into terms to be indexed.
* We basically emit a word every time we see a separator, but some chars are
@ -167,16 +189,21 @@ bool TextSplit::text_to_words(const string &in)
bool number = false;
int wordpos = 0;
int spanpos = 0;
unsigned int i;
int charpos = 0;
Utf8Iter it(in);
for (i = 0; i < in.length(); i++) {
int c = in[i];
int cc = charclasses[c];
for (; !it.eof(); it++, charpos++) {
unsigned int c = *it;
if (c == (unsigned int)-1) {
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
return false;
}
int cc = whatcc(c);
switch (cc) {
case SPACE:
SPACE:
if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, true, i))
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
return false;
number = false;
}
@ -186,56 +213,57 @@ bool TextSplit::text_to_words(const string &in)
case '-':
case '+':
if (word.length() == 0) {
if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
if (whatcc(it[charpos+1]) == DIGIT) {
number = true;
word += c;
span += c;
word += it;
span += it;
}
} else {
if (!doemit(word, wordpos, span, spanpos, false, i))
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
return false;
number = false;
span += c;
span += it;
}
break;
case '@':
if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, i))
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
return false;
number = false;
} else
word += c;
span += c;
word += it;
span += it;
break;
case '\'':
if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, i))
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
return false;
number = false;
span += c;
span += it;
}
break;
case '.':
if (number) {
word += c;
word += it;
} else {
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, i))
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
return false;
number = false;
} else
word += c;
word += it;
}
span += c;
span += it;
break;
case '#':
// Keep it only at end of word...
if (word.length() > 0 &&
(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE ||
in[i+1] == '\n' || in[i+1] == '\r')) {
word += c;
span += c;
(whatcc(it[charpos+1]) == SPACE ||
whatcc(it[charpos+1]) == '\n' ||
whatcc(it[charpos+1]) == '\r')) {
word += it;
span += it;
}
break;
@ -261,13 +289,13 @@ bool TextSplit::text_to_words(const string &in)
else
number = false;
}
word += (char)c;
span += (char)c;
word += it;
span += it;
break;
}
}
if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, true, i))
if (span.length()) {
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
return false;
}
return true;
@ -306,7 +334,8 @@ static string teststring =
"192.168.4.1 "
"one\n\rtwo\nthree-\nfour "
"[olala][ululu] "
"'o'brien' "
"'o'brien' "
"utf-8 ucs-4©"
"\n"
;

168
src/common/uproplist.h Normal file
View File

@ -0,0 +1,168 @@
#ifndef _PROPLIST_H_INCLUDED_
#define _PROPLIST_H_INCLUDED_
/* @(#$Id: uproplist.h,v 1.1 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
/*
* A subset of Unicode characters that we treat as whitespace when splitting
* text into words.
* This is a quick fix on top of the ASCII-based code, and is not correct.
* The correct way would be to do what http://www.unicode.org/reports/tr29/
* says. We should then convert to UCS-4 first, and then strictly use
* character properties, which might actually be simpler than the current
* solution...
*
* From:
# PropList-4.0.1.txt
# Date: 2004-03-02, 02:42:40 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2004 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see UCD.html
*/
/*
 * Code points listed here are grouped by the PropList property they were
 * taken from (White_Space, Dash, Hyphen, Quotation_Mark,
 * Terminal_Punctuation, STerm), plus a few miscellaneous signs. Some values
 * appear under more than one property; the repetition is kept so that each
 * group stays complete on its own.
 */
static const unsigned int uniign[] = {
    0x0085, /* ; White_Space # Cc <control-0085>*/
    0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/
    0x00A1, /* misc signs, bullet etc... */
    0x00A2,
    0x00A3,
    0x00A4,
    0x00A5,
    0x00A6,
    0x00A9, /* copyright sign */
    0x00AA,
    0x00AE, /* registered sign */
    0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/
    0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/
    0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2001, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2002, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2003, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2004, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2005, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2006, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2007, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2008, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2009, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x200A, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2028, /* ; White_Space # Zl LINE SEPARATOR*/
    0x2029, /* ; White_Space # Zp PARAGRAPH SEPARATOR*/
    0x202F, /* ; White_Space # Zs NARROW NO-BREAK SPACE*/
    0x205F, /* ; White_Space # Zs MEDIUM MATHEMATICAL SPACE*/
    0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/
    0x002D, /* ; Dash # Pd HYPHEN-MINUS*/
    0x058A, /* ; Dash # Pd ARMENIAN HYPHEN*/
    0x1806, /* ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN*/
    0x2010, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2011, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2012, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2013, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2014, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2015, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2053, /* ; Dash # Po SWUNG DASH*/
    0x207B, /* ; Dash # Sm SUPERSCRIPT MINUS*/
    0x208B, /* ; Dash # Sm SUBSCRIPT MINUS*/
    0x2212, /* ; Dash # Sm MINUS SIGN*/
    0x301C, /* ; Dash # Pd WAVE DASH*/
    0x3030, /* ; Dash # Pd WAVY DASH*/
    0xFE31, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EM DASH*/
    0xFE32, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EN DASH*/
    0xFE58, /* ; Dash # Pd SMALL EM DASH*/
    0xFE63, /* ; Dash # Pd SMALL HYPHEN-MINUS*/
    0xFF0D, /* ; Dash # Pd FULLWIDTH HYPHEN-MINUS*/
    0x00AD, /* ; Hyphen # Cf SOFT HYPHEN*/
    0x058A, /* ; Hyphen # Pd ARMENIAN HYPHEN*/
    0x1806, /* ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN*/
    0x2010, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
    0x2011, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
    0x30FB, /* ; Hyphen # Pc KATAKANA MIDDLE DOT*/
    0xFE63, /* ; Hyphen # Pd SMALL HYPHEN-MINUS*/
    0xFF0D, /* ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS*/
    0xFF65, /* ; Hyphen # Pc HALFWIDTH KATAKANA MIDDLE DOT*/
    0x00AB, /* ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
    0x00BB, /* ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK*/
    0x2018, /* ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK*/
    0x2019, /* ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK*/
    0x201A, /* ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK*/
    0x201B, /* ; Quotation_Mark # Pi SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
    0x201C, /* ; Quotation_Mark # Pi LEFT DOUBLE QUOTATION MARK*/
    0x201D, /* ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK*/
    0x201E, /* ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK*/
    0x201F, /* ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
    0x2039, /* ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK*/
    0x203A, /* ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
    0x300C, /* ; Quotation_Mark # Ps LEFT CORNER BRACKET*/
    0x300D, /* ; Quotation_Mark # Pe RIGHT CORNER BRACKET*/
    0x300E, /* ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET*/
    0x300F, /* ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET*/
    0x301D, /* ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK*/
    0x301E, /* ; Quotation_Mark # Pe DOUBLE PRIME QUOTATION MARK*/
    /* U+301F, not a second U+301E: the property range is 301E..301F */
    0x301F, /* ; Quotation_Mark # Pe LOW DOUBLE PRIME QUOTATION MARK*/
    0xFE41, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/
    0xFE42, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/
    0xFE43, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/
    0xFE44, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/
    0xFF02, /* ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK*/
    0xFF07, /* ; Quotation_Mark # Po FULLWIDTH APOSTROPHE*/
    0xFF62, /* ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET*/
    0xFF63, /* ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET*/
    0x0021, /* ; Terminal_Punctuation # Po EXCLAMATION MARK*/
    0x002C, /* ; Terminal_Punctuation # Po COMMA*/
    0x002E, /* ; Terminal_Punctuation # Po FULL STOP*/
    0x003A, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
    0x003B, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
    0x003F, /* ; Terminal_Punctuation # Po QUESTION MARK*/
    0x037E, /* ; Terminal_Punctuation # Po GREEK QUESTION MARK*/
    0x0387, /* ; Terminal_Punctuation # Po GREEK ANO TELEIA*/
    0x0589, /* ; Terminal_Punctuation # Po ARMENIAN FULL STOP*/
    0x05C3, /* ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ*/
    0x060C, /* ; Terminal_Punctuation # Po ARABIC COMMA*/
    0x061B, /* ; Terminal_Punctuation # Po ARABIC SEMICOLON*/
    0x061F, /* ; Terminal_Punctuation # Po ARABIC QUESTION MARK*/
    0x06D4, /* ; Terminal_Punctuation # Po ARABIC FULL STOP*/
    0x2047, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2048, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2049, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0xFE50, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE51, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE52, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE54, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE55, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE56, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE57, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFF01, /* ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK*/
    0xFF0C, /* ; Terminal_Punctuation # Po FULLWIDTH COMMA*/
    0xFF0E, /* ; Terminal_Punctuation # Po FULLWIDTH FULL STOP*/
    0xFF1A, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
    0xFF1B, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
    0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/
    0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
    0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/
    0x0021, /* ; STerm # Po EXCLAMATION MARK*/
    0x002E, /* ; STerm # Po FULL STOP*/
    0x003F, /* ; STerm # Po QUESTION MARK*/
    0x055C, /* ; STerm # Po ARMENIAN EXCLAMATION MARK*/
    0x055E, /* ; STerm # Po ARMENIAN QUESTION MARK*/
    0x0589, /* ; STerm # Po ARMENIAN FULL STOP*/
    0x061F, /* ; STerm # Po ARABIC QUESTION MARK*/
    0x06D4, /* ; STerm # Po ARABIC FULL STOP*/
    0x166E, /* ; STerm # Po CANADIAN SYLLABICS FULL STOP*/
    0x1803, /* ; STerm # Po MONGOLIAN FULL STOP*/
    0x1809, /* ; STerm # Po MONGOLIAN MANCHU FULL STOP*/
    0x203C, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
    0x203D, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
    0x2047, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2048, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2049, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/
    0xFE52, /* ; STerm # Po SMALL FULL STOP*/
    0xFE56, /* ; STerm # Po SMALL QUESTION MARK*/
    0xFE57, /* ; STerm # Po SMALL EXCLAMATION MARK*/
    0xFF01, /* ; STerm # Po FULLWIDTH EXCLAMATION MARK*/
    0xFF0E, /* ; STerm # Po FULLWIDTH FULL STOP*/
    0xFF1F, /* ; STerm # Po FULLWIDTH QUESTION MARK*/
    0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
};
#endif /* _PROPLIST_H_INCLUDED_ */

View File

@ -35,7 +35,7 @@ trtranscode.o : ../utils/transcode.cpp
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
trmimeparse : $(MIMEPARSE_OBJS)
$(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) $(LIBICONV)
$(CXX) $(CXXFLAGS) -o trmimeparse $(MIMEPARSE_OBJS) $(LIBICONV)
trmimeparse.o : mimeparse.cpp
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
mimeparse.cpp

View File

@ -1,10 +1,11 @@
#ifndef lint
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
#include <string>
#include <iostream>
#include <list>
#include <vector>
#include "debuglog.h"
using namespace std;
@ -22,32 +23,61 @@ int main(int argc, char **argv)
const char *infile = argv[1];
const char *outfile = argv[2];
string in;
string out;
if (!file_to_string(infile, in)) {
cerr << "Cant read file\n" << endl;
exit(1);
}
vector<unsigned int>ucsout1;
string out, out1;
Utf8Iter it(in);
FILE *fp = fopen(outfile, "w");
if (fp == 0) {
fprintf(stderr, "cant create %s\n", outfile);
exit(1);
}
while (!it.eof()) {
int nchars = 0;
for (;!it.eof(); it++) {
unsigned int value = *it;
it.appendchartostring(out);
it++;
if (value == (unsigned int)-1) {
fprintf(stderr, "Conversion error occurred\n");
exit(1);
}
ucsout1.push_back(value);
fwrite(&value, 4, 1, fp);
if (!it.appendchartostring(out))
break;
out1 += it;
nchars++;
}
fclose(fp);
if (it.error()) {
fprintf(stderr, "Conversion error occurred\n");
exit(1);
}
fprintf(stderr, "nchars1 %d\n", nchars);
if (in != out) {
fprintf(stderr, "error: out != in\n");
exit(1);
}
if (in != out1) {
fprintf(stderr, "error: out1 != in\n");
exit(1);
}
vector<unsigned int>ucsout2;
it.rewind();
for (int i = 0; ; i++) {
unsigned int value;
if ((value = it[i]) == (unsigned int)-1) {
fprintf(stderr, "%d chars\n", i);
break;
}
it++;
ucsout2.push_back(value);
}
if (ucsout1 != ucsout2) {
fprintf(stderr, "error: ucsout1 != ucsout2\n");
exit(1);
}
fclose(fp);
exit(0);
}

View File

@ -1,6 +1,6 @@
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: utf8iter.h,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* A small helper class to iterate over utf8 strings. This is not an
@ -8,58 +8,113 @@
some specific uses
*/
class Utf8Iter {
unsigned int cl;
const string &s;
string::size_type pos;
bool bad;
int compute_cl() {
unsigned int cl; // Char length at current position if known
const string &s; // String we're working with
string::size_type pos; // Current position in string
bool bad; // Status
unsigned int m_charpos; // Current character position
// Byte length of the character whose first byte is at offset p, derived
// from that byte alone. Returns -1 when the byte is not a recognized lead
// byte (128..191 and 254..255).
inline int get_cl(string::size_type p) const {
    const unsigned int lead = (unsigned char)s[p];
    if (lead < 128)
        return 1;
    if (lead < 192)
        return -1;
    if (lead < 224)
        return 2;
    if (lead < 240)
        return 3;
    if (lead < 248)
        return 4;
    if (lead < 252)
        return 5;
    if (lead < 254)
        return 6;
    return -1;
}
// True when p is a real offset, l is a positive length, and the l bytes
// starting at p all lie inside the string.
bool poslok(string::size_type p, int l) const {
    if (p == string::npos || l <= 0)
        return false;
    return p + l <= s.length();
}
// Update current char length in object state. Assumes pos is inside string
inline int compute_cl() {
cl = 0;
if (bad)
return -1;
unsigned int z = (unsigned char)s[pos];
if (z <= 127) {
cl = 1;
} else if (z>=192 && z <= 223) {
cl = 2;
} else if (z >= 224 && z <= 239) {
cl = 3;
} else if (z >= 240 && z <= 247) {
cl = 4;
} else if (z >= 248 && z <= 251) {
cl = 5;
} else if (z >= 252 && z <= 253) {
cl = 6;
}
if (!cl || s.length() - pos < cl) {
cl = get_cl(pos);
if (!poslok(pos, cl)) {
bad = true;
cl = 0;
return -1;
}
return 0;
}
// Decode the character of byte length l starting at offset p.
// The lead byte contributes its value minus the marker for that length;
// each following byte then contributes (byte - 128) as the next base-64
// digit. Returns (unsigned int)-1 when l is not in 1..6. The caller must
// have checked that p..p+l-1 is inside the string.
inline unsigned int getvalueat(string::size_type p, int l) const {
    unsigned int leadbase;
    switch (l) {
    case 1: leadbase = 0; break;
    case 2: leadbase = 192; break;
    case 3: leadbase = 224; break;
    case 4: leadbase = 240; break;
    case 5: leadbase = 248; break;
    case 6: leadbase = 252; break;
    default:
        return (unsigned int)-1;
    }
    unsigned int value = (unsigned char)s[p] - leadbase;
    for (int k = 1; k < l; k++)
        value = value * 64 + ((unsigned char)s[p + k] - 128);
    return value;
}
public:
Utf8Iter(const string &in) : cl(0), s(in), pos(0), bad(false) {}
Utf8Iter(const string &in)
: cl(0), s(in), pos(0), bad(false), m_charpos(0) {}
/** Put the iterator back at the start of the string and clear the error
 *  flag and cached character length. */
void rewind() {
    pos = 0;
    m_charpos = 0;
    cl = 0;
    bad = false;
}
/** operator* returns the ucs4 value as a machine integer*/
unsigned int operator*() {
if (!cl && compute_cl() < 0)
return (unsigned int)-1;
switch (cl) {
case 1: return (unsigned char)s[pos];
case 2: return ((unsigned char)s[pos] - 192) * 64 + (unsigned char)s[pos+1] - 128 ;
case 3: return ((unsigned char)s[pos]-224)*4096 + ((unsigned char)s[pos+1]-128)*64 + (unsigned char)s[pos+2]-128;
case 4: return ((unsigned char)s[pos]-240)*262144 + ((unsigned char)s[pos+1]-128)*4096 +
((unsigned char)s[pos+2]-128)*64 + (unsigned char)s[pos+3]-128;
case 5: return ((unsigned char)s[pos]-248)*16777216 + ((unsigned char)s[pos+1]-128)*262144 +
((unsigned char)s[pos+2]-128)*4096 + ((unsigned char)s[pos+3]-128)*64 + (unsigned char)s[pos+4]-128;
case 6: return ((unsigned char)s[pos]-252)*1073741824 + ((unsigned char)s[pos+1]-128)*16777216 +
((unsigned char)s[pos+2]-128)*262144 + ((unsigned char)s[pos+3]-128)*4096 +
((unsigned char)s[pos+4]-128)*64 + (unsigned char)s[pos+5]-128;
default:
unsigned int val = getvalueat(pos, cl);
if (val == (unsigned int)-1) {
bad = true;
cl = 0;
return (unsigned int)-1;
}
return val;
}
/** "Direct" access. Awfully inefficient as we skip from start or current
 * position at best. This can only be useful for a lookahead from the
 * current position.
 *
 * @param charpos character (not byte) index of the wanted character.
 * @return the character value, or (unsigned int)-1 if charpos is past the
 *   end of the string or a malformed/truncated sequence is met on the way.
 */
unsigned int operator[](unsigned int charpos) const {
    string::size_type mypos = 0;
    unsigned int mycp = 0;
    // Start from the current position when the target is at or after it,
    // else scan from the beginning of the string.
    if (charpos >= m_charpos) {
        mypos = pos;
        mycp = m_charpos;
    }
    while (mypos < s.length()) {
        int l = get_cl(mypos);
        // get_cl() returns -1 on an invalid lead byte. Adding that to
        // mypos would step backwards, so validate before advancing.
        if (!poslok(mypos, l))
            return (unsigned int)-1;
        if (mycp == charpos)
            return getvalueat(mypos, l);
        mypos += l;
        ++mycp;
    }
    return (unsigned int)-1;
}
string::size_type operator++(int) {
@ -67,6 +122,7 @@ class Utf8Iter {
return string::npos;
}
pos += cl;
m_charpos++;
cl = 0;
return pos;
}
@ -78,12 +134,24 @@ class Utf8Iter {
out += s.substr(pos, cl);
return true;
}
/** Conversion to string: the bytes of the character at the current
 *  position, or an empty string if the iterator is in error or the
 *  character length cannot be computed. */
operator string() {
    if (bad || (!cl && compute_cl() < 0)) {
        // The return type is string, so the error value must be an empty
        // string. "return false" here would attempt to construct a string
        // from a bool.
        return string();
    }
    return s.substr(pos, cl);
}
/** True when iteration must stop: the error flag is set, or the byte
 *  position has reached the end of the string. */
bool eof() {
return bad || pos == s.length();
}
/** True when the error flag has been set on this iterator. */
bool error() {
return bad;
}
/** Current position in the string, as a byte offset. */
string::size_type getBpos() const {
return pos;
}
/** Current position in the string, counted in characters. */
string::size_type getCpos() const {
return m_charpos;
}
};

212
src/utils/utf8testin.txt Normal file
View File

@ -0,0 +1,212 @@
UTF-8 encoded sample plain-text file
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
Markus Kuhn [ˈmaʳkʊs kuːn] <http://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25
The ASCII compatible UTF-8 encoding used in this plain-text file
is defined in Unicode, ISO 10646-1, and RFC 2279.
Using Unicode/UTF-8, you can write in emails and source code things such as
Mathematics and sciences:
∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
⎪⎢⎜│a²+b³ ⎟⎥⎪
∀x∈: ⌈x⌉ = x⌋, α ∧ ¬β = ¬(¬α β), ⎪⎢⎜│───── ⎟⎥⎪
⎪⎢⎜⎷ c₈ ⎟⎥⎪
⊆ ℕ₀ ⊂ , ⎨⎢⎜ ⎟⎥⎬
⎪⎢⎜ ∞ ⎟⎥⎪
⊥ < a ≠ b ≡ c ≤ d ≪ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
Linguistics and dictionaries:
ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
APL:
((VV)=V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
Nicer typography in plain text files:
╔══════════════════════════════════════════╗
║ ║
║ • single and “double” quotes ║
║ ║
║ • Curly apostrophes: “Weve been here” ║
║ ║
║ • Latin-1 apostrophe and accents: '´` ║
║ ║
║ • deutsche „Anführungszeichen“ ║
║ ║
║ • †, ‡, ‰, •, 34, —, 5/+5, ™, … ║
║ ║
║ • ASCII safety test: 1lI|, 0OD, 8B ║
║ ╭─────────╮ ║
║ • the euro symbol: │ 14.95 € │ ║
║ ╰─────────╯ ║
╚══════════════════════════════════════════╝
Combining characters:
STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
Greek (in Polytonic):
The Greek anthem:
Σὲ γνωρίζω ἀπὸ τὴν κόψη
τοῦ σπαθιοῦ τὴν τρομερή,
σὲ γνωρίζω ἀπὸ τὴν ὄψη
ποὺ μὲ βία μετράει τὴ γῆ.
᾿Απ᾿ τὰ κόκκαλα βγαλμένη
τῶν ῾Ελλήνων τὰ ἱερά
καὶ σὰν πρῶτα ἀνδρειωμένη
χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
From a speech of Demosthenes in the 4th century BC:
Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
Georgian:
From a Unicode conference invitation:
გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
Russian:
From a Unicode conference invitation:
Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
Конференция соберет широкий круг экспертов по вопросам глобального
Интернета и Unicode, локализации и интернационализации, воплощению и
применению Unicode в различных операционных системах и программных
приложениях, шрифтах, верстке и многоязычных компьютерных системах.
Thai (UCS Level 2):
Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
classic 'San Gua'):
[----------------------------|------------------------]
๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
(The above is a two-column text. If combining characters are handled
correctly, the lines of the second column should be aligned with the
| character above.)
Ethiopian:
Proverbs in the Amharic language:
ሰማይ አይታረስ ንጉሥ አይከሰስ።
ብላ ካለኝ እንደአባቴ በቆመጠኝ።
ጌጥ ያለቤቱ ቁምጥና ነው።
ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
የአፍ ወለምታ በቅቤ አይታሽም።
አይጥ በበላ ዳዋ ተመታ።
ሲተረጉሙ ይደረግሙ።
ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
ድር ቢያብር አንበሳ ያስር።
ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
ሥራ ከመፍታት ልጄን ላፋታት።
ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
ተንጋሎ ቢተፉ ተመልሶ ባፉ።
ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
እግርህን በፍራሽህ ልክ ዘርጋ።
Runes:
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
(Old English, which transcribed into Latin reads 'He cwaeth that he
bude thaem lande northweardum with tha Westsae.' and means 'He said
that he lived in the northern land near the Western Sea.')
Braille:
⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
(The first couple of paragraphs of "A Christmas Carol" by Dickens)
Compact font selection example text:
ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
–—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi<>⑀₂ἠḂӥẄɐː⍎אԱა
Greetings in various languages:
Hello world, Καλημέρα κόσμε, コンニチハ
Box drawing alignment tests: █
╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
╠╡ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
▝▀▘▙▄▟