diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index a46ba26b..f825571b 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,13 +1,15 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT #include #include - +#include #include "textsplit.h" #include "debuglog.h" +#include "utf8iter.h" +#include "uproplist.h" using namespace std; @@ -37,6 +39,8 @@ using namespace std; // once. enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; static int charclasses[256]; + +static set unicign; static void setcharclasses() { static int init = 0; @@ -67,6 +71,8 @@ static void setcharclasses() init = 1; //for (i=0;i<256;i++)cerr< "< 0 && - (i == in.length() -1 || charclasses[int(in[i+1])] == SPACE || - in[i+1] == '\n' || in[i+1] == '\r')) { - word += c; - span += c; + (whatcc(it[charpos+1]) == SPACE || + whatcc(it[charpos+1]) == '\n' || + whatcc(it[charpos+1]) == '\r')) { + word += it; + span += it; } break; @@ -261,13 +289,13 @@ bool TextSplit::text_to_words(const string &in) else number = false; } - word += (char)c; - span += (char)c; + word += it; + span += it; break; } } - if (word.length()) { - if (!doemit(word, wordpos, span, spanpos, true, i)) + if (span.length()) { + if (!doemit(word, wordpos, span, spanpos, true, it.getBpos())) return false; } return true; @@ -306,7 +334,8 @@ static string teststring = "192.168.4.1 " "one\n\rtwo\nthree-\nfour " "[olala][ululu] " - "'o'brien' " + "'o'brien' " + "utf-8 ucs-4©" "\n" ; diff --git a/src/common/uproplist.h b/src/common/uproplist.h new file mode 100644 index 00000000..0631fa7f --- /dev/null +++ b/src/common/uproplist.h @@ -0,0 +1,168 @@ +#ifndef _PROPLIST_H_INCLUDED_ +#define _PROPLIST_H_INCLUDED_ +/* @(#$Id: uproplist.h,v 1.1 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* + * 
A subset of Unicode chars that we consider whitespace when we split text in + * words. + + * This is used as a quick fix to the ascii-based code, and is not correct. + * the correct way would be to do what http://www.unicode.org/reports/tr29/ + * says. We should then convert first to ucs-4, and then strictly use + * character properties, which might actually be simpler than the current + * solution... + * + * From: +# PropList-4.0.1.txt +# Date: 2004-03-02, 02:42:40 GMT [MD] +# +# Unicode Character Database +# Copyright (c) 1991-2004 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see UCD.html +*/ + +static const unsigned int uniign[] = { + 0x0085, /* ; White_Space # Cc */ + 0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/ + 0x00A1, /* misc signs, bullet etc... */ + 0x00A2, + 0x00A3, + 0x00A4, + 0x00A5, + 0x00A6, + 0x00A9, /* copyright sign */ + 0x00AA, + 0x00AE, /* registered sign */ + 0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/ + 0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/ + 0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2001, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2002, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2003, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2004, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2005, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2006, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2007, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2008, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2009, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x200A, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ + 0x2028, /* ; White_Space # Zl LINE SEPARATOR*/ + 0x2029, /* ; White_Space # Zp PARAGRAPH SEPARATOR*/ + 0x202F, /* ; White_Space # Zs NARROW NO-BREAK SPACE*/ + 0x205F, /* ; White_Space # Zs MEDIUM MATHEMATICAL SPACE*/ + 0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/ + 0x002D, /* ; 
Dash # Pd HYPHEN-MINUS*/ + 0x058A, /* ; Dash # Pd ARMENIAN HYPHEN*/ + 0x1806, /* ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN*/ + 0x2010, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ + 0x2011, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ + 0x2012, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ + 0x2013, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ + 0x2014, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ + 0x2015, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ + 0x2053, /* ; Dash # Po SWUNG DASH*/ + 0x207B, /* ; Dash # Sm SUPERSCRIPT MINUS*/ + 0x208B, /* ; Dash # Sm SUBSCRIPT MINUS*/ + 0x2212, /* ; Dash # Sm MINUS SIGN*/ + 0x301C, /* ; Dash # Pd WAVE DASH*/ + 0x3030, /* ; Dash # Pd WAVY DASH*/ + 0xFE31, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EM DASH*/ + 0xFE32, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EN DASH*/ + 0xFE58, /* ; Dash # Pd SMALL EM DASH*/ + 0xFE63, /* ; Dash # Pd SMALL HYPHEN-MINUS*/ + 0xFF0D, /* ; Dash # Pd FULLWIDTH HYPHEN-MINUS*/ + 0x00AD, /* ; Hyphen # Cf SOFT HYPHEN*/ + 0x058A, /* ; Hyphen # Pd ARMENIAN HYPHEN*/ + 0x1806, /* ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN*/ + 0x2010, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/ + 0x2011, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/ + 0x30FB, /* ; Hyphen # Pc KATAKANA MIDDLE DOT*/ + 0xFE63, /* ; Hyphen # Pd SMALL HYPHEN-MINUS*/ + 0xFF0D, /* ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS*/ + 0xFF65, /* ; Hyphen # Pc HALFWIDTH KATAKANA MIDDLE DOT*/ + 0x00AB, /* ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/ + 0x00BB, /* ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK*/ + 0x2018, /* ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK*/ + 0x2019, /* ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK*/ + 0x201A, /* ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK*/ + 0x201B, /* ; Quotation_Mark # Pi SINGLE HIGH-REVERSED-9 QUOTATION MARK*/ + 0x201C, /* ; Quotation_Mark # Pi LEFT DOUBLE QUOTATION MARK*/ + 0x201D, /* ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK*/ + 0x201E, /* ; 
Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK*/ + 0x201F, /* ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/ + 0x2039, /* ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK*/ + 0x203A, /* ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/ + 0x300C, /* ; Quotation_Mark # Ps LEFT CORNER BRACKET*/ + 0x300D, /* ; Quotation_Mark # Pe RIGHT CORNER BRACKET*/ + 0x300E, /* ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET*/ + 0x300F, /* ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET*/ + 0x301D, /* ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK*/ + 0x301E, /* ; Quotation_Mark # Pe DOUBLE PRIME QUOTATION MARK*/ + 0x301F, /* ; Quotation_Mark # Pe LOW DOUBLE PRIME QUOTATION MARK*/ + 0xFE41, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/ + 0xFE42, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/ + 0xFE43, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/ + 0xFE44, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/ + 0xFF02, /* ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK*/ + 0xFF07, /* ; Quotation_Mark # Po FULLWIDTH APOSTROPHE*/ + 0xFF62, /* ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET*/ + 0xFF63, /* ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET*/ + 0x0021, /* ; Terminal_Punctuation # Po EXCLAMATION MARK*/ + 0x002C, /* ; Terminal_Punctuation # Po COMMA*/ + 0x002E, /* ; Terminal_Punctuation # Po FULL STOP*/ + 0x003A, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/ + 0x003B, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/ + 0x003F, /* ; Terminal_Punctuation # Po QUESTION MARK*/ + 0x037E, /* ; Terminal_Punctuation # Po GREEK QUESTION MARK*/ + 0x0387, /* ; Terminal_Punctuation # Po GREEK ANO TELEIA*/ + 0x0589, /* ; Terminal_Punctuation # Po ARMENIAN FULL STOP*/ + 0x05C3, /* ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ*/ + 0x060C, /* ; Terminal_Punctuation # Po 
ARABIC COMMA*/ + 0x061B, /* ; Terminal_Punctuation # Po ARABIC SEMICOLON*/ + 0x061F, /* ; Terminal_Punctuation # Po ARABIC QUESTION MARK*/ + 0x06D4, /* ; Terminal_Punctuation # Po ARABIC FULL STOP*/ + 0x2047, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ + 0x2048, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ + 0x2049, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ + 0xFE50, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/ + 0xFE51, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/ + 0xFE52, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/ + 0xFE54, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ + 0xFE55, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ + 0xFE56, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ + 0xFE57, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ + 0xFF01, /* ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK*/ + 0xFF0C, /* ; Terminal_Punctuation # Po FULLWIDTH COMMA*/ + 0xFF0E, /* ; Terminal_Punctuation # Po FULLWIDTH FULL STOP*/ + 0xFF1A, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/ + 0xFF1B, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/ + 0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/ + 0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/ + 0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/ + 0x0021, /* ; STerm # Po EXCLAMATION MARK*/ + 0x002E, /* ; STerm # Po FULL STOP*/ + 0x003F, /* ; STerm # Po QUESTION MARK*/ + 0x055C, /* ; STerm # Po ARMENIAN EXCLAMATION MARK*/ + 0x055E, /* ; STerm # Po ARMENIAN QUESTION MARK*/ + 0x0589, /* ; STerm # Po ARMENIAN FULL STOP*/ + 0x061F, /* ; STerm # Po ARABIC QUESTION MARK*/ + 0x06D4, /* ; STerm # Po ARABIC FULL 
STOP*/ + 0x166E, /* ; STerm # Po CANADIAN SYLLABICS FULL STOP*/ + 0x1803, /* ; STerm # Po MONGOLIAN FULL STOP*/ + 0x1809, /* ; STerm # Po MONGOLIAN MANCHU FULL STOP*/ + 0x203C, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/ + 0x203D, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/ + 0x2047, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ + 0x2048, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ + 0x2049, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ + 0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/ + 0xFE52, /* ; STerm # Po SMALL FULL STOP*/ + 0xFE56, /* ; STerm # Po SMALL QUESTION MARK*/ + 0xFE57, /* ; STerm # Po SMALL EXCLAMATION MARK*/ + 0xFF01, /* ; STerm # Po FULLWIDTH EXCLAMATION MARK*/ + 0xFF0E, /* ; STerm # Po FULLWIDTH FULL STOP*/ + 0xFF1F, /* ; STerm # Po FULLWIDTH QUESTION MARK*/ + 0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/ +}; + +#endif /*PLIST_H_INCLUDED_ */ diff --git a/src/utils/Makefile b/src/utils/Makefile index 0fc42f8b..07d08238 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -35,7 +35,7 @@ trtranscode.o : ../utils/transcode.cpp MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB) trmimeparse : $(MIMEPARSE_OBJS) - $(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) $(LIBICONV) + $(CXX) $(CXXFLAGS) -o trmimeparse $(MIMEPARSE_OBJS) $(LIBICONV) trmimeparse.o : mimeparse.cpp $(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \ mimeparse.cpp diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp index a7ba91a0..64fbd11b 100644 --- a/src/utils/utf8iter.cpp +++ b/src/utils/utf8iter.cpp @@ -1,10 +1,11 @@ #ifndef lint -static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include #include #include #include +#include #include "debuglog.h" using namespace std; @@ -22,32 +23,61 @@ int 
main(int argc, char **argv) const char *infile = argv[1]; const char *outfile = argv[2]; string in; - string out; if (!file_to_string(infile, in)) { cerr << "Cant read file\n" << endl; exit(1); } + + vector<unsigned int> ucsout1; + string out, out1; Utf8Iter it(in); FILE *fp = fopen(outfile, "w"); if (fp == 0) { fprintf(stderr, "cant create %s\n", outfile); exit(1); } - while (!it.eof()) { + int nchars = 0; + for (;!it.eof(); it++) { unsigned int value = *it; - it.appendchartostring(out); - it++; + if (value == (unsigned int)-1) { + fprintf(stderr, "Conversion error occurred\n"); + exit(1); + } + ucsout1.push_back(value); fwrite(&value, 4, 1, fp); + if (!it.appendchartostring(out)) + break; + out1 += it; + nchars++; } - fclose(fp); - if (it.error()) { - fprintf(stderr, "Conversion error occurred\n"); - exit(1); - } + fprintf(stderr, "nchars1 %d\n", nchars); if (in != out) { fprintf(stderr, "error: out != in\n"); exit(1); } + if (in != out1) { + fprintf(stderr, "error: out1 != in\n"); + exit(1); + } + + vector<unsigned int> ucsout2; + it.rewind(); + for (int i = 0; ; i++) { + unsigned int value; + if ((value = it[i]) == (unsigned int)-1) { + fprintf(stderr, "%d chars\n", i); + break; + } + it++; + ucsout2.push_back(value); + } + + if (ucsout1 != ucsout2) { + fprintf(stderr, "error: ucsout1 != ucsout2\n"); + exit(1); + } + + fclose(fp); exit(0); } diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index 513fa228..c4008e7d 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -1,6 +1,6 @@ #ifndef _UTF8ITER_H_INCLUDED_ #define _UTF8ITER_H_INCLUDED_ -/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: utf8iter.h,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */ /** * A small helper class to iterate over utf8 strings. 
This is not an @@ -8,58 +8,113 @@ some specific uses */ class Utf8Iter { - unsigned int cl; - const string &s; - string::size_type pos; - bool bad; - int compute_cl() { + unsigned int cl; // Char length at current position if known + const string &s; // String we're working with + string::size_type pos; // Current position in string + bool bad; // Status + unsigned int m_charpos; // Current character position + + // Get character byte length at specified position + inline int get_cl(string::size_type p) const { + unsigned int z = (unsigned char)s[p]; + if (z <= 127) { + return 1; + } else if (z>=192 && z <= 223) { + return 2; + } else if (z >= 224 && z <= 239) { + return 3; + } else if (z >= 240 && z <= 247) { + return 4; + } else if (z >= 248 && z <= 251) { + return 5; + } else if (z >= 252 && z <= 253) { + return 6; + } + return -1; + } + // Check position and cl against string length + bool poslok(string::size_type p, int l) const { + return p != string::npos && l > 0 && p + l <= s.length(); + } + // Update current char length in object state. 
Assumes pos is inside string + inline int compute_cl() { cl = 0; if (bad) return -1; - unsigned int z = (unsigned char)s[pos]; - if (z <= 127) { - cl = 1; - } else if (z>=192 && z <= 223) { - cl = 2; - } else if (z >= 224 && z <= 239) { - cl = 3; - } else if (z >= 240 && z <= 247) { - cl = 4; - } else if (z >= 248 && z <= 251) { - cl = 5; - } else if (z >= 252 && z <= 253) { - cl = 6; - } - if (!cl || s.length() - pos < cl) { + cl = get_cl(pos); + if (!poslok(pos, cl)) { bad = true; cl = 0; return -1; } return 0; } + // Compute value at given position + inline unsigned int getvalueat(string::size_type p, int l) const { + switch (l) { + case 1: return (unsigned char)s[p]; + case 2: return ((unsigned char)s[p] - 192) * 64 + + (unsigned char)s[p+1] - 128 ; + case 3: return ((unsigned char)s[p]-224)*4096 + + ((unsigned char)s[p+1]-128)*64 + + (unsigned char)s[p+2]-128; + case 4: return ((unsigned char)s[p]-240)*262144 + + ((unsigned char)s[p+1]-128)*4096 + + ((unsigned char)s[p+2]-128)*64 + + (unsigned char)s[p+3]-128; + case 5: return ((unsigned char)s[p]-248)*16777216 + + ((unsigned char)s[p+1]-128)*262144 + + ((unsigned char)s[p+2]-128)*4096 + + ((unsigned char)s[p+3]-128)*64 + + (unsigned char)s[p+4]-128; + case 6: return ((unsigned char)s[p]-252)*1073741824 + + ((unsigned char)s[p+1]-128)*16777216 + + ((unsigned char)s[p+2]-128)*262144 + + ((unsigned char)s[p+3]-128)*4096 + + ((unsigned char)s[p+4]-128)*64 + + (unsigned char)s[p+5]-128; + default: + return (unsigned int)-1; + } + } public: - Utf8Iter(const string &in) : cl(0), s(in), pos(0), bad(false) {} + Utf8Iter(const string &in) + : cl(0), s(in), pos(0), bad(false), m_charpos(0) {} + void rewind() { + cl=0; pos=0; bad=false; m_charpos=0; + } /** operator* returns the ucs4 value as a machine integer*/ unsigned int operator*() { if (!cl && compute_cl() < 0) return (unsigned int)-1; - switch (cl) { - case 1: return (unsigned char)s[pos]; - case 2: return ((unsigned char)s[pos] - 192) * 64 + (unsigned 
char)s[pos+1] - 128 ; - case 3: return ((unsigned char)s[pos]-224)*4096 + ((unsigned char)s[pos+1]-128)*64 + (unsigned char)s[pos+2]-128; - case 4: return ((unsigned char)s[pos]-240)*262144 + ((unsigned char)s[pos+1]-128)*4096 + - ((unsigned char)s[pos+2]-128)*64 + (unsigned char)s[pos+3]-128; - case 5: return ((unsigned char)s[pos]-248)*16777216 + ((unsigned char)s[pos+1]-128)*262144 + - ((unsigned char)s[pos+2]-128)*4096 + ((unsigned char)s[pos+3]-128)*64 + (unsigned char)s[pos+4]-128; - case 6: return ((unsigned char)s[pos]-252)*1073741824 + ((unsigned char)s[pos+1]-128)*16777216 + - ((unsigned char)s[pos+2]-128)*262144 + ((unsigned char)s[pos+3]-128)*4096 + - ((unsigned char)s[pos+4]-128)*64 + (unsigned char)s[pos+5]-128; - default: + unsigned int val = getvalueat(pos, cl); + if (val == (unsigned int)-1) { bad = true; cl = 0; - return (unsigned int)-1; } + return val; + } + /** "Direct" access. Awfully inefficient as we skip from start or current + * position at best. This can only be useful for a lookahead from the + * current position */ + unsigned int operator[](unsigned int charpos) const { + string::size_type mypos = 0; + unsigned int mycp = 0; + if (charpos >= m_charpos) { + mypos = pos; + mycp = m_charpos; + } + while (mypos < s.length() && mycp != charpos) { + mypos += get_cl(mypos); + ++mycp; + } + if (mypos < s.length() && mycp == charpos) { + int l = get_cl(mypos); + if (poslok(mypos, l)) + return getvalueat(mypos, l); + } + return (unsigned int)-1; } string::size_type operator++(int) { @@ -67,6 +122,7 @@ class Utf8Iter { return string::npos; } pos += cl; + m_charpos++; cl = 0; return pos; } @@ -78,12 +134,24 @@ class Utf8Iter { out += s.substr(pos, cl); return true; } + operator string() { + if (bad || (!cl && compute_cl() < 0)) { + return string(); + } + return s.substr(pos, cl); + } bool eof() { return bad || pos == s.length(); } bool error() { return bad; } + string::size_type getBpos() const { + return pos; + } + string::size_type 
getCpos() const { + return m_charpos; + } }; diff --git a/src/utils/utf8testin.txt b/src/utils/utf8testin.txt new file mode 100644 index 00000000..bfb9ec85 --- /dev/null +++ b/src/utils/utf8testin.txt @@ -0,0 +1,212 @@ + +UTF-8 encoded sample plain-text file +‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ + +Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25 + + +The ASCII compatible UTF-8 encoding used in this plain-text file +is defined in Unicode, ISO 10646-1, and RFC 2279. + + +Using Unicode/UTF-8, you can write in emails and source code things such as + +Mathematics and sciences: + + ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫ + ⎪⎢⎜│a²+b³ ⎟⎥⎪ + ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪ + ⎪⎢⎜⎷ c₈ ⎟⎥⎪ + ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬ + ⎪⎢⎜ ∞ ⎟⎥⎪ + ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪ + ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪ + 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭ + +Linguistics and dictionaries: + + ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn + Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ] + +APL: + + ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈ + +Nicer typography in plain text files: + + ╔══════════════════════════════════════════╗ + ║ ║ + ║ • ‘single’ and “double” quotes ║ + ║ ║ + ║ • Curly apostrophes: “We’ve been here” ║ + ║ ║ + ║ • Latin-1 apostrophe and accents: '´` ║ + ║ ║ + ║ • ‚deutsche‘ „Anführungszeichen“ ║ + ║ ║ + ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║ + ║ ║ + ║ • ASCII safety test: 1lI|, 0OD, 8B ║ + ║ ╭─────────╮ ║ + ║ • the euro symbol: │ 14.95 € │ ║ + ║ ╰─────────╯ ║ + ╚══════════════════════════════════════════╝ + +Combining characters: + + STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑ + +Greek (in Polytonic): + + The Greek anthem: + + Σὲ γνωρίζω ἀπὸ τὴν κόψη + τοῦ σπαθιοῦ τὴν τρομερή, + σὲ γνωρίζω ἀπὸ τὴν ὄψη + ποὺ μὲ βία μετράει τὴ γῆ. + + ᾿Απ᾿ τὰ κόκκαλα βγαλμένη + τῶν ῾Ελλήνων τὰ ἱερά + καὶ σὰν πρῶτα ἀνδρειωμένη + χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά! 
+ + From a speech of Demosthenes in the 4th century BC: + + Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, + ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς + λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ + τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿ + εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ + πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν + οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι, + οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν + ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον + τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι + γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν + προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους + σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ + τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ + τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς + τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον. + + Δημοσθένους, Γ´ ᾿Ολυνθιακὸς + +Georgian: + + From a Unicode conference invitation: + + გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო + კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს, + ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს + ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი, + ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება + ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში, + ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში. + +Russian: + + From a Unicode conference invitation: + + Зарегистрируйтесь сейчас на Десятую Международную Конференцию по + Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии. + Конференция соберет широкий круг экспертов по вопросам глобального + Интернета и Unicode, локализации и интернационализации, воплощению и + применению Unicode в различных операционных системах и программных + приложениях, шрифтах, верстке и многоязычных компьютерных системах. 
+ +Thai (UCS Level 2): + + Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese + classic 'San Gua'): + + [----------------------------|------------------------] + ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่ + สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา + ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา + โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ + เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ + ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ + พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้ + ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ + + (The above is a two-column text. If combining characters are handled + correctly, the lines of the second column should be aligned with the + | character above.) + +Ethiopian: + + Proverbs in the Amharic language: + + ሰማይ አይታረስ ንጉሥ አይከሰስ። + ብላ ካለኝ እንደአባቴ በቆመጠኝ። + ጌጥ ያለቤቱ ቁምጥና ነው። + ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው። + የአፍ ወለምታ በቅቤ አይታሽም። + አይጥ በበላ ዳዋ ተመታ። + ሲተረጉሙ ይደረግሙ። + ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል። + ድር ቢያብር አንበሳ ያስር። + ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም። + እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም። + የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ። + ሥራ ከመፍታት ልጄን ላፋታት። + ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል። + የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ። + ተንጋሎ ቢተፉ ተመልሶ ባፉ። + ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው። + እግርህን በፍራሽህ ልክ ዘርጋ። + +Runes: + + ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ + + (Old English, which transcribed into Latin reads 'He cwaeth that he + bude thaem lande northweardum with tha Westsae.' 
and means 'He said + that he lived in the northern land near the Western Sea.') + +Braille: + + ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌ + + ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞ + ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎ + ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂ + ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙ + ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ + ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲ + + ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ + + ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹ + ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞ + ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕ + ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹ + ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎ + ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎ + ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳ + ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞ + ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ + + (The first couple of paragraphs of "A Christmas Carol" by Dickens) + +Compact font selection example text: + + ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 + abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ + –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд + ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა + +Greetings in various languages: + + Hello world, Καλημέρα κόσμε, コンニチハ + +Box drawing alignment tests: █ + ▉ + ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳ + ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳ + ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳ + ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳ + ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎ + ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏ + ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█ + ▝▀▘▙▄▟