improved word extraction a bit (unicode punctuation)
This commit is contained in:
parent
40a5905b15
commit
d42db8b65d
@ -1,13 +1,15 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#ifndef TEST_TEXTSPLIT
|
#ifndef TEST_TEXTSPLIT
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <set>
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "utf8iter.h"
|
||||||
|
#include "uproplist.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -37,6 +39,8 @@ using namespace std;
|
|||||||
// once.
|
// once.
|
||||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
||||||
static int charclasses[256];
|
static int charclasses[256];
|
||||||
|
|
||||||
|
static set<unsigned int> unicign;
|
||||||
static void setcharclasses()
|
static void setcharclasses()
|
||||||
{
|
{
|
||||||
static int init = 0;
|
static int init = 0;
|
||||||
@ -67,6 +71,8 @@ static void setcharclasses()
|
|||||||
|
|
||||||
init = 1;
|
init = 1;
|
||||||
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
||||||
|
for (i = 0; i < sizeof(uniign); i++)
|
||||||
|
unicign.insert(uniign[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do some cleanup (the kind which is simpler to do here than in the main loop,
|
// Do some cleanup (the kind which is simpler to do here than in the main loop,
|
||||||
@ -152,6 +158,22 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int whatcc(unsigned int c)
|
||||||
|
{
|
||||||
|
int cc;
|
||||||
|
if (c <= 127) {
|
||||||
|
cc = charclasses[c];
|
||||||
|
} else {
|
||||||
|
if (c == (unsigned int)-1)
|
||||||
|
cc = SPACE;
|
||||||
|
else if (unicign.find(c) != unicign.end())
|
||||||
|
cc = SPACE;
|
||||||
|
else
|
||||||
|
cc = LETTER;
|
||||||
|
}
|
||||||
|
return cc;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splitting a text into terms to be indexed.
|
* Splitting a text into terms to be indexed.
|
||||||
* We basically emit a word every time we see a separator, but some chars are
|
* We basically emit a word every time we see a separator, but some chars are
|
||||||
@ -167,16 +189,21 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
bool number = false;
|
bool number = false;
|
||||||
int wordpos = 0;
|
int wordpos = 0;
|
||||||
int spanpos = 0;
|
int spanpos = 0;
|
||||||
unsigned int i;
|
int charpos = 0;
|
||||||
|
Utf8Iter it(in);
|
||||||
|
|
||||||
for (i = 0; i < in.length(); i++) {
|
for (; !it.eof(); it++, charpos++) {
|
||||||
int c = in[i];
|
unsigned int c = *it;
|
||||||
int cc = charclasses[c];
|
if (c == (unsigned int)-1) {
|
||||||
|
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int cc = whatcc(c);
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, true, i))
|
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
@ -186,56 +213,57 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
if (word.length() == 0) {
|
if (word.length() == 0) {
|
||||||
if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
|
if (whatcc(it[charpos+1]) == DIGIT) {
|
||||||
number = true;
|
number = true;
|
||||||
word += c;
|
word += it;
|
||||||
span += c;
|
span += it;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += c;
|
span += it;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '@':
|
case '@':
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
} else
|
||||||
word += c;
|
word += it;
|
||||||
span += c;
|
span += it;
|
||||||
break;
|
break;
|
||||||
case '\'':
|
case '\'':
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += c;
|
span += it;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '.':
|
case '.':
|
||||||
if (number) {
|
if (number) {
|
||||||
word += c;
|
word += it;
|
||||||
} else {
|
} else {
|
||||||
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
|
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
|
||||||
if (word.length()) {
|
if (word.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
} else
|
} else
|
||||||
word += c;
|
word += it;
|
||||||
}
|
}
|
||||||
span += c;
|
span += it;
|
||||||
break;
|
break;
|
||||||
case '#':
|
case '#':
|
||||||
// Keep it only at end of word...
|
// Keep it only at end of word...
|
||||||
if (word.length() > 0 &&
|
if (word.length() > 0 &&
|
||||||
(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE ||
|
(whatcc(it[charpos+1]) == SPACE ||
|
||||||
in[i+1] == '\n' || in[i+1] == '\r')) {
|
whatcc(it[charpos+1]) == '\n' ||
|
||||||
word += c;
|
whatcc(it[charpos+1]) == '\r')) {
|
||||||
span += c;
|
word += it;
|
||||||
|
span += it;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@ -261,13 +289,13 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
else
|
else
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
word += (char)c;
|
word += it;
|
||||||
span += (char)c;
|
span += it;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (word.length()) {
|
if (span.length()) {
|
||||||
if (!doemit(word, wordpos, span, spanpos, true, i))
|
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -307,6 +335,7 @@ static string teststring =
|
|||||||
"one\n\rtwo\nthree-\nfour "
|
"one\n\rtwo\nthree-\nfour "
|
||||||
"[olala][ululu] "
|
"[olala][ululu] "
|
||||||
"'o'brien' "
|
"'o'brien' "
|
||||||
|
"utf-8 ucs-4©"
|
||||||
"\n"
|
"\n"
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|||||||
168
src/common/uproplist.h
Normal file
168
src/common/uproplist.h
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
#ifndef _PROPLIST_H_INCLUDED_
|
||||||
|
#define _PROPLIST_H_INCLUDED_
|
||||||
|
/* @(#$Id: uproplist.h,v 1.1 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
/*
|
||||||
|
* A subset of Unicode chars that we consider whitespace when we split text in
|
||||||
|
* words.
|
||||||
|
|
||||||
|
* This is used as a quick fix to the ascii-based code, and is not correct.
|
||||||
|
* the correct way would be to do what http://www.unicode.org/reports/tr29/
|
||||||
|
* says. We should then convert first to ucs-4, and then strictly use
|
||||||
|
* character properties, which might actually be simpler than the current
|
||||||
|
* solution...
|
||||||
|
*
|
||||||
|
* From:
|
||||||
|
# PropList-4.0.1.txt
|
||||||
|
# Date: 2004-03-02, 02:42:40 GMT [MD]
|
||||||
|
#
|
||||||
|
# Unicode Character Database
|
||||||
|
# Copyright (c) 1991-2004 Unicode, Inc.
|
||||||
|
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||||
|
# For documentation, see UCD.html
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
 * Code points treated as word separators by the text splitter, hand-expanded
 * from the White_Space, Dash, Hyphen, Quotation_Mark, Terminal_Punctuation
 * and STerm sections of PropList.txt, plus a few Latin-1 signs.
 * Some values appear in several sections and are therefore listed more than
 * once; this is harmless when the table is loaded into a set.
 * The element count is sizeof(uniign) / sizeof(uniign[0]); sizeof(uniign)
 * alone is the size in bytes.
 */
static const unsigned int uniign[] = {
    0x0085, /* ; White_Space # Cc <control-0085>*/
    0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/
    0x00A1, /* misc signs, bullet etc... */
    0x00A2,
    0x00A3,
    0x00A4,
    0x00A5,
    0x00A6,
    0x00A9, /* copyright sign */
    0x00AA,
    0x00AE, /* registered sign */
    0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/
    0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/
    0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2001, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2002, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2003, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2004, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2005, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2006, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2007, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2008, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2009, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x200A, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2028, /* ; White_Space # Zl LINE SEPARATOR*/
    0x2029, /* ; White_Space # Zp PARAGRAPH SEPARATOR*/
    0x202F, /* ; White_Space # Zs NARROW NO-BREAK SPACE*/
    0x205F, /* ; White_Space # Zs MEDIUM MATHEMATICAL SPACE*/
    0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/
    0x002D, /* ; Dash # Pd HYPHEN-MINUS*/
    0x058A, /* ; Dash # Pd ARMENIAN HYPHEN*/
    0x1806, /* ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN*/
    0x2010, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2011, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2012, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2013, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2014, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2015, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2053, /* ; Dash # Po SWUNG DASH*/
    0x207B, /* ; Dash # Sm SUPERSCRIPT MINUS*/
    0x208B, /* ; Dash # Sm SUBSCRIPT MINUS*/
    0x2212, /* ; Dash # Sm MINUS SIGN*/
    0x301C, /* ; Dash # Pd WAVE DASH*/
    0x3030, /* ; Dash # Pd WAVY DASH*/
    0xFE31, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EM DASH*/
    0xFE32, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EN DASH*/
    0xFE58, /* ; Dash # Pd SMALL EM DASH*/
    0xFE63, /* ; Dash # Pd SMALL HYPHEN-MINUS*/
    0xFF0D, /* ; Dash # Pd FULLWIDTH HYPHEN-MINUS*/
    0x00AD, /* ; Hyphen # Cf SOFT HYPHEN*/
    0x058A, /* ; Hyphen # Pd ARMENIAN HYPHEN*/
    0x1806, /* ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN*/
    0x2010, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
    0x2011, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
    0x30FB, /* ; Hyphen # Pc KATAKANA MIDDLE DOT*/
    0xFE63, /* ; Hyphen # Pd SMALL HYPHEN-MINUS*/
    0xFF0D, /* ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS*/
    0xFF65, /* ; Hyphen # Pc HALFWIDTH KATAKANA MIDDLE DOT*/
    0x00AB, /* ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
    0x00BB, /* ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK*/
    0x2018, /* ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK*/
    0x2019, /* ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK*/
    0x201A, /* ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK*/
    0x201B, /* ; Quotation_Mark # Pi SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
    0x201C, /* ; Quotation_Mark # Pi LEFT DOUBLE QUOTATION MARK*/
    0x201D, /* ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK*/
    0x201E, /* ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK*/
    0x201F, /* ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
    0x2039, /* ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK*/
    0x203A, /* ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
    0x300C, /* ; Quotation_Mark # Ps LEFT CORNER BRACKET*/
    0x300D, /* ; Quotation_Mark # Pe RIGHT CORNER BRACKET*/
    0x300E, /* ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET*/
    0x300F, /* ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET*/
    0x301D, /* ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK*/
    0x301E, /* ; Quotation_Mark # Pe DOUBLE PRIME QUOTATION MARK*/
    0x301F, /* ; Quotation_Mark # Pe LOW DOUBLE PRIME QUOTATION MARK*/
    0xFE41, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/
    0xFE42, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/
    0xFE43, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/
    0xFE44, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/
    0xFF02, /* ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK*/
    0xFF07, /* ; Quotation_Mark # Po FULLWIDTH APOSTROPHE*/
    0xFF62, /* ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET*/
    0xFF63, /* ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET*/
    0x0021, /* ; Terminal_Punctuation # Po EXCLAMATION MARK*/
    0x002C, /* ; Terminal_Punctuation # Po COMMA*/
    0x002E, /* ; Terminal_Punctuation # Po FULL STOP*/
    0x003A, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
    0x003B, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
    0x003F, /* ; Terminal_Punctuation # Po QUESTION MARK*/
    0x037E, /* ; Terminal_Punctuation # Po GREEK QUESTION MARK*/
    0x0387, /* ; Terminal_Punctuation # Po GREEK ANO TELEIA*/
    0x0589, /* ; Terminal_Punctuation # Po ARMENIAN FULL STOP*/
    0x05C3, /* ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ*/
    0x060C, /* ; Terminal_Punctuation # Po ARABIC COMMA*/
    0x061B, /* ; Terminal_Punctuation # Po ARABIC SEMICOLON*/
    0x061F, /* ; Terminal_Punctuation # Po ARABIC QUESTION MARK*/
    0x06D4, /* ; Terminal_Punctuation # Po ARABIC FULL STOP*/
    0x2047, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2048, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2049, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0xFE50, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE51, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE52, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE54, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE55, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE56, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE57, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFF01, /* ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK*/
    0xFF0C, /* ; Terminal_Punctuation # Po FULLWIDTH COMMA*/
    0xFF0E, /* ; Terminal_Punctuation # Po FULLWIDTH FULL STOP*/
    0xFF1A, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
    0xFF1B, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
    0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/
    0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
    0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/
    0x0021, /* ; STerm # Po EXCLAMATION MARK*/
    0x002E, /* ; STerm # Po FULL STOP*/
    0x003F, /* ; STerm # Po QUESTION MARK*/
    0x055C, /* ; STerm # Po ARMENIAN EXCLAMATION MARK*/
    0x055E, /* ; STerm # Po ARMENIAN QUESTION MARK*/
    0x0589, /* ; STerm # Po ARMENIAN FULL STOP*/
    0x061F, /* ; STerm # Po ARABIC QUESTION MARK*/
    0x06D4, /* ; STerm # Po ARABIC FULL STOP*/
    0x166E, /* ; STerm # Po CANADIAN SYLLABICS FULL STOP*/
    0x1803, /* ; STerm # Po MONGOLIAN FULL STOP*/
    0x1809, /* ; STerm # Po MONGOLIAN MANCHU FULL STOP*/
    0x203C, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
    0x203D, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
    0x2047, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2048, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2049, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/
    0xFE52, /* ; STerm # Po SMALL FULL STOP*/
    0xFE56, /* ; STerm # Po SMALL QUESTION MARK*/
    0xFE57, /* ; STerm # Po SMALL EXCLAMATION MARK*/
    0xFF01, /* ; STerm # Po FULLWIDTH EXCLAMATION MARK*/
    0xFF0E, /* ; STerm # Po FULLWIDTH FULL STOP*/
    0xFF1F, /* ; STerm # Po FULLWIDTH QUESTION MARK*/
    0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
};
|
||||||
|
|
||||||
|
#endif /* _PROPLIST_H_INCLUDED_ */
|
||||||
@ -35,7 +35,7 @@ trtranscode.o : ../utils/transcode.cpp
|
|||||||
|
|
||||||
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
|
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
|
||||||
trmimeparse : $(MIMEPARSE_OBJS)
|
trmimeparse : $(MIMEPARSE_OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) $(LIBICONV)
|
$(CXX) $(CXXFLAGS) -o trmimeparse $(MIMEPARSE_OBJS) $(LIBICONV)
|
||||||
trmimeparse.o : mimeparse.cpp
|
trmimeparse.o : mimeparse.cpp
|
||||||
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
|
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
|
||||||
mimeparse.cpp
|
mimeparse.cpp
|
||||||
|
|||||||
@ -1,10 +1,11 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
#include <vector>
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -22,32 +23,61 @@ int main(int argc, char **argv)
|
|||||||
const char *infile = argv[1];
|
const char *infile = argv[1];
|
||||||
const char *outfile = argv[2];
|
const char *outfile = argv[2];
|
||||||
string in;
|
string in;
|
||||||
string out;
|
|
||||||
if (!file_to_string(infile, in)) {
|
if (!file_to_string(infile, in)) {
|
||||||
cerr << "Cant read file\n" << endl;
|
cerr << "Cant read file\n" << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vector<unsigned int>ucsout1;
|
||||||
|
string out, out1;
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
FILE *fp = fopen(outfile, "w");
|
FILE *fp = fopen(outfile, "w");
|
||||||
if (fp == 0) {
|
if (fp == 0) {
|
||||||
fprintf(stderr, "cant create %s\n", outfile);
|
fprintf(stderr, "cant create %s\n", outfile);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
while (!it.eof()) {
|
int nchars = 0;
|
||||||
|
for (;!it.eof(); it++) {
|
||||||
unsigned int value = *it;
|
unsigned int value = *it;
|
||||||
it.appendchartostring(out);
|
if (value == (unsigned int)-1) {
|
||||||
it++;
|
fprintf(stderr, "Conversion error occurred\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
ucsout1.push_back(value);
|
||||||
fwrite(&value, 4, 1, fp);
|
fwrite(&value, 4, 1, fp);
|
||||||
|
if (!it.appendchartostring(out))
|
||||||
|
break;
|
||||||
|
out1 += it;
|
||||||
|
nchars++;
|
||||||
}
|
}
|
||||||
fclose(fp);
|
fprintf(stderr, "nchars1 %d\n", nchars);
|
||||||
if (it.error()) {
|
|
||||||
fprintf(stderr, "Conversion error occurred\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
if (in != out) {
|
if (in != out) {
|
||||||
fprintf(stderr, "error: out != in\n");
|
fprintf(stderr, "error: out != in\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
if (in != out1) {
|
||||||
|
fprintf(stderr, "error: out1 != in\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<unsigned int>ucsout2;
|
||||||
|
it.rewind();
|
||||||
|
for (int i = 0; ; i++) {
|
||||||
|
unsigned int value;
|
||||||
|
if ((value = it[i]) == (unsigned int)-1) {
|
||||||
|
fprintf(stderr, "%d chars\n", i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
it++;
|
||||||
|
ucsout2.push_back(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ucsout1 != ucsout2) {
|
||||||
|
fprintf(stderr, "error: ucsout1 != ucsout2\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _UTF8ITER_H_INCLUDED_
|
#ifndef _UTF8ITER_H_INCLUDED_
|
||||||
#define _UTF8ITER_H_INCLUDED_
|
#define _UTF8ITER_H_INCLUDED_
|
||||||
/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: utf8iter.h,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A small helper class to iterate over utf8 strings. This is not an
|
* A small helper class to iterate over utf8 strings. This is not an
|
||||||
@ -8,58 +8,113 @@
|
|||||||
some specific uses
|
some specific uses
|
||||||
*/
|
*/
|
||||||
class Utf8Iter {
|
class Utf8Iter {
|
||||||
unsigned int cl;
|
unsigned int cl; // Char length at current position if known
|
||||||
const string &s;
|
const string &s; // String we're working with
|
||||||
string::size_type pos;
|
string::size_type pos; // Current position in string
|
||||||
bool bad;
|
bool bad; // Status
|
||||||
int compute_cl() {
|
unsigned int m_charpos; // Current character position
|
||||||
|
|
||||||
|
// Get character byte length at specified position
|
||||||
|
inline int get_cl(string::size_type p) const {
|
||||||
|
unsigned int z = (unsigned char)s[p];
|
||||||
|
if (z <= 127) {
|
||||||
|
return 1;
|
||||||
|
} else if (z>=192 && z <= 223) {
|
||||||
|
return 2;
|
||||||
|
} else if (z >= 224 && z <= 239) {
|
||||||
|
return 3;
|
||||||
|
} else if (z >= 240 && z <= 247) {
|
||||||
|
return 4;
|
||||||
|
} else if (z >= 248 && z <= 251) {
|
||||||
|
return 5;
|
||||||
|
} else if (z >= 252 && z <= 253) {
|
||||||
|
return 6;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
// Check position and cl against string length
|
||||||
|
bool poslok(string::size_type p, int l) const {
|
||||||
|
return p != string::npos && l > 0 && p + l <= s.length();
|
||||||
|
}
|
||||||
|
// Update current char length in object state. Assumes pos is inside string
|
||||||
|
inline int compute_cl() {
|
||||||
cl = 0;
|
cl = 0;
|
||||||
if (bad)
|
if (bad)
|
||||||
return -1;
|
return -1;
|
||||||
unsigned int z = (unsigned char)s[pos];
|
cl = get_cl(pos);
|
||||||
if (z <= 127) {
|
if (!poslok(pos, cl)) {
|
||||||
cl = 1;
|
|
||||||
} else if (z>=192 && z <= 223) {
|
|
||||||
cl = 2;
|
|
||||||
} else if (z >= 224 && z <= 239) {
|
|
||||||
cl = 3;
|
|
||||||
} else if (z >= 240 && z <= 247) {
|
|
||||||
cl = 4;
|
|
||||||
} else if (z >= 248 && z <= 251) {
|
|
||||||
cl = 5;
|
|
||||||
} else if (z >= 252 && z <= 253) {
|
|
||||||
cl = 6;
|
|
||||||
}
|
|
||||||
if (!cl || s.length() - pos < cl) {
|
|
||||||
bad = true;
|
bad = true;
|
||||||
cl = 0;
|
cl = 0;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
// Compute value at given position
|
||||||
|
inline unsigned int getvalueat(string::size_type p, int l) const {
|
||||||
|
switch (l) {
|
||||||
|
case 1: return (unsigned char)s[p];
|
||||||
|
case 2: return ((unsigned char)s[p] - 192) * 64 +
|
||||||
|
(unsigned char)s[p+1] - 128 ;
|
||||||
|
case 3: return ((unsigned char)s[p]-224)*4096 +
|
||||||
|
((unsigned char)s[p+1]-128)*64 +
|
||||||
|
(unsigned char)s[p+2]-128;
|
||||||
|
case 4: return ((unsigned char)s[p]-240)*262144 +
|
||||||
|
((unsigned char)s[p+1]-128)*4096 +
|
||||||
|
((unsigned char)s[p+2]-128)*64 +
|
||||||
|
(unsigned char)s[p+3]-128;
|
||||||
|
case 5: return ((unsigned char)s[p]-248)*16777216 +
|
||||||
|
((unsigned char)s[p+1]-128)*262144 +
|
||||||
|
((unsigned char)s[p+2]-128)*4096 +
|
||||||
|
((unsigned char)s[p+3]-128)*64 +
|
||||||
|
(unsigned char)s[p+4]-128;
|
||||||
|
case 6: return ((unsigned char)s[p]-252)*1073741824 +
|
||||||
|
((unsigned char)s[p+1]-128)*16777216 +
|
||||||
|
((unsigned char)s[p+2]-128)*262144 +
|
||||||
|
((unsigned char)s[p+3]-128)*4096 +
|
||||||
|
((unsigned char)s[p+4]-128)*64 +
|
||||||
|
(unsigned char)s[p+5]-128;
|
||||||
|
default:
|
||||||
|
return (unsigned int)-1;
|
||||||
|
}
|
||||||
|
}
|
||||||
public:
|
public:
|
||||||
/** Position the iterator on the first character of in. Only a reference
 * to in is stored (member s), so the string must outlive the iterator. */
Utf8Iter(const string &in)
    : cl(0),
      s(in),
      pos(0),
      bad(false),
      m_charpos(0)
{
}
|
||||||
|
|
||||||
|
/** Go back to the start of the string and clear the error state */
void rewind() {
    pos = 0;
    m_charpos = 0;
    cl = 0;
    bad = false;
}
|
||||||
/** operator* returns the ucs4 value as a machine integer*/
|
/** operator* returns the ucs4 value as a machine integer*/
|
||||||
unsigned int operator*() {
|
unsigned int operator*() {
|
||||||
if (!cl && compute_cl() < 0)
|
if (!cl && compute_cl() < 0)
|
||||||
return (unsigned int)-1;
|
return (unsigned int)-1;
|
||||||
switch (cl) {
|
unsigned int val = getvalueat(pos, cl);
|
||||||
case 1: return (unsigned char)s[pos];
|
if (val == (unsigned int)-1) {
|
||||||
case 2: return ((unsigned char)s[pos] - 192) * 64 + (unsigned char)s[pos+1] - 128 ;
|
|
||||||
case 3: return ((unsigned char)s[pos]-224)*4096 + ((unsigned char)s[pos+1]-128)*64 + (unsigned char)s[pos+2]-128;
|
|
||||||
case 4: return ((unsigned char)s[pos]-240)*262144 + ((unsigned char)s[pos+1]-128)*4096 +
|
|
||||||
((unsigned char)s[pos+2]-128)*64 + (unsigned char)s[pos+3]-128;
|
|
||||||
case 5: return ((unsigned char)s[pos]-248)*16777216 + ((unsigned char)s[pos+1]-128)*262144 +
|
|
||||||
((unsigned char)s[pos+2]-128)*4096 + ((unsigned char)s[pos+3]-128)*64 + (unsigned char)s[pos+4]-128;
|
|
||||||
case 6: return ((unsigned char)s[pos]-252)*1073741824 + ((unsigned char)s[pos+1]-128)*16777216 +
|
|
||||||
((unsigned char)s[pos+2]-128)*262144 + ((unsigned char)s[pos+3]-128)*4096 +
|
|
||||||
((unsigned char)s[pos+4]-128)*64 + (unsigned char)s[pos+5]-128;
|
|
||||||
default:
|
|
||||||
bad = true;
|
bad = true;
|
||||||
cl = 0;
|
cl = 0;
|
||||||
return (unsigned int)-1;
|
|
||||||
}
|
}
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
/** "Direct" access. Awfully inefficient as we skip from start or current
|
||||||
|
* position at best. This can only be useful for a lookahead from the
|
||||||
|
* current position */
|
||||||
|
unsigned int operator[](unsigned int charpos) const {
    // charpos is a zero-based character (not byte) index. Returns the ucs4
    // value found there, or (unsigned int)-1 when charpos is past the end
    // of the string or a malformed/truncated sequence is met on the way.
    // The iterator state is not modified.
    string::size_type mypos = 0;
    unsigned int mycp = 0;
    // Start from the current position when the target is at or after it,
    // else we have to walk from the beginning of the string.
    if (charpos >= m_charpos) {
        mypos = pos;
        mycp = m_charpos;
    }
    while (mypos < s.length()) {
        const int l = get_cl(mypos);
        // get_cl() returns -1 on an invalid lead byte. Adding that to
        // mypos would move the scan backwards and end up decoding at a
        // bogus offset, so give up as soon as a character is unusable.
        if (!poslok(mypos, l))
            break;
        if (mycp == charpos)
            return getvalueat(mypos, l);
        mypos += l;
        ++mycp;
    }
    return (unsigned int)-1;
}
|
||||||
|
|
||||||
string::size_type operator++(int) {
|
string::size_type operator++(int) {
|
||||||
@ -67,6 +122,7 @@ class Utf8Iter {
|
|||||||
return string::npos;
|
return string::npos;
|
||||||
}
|
}
|
||||||
pos += cl;
|
pos += cl;
|
||||||
|
m_charpos++;
|
||||||
cl = 0;
|
cl = 0;
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
@ -78,12 +134,24 @@ class Utf8Iter {
|
|||||||
out += s.substr(pos, cl);
|
out += s.substr(pos, cl);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
/** Conversion to string: returns the bytes of the current character, or an
 * empty string if the iterator is in error or no valid character can be
 * read at the current position. */
operator string() {
    if (bad || (!cl && compute_cl() < 0)) {
        // This used to be "return false", which builds the string from a
        // null const char * (undefined behaviour) where it compiles at
        // all; C++11 and later reject it because false is no longer a
        // null pointer constant.
        return string();
    }
    return s.substr(pos, cl);
}
|
||||||
bool eof() {
|
bool eof() {
|
||||||
return bad || pos == s.length();
|
return bad || pos == s.length();
|
||||||
}
|
}
|
||||||
bool error() {
|
bool error() {
|
||||||
return bad;
|
return bad;
|
||||||
}
|
}
|
||||||
|
/** Byte offset of the current character inside the string */
string::size_type getBpos() const { return pos; }
|
||||||
|
/** Character index of the current position (count of characters stepped
 * over since construction or the last rewind()) */
string::size_type getCpos() const { return m_charpos; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
212
src/utils/utf8testin.txt
Normal file
212
src/utils/utf8testin.txt
Normal file
@ -0,0 +1,212 @@
|
|||||||
|
|
||||||
|
UTF-8 encoded sample plain-text file
|
||||||
|
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
|
||||||
|
|
||||||
|
Markus Kuhn [ˈmaʳkʊs kuːn] <http://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25
|
||||||
|
|
||||||
|
|
||||||
|
The ASCII compatible UTF-8 encoding used in this plain-text file
|
||||||
|
is defined in Unicode, ISO 10646-1, and RFC 2279.
|
||||||
|
|
||||||
|
|
||||||
|
Using Unicode/UTF-8, you can write in emails and source code things such as
|
||||||
|
|
||||||
|
Mathematics and sciences:
|
||||||
|
|
||||||
|
∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
|
||||||
|
⎪⎢⎜│a²+b³ ⎟⎥⎪
|
||||||
|
∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
|
||||||
|
⎪⎢⎜⎷ c₈ ⎟⎥⎪
|
||||||
|
ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
|
||||||
|
⎪⎢⎜ ∞ ⎟⎥⎪
|
||||||
|
⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
|
||||||
|
⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
|
||||||
|
2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
|
||||||
|
|
||||||
|
Linguistics and dictionaries:
|
||||||
|
|
||||||
|
ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
|
||||||
|
Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
|
||||||
|
|
||||||
|
APL:
|
||||||
|
|
||||||
|
((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
|
||||||
|
|
||||||
|
Nicer typography in plain text files:
|
||||||
|
|
||||||
|
╔══════════════════════════════════════════╗
|
||||||
|
║ ║
|
||||||
|
║ • ‘single’ and “double” quotes ║
|
||||||
|
║ ║
|
||||||
|
║ • Curly apostrophes: “We’ve been here” ║
|
||||||
|
║ ║
|
||||||
|
║ • Latin-1 apostrophe and accents: '´` ║
|
||||||
|
║ ║
|
||||||
|
║ • ‚deutsche‘ „Anführungszeichen“ ║
|
||||||
|
║ ║
|
||||||
|
║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
|
||||||
|
║ ║
|
||||||
|
║ • ASCII safety test: 1lI|, 0OD, 8B ║
|
||||||
|
║ ╭─────────╮ ║
|
||||||
|
║ • the euro symbol: │ 14.95 € │ ║
|
||||||
|
║ ╰─────────╯ ║
|
||||||
|
╚══════════════════════════════════════════╝
|
||||||
|
|
||||||
|
Combining characters:
|
||||||
|
|
||||||
|
STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
|
||||||
|
|
||||||
|
Greek (in Polytonic):
|
||||||
|
|
||||||
|
The Greek anthem:
|
||||||
|
|
||||||
|
Σὲ γνωρίζω ἀπὸ τὴν κόψη
|
||||||
|
τοῦ σπαθιοῦ τὴν τρομερή,
|
||||||
|
σὲ γνωρίζω ἀπὸ τὴν ὄψη
|
||||||
|
ποὺ μὲ βία μετράει τὴ γῆ.
|
||||||
|
|
||||||
|
᾿Απ᾿ τὰ κόκκαλα βγαλμένη
|
||||||
|
τῶν ῾Ελλήνων τὰ ἱερά
|
||||||
|
καὶ σὰν πρῶτα ἀνδρειωμένη
|
||||||
|
χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
|
||||||
|
|
||||||
|
From a speech of Demosthenes in the 4th century BC:
|
||||||
|
|
||||||
|
Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
|
||||||
|
ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
|
||||||
|
λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
|
||||||
|
τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
|
||||||
|
εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
|
||||||
|
πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
|
||||||
|
οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
|
||||||
|
οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
|
||||||
|
ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
|
||||||
|
τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
|
||||||
|
γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
|
||||||
|
προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
|
||||||
|
σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
|
||||||
|
τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
|
||||||
|
τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
|
||||||
|
τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
|
||||||
|
|
||||||
|
Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
|
||||||
|
|
||||||
|
Georgian:
|
||||||
|
|
||||||
|
From a Unicode conference invitation:
|
||||||
|
|
||||||
|
გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
|
||||||
|
კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
|
||||||
|
ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
|
||||||
|
ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
|
||||||
|
ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
|
||||||
|
ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
|
||||||
|
ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
|
||||||
|
|
||||||
|
Russian:
|
||||||
|
|
||||||
|
From a Unicode conference invitation:
|
||||||
|
|
||||||
|
Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
|
||||||
|
Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
|
||||||
|
Конференция соберет широкий круг экспертов по вопросам глобального
|
||||||
|
Интернета и Unicode, локализации и интернационализации, воплощению и
|
||||||
|
применению Unicode в различных операционных системах и программных
|
||||||
|
приложениях, шрифтах, верстке и многоязычных компьютерных системах.
|
||||||
|
|
||||||
|
Thai (UCS Level 2):
|
||||||
|
|
||||||
|
Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
|
||||||
|
classic 'San Gua'):
|
||||||
|
|
||||||
|
[----------------------------|------------------------]
|
||||||
|
๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
|
||||||
|
สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
|
||||||
|
ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
|
||||||
|
โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
|
||||||
|
เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
|
||||||
|
ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
|
||||||
|
พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
|
||||||
|
ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
|
||||||
|
|
||||||
|
(The above is a two-column text. If combining characters are handled
|
||||||
|
correctly, the lines of the second column should be aligned with the
|
||||||
|
| character above.)
|
||||||
|
|
||||||
|
Ethiopian:
|
||||||
|
|
||||||
|
Proverbs in the Amharic language:
|
||||||
|
|
||||||
|
ሰማይ አይታረስ ንጉሥ አይከሰስ።
|
||||||
|
ብላ ካለኝ እንደአባቴ በቆመጠኝ።
|
||||||
|
ጌጥ ያለቤቱ ቁምጥና ነው።
|
||||||
|
ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
|
||||||
|
የአፍ ወለምታ በቅቤ አይታሽም።
|
||||||
|
አይጥ በበላ ዳዋ ተመታ።
|
||||||
|
ሲተረጉሙ ይደረግሙ።
|
||||||
|
ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
|
||||||
|
ድር ቢያብር አንበሳ ያስር።
|
||||||
|
ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
|
||||||
|
እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
|
||||||
|
የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
|
||||||
|
ሥራ ከመፍታት ልጄን ላፋታት።
|
||||||
|
ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
|
||||||
|
የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
|
||||||
|
ተንጋሎ ቢተፉ ተመልሶ ባፉ።
|
||||||
|
ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
|
||||||
|
እግርህን በፍራሽህ ልክ ዘርጋ።
|
||||||
|
|
||||||
|
Runes:
|
||||||
|
|
||||||
|
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
|
||||||
|
|
||||||
|
(Old English, which transcribed into Latin reads 'He cwaeth that he
|
||||||
|
bude thaem lande northweardum with tha Westsae.' and means 'He said
|
||||||
|
that he lived in the northern land near the Western Sea.')
|
||||||
|
|
||||||
|
Braille:
|
||||||
|
|
||||||
|
⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
|
||||||
|
|
||||||
|
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
|
||||||
|
⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
|
||||||
|
⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
|
||||||
|
⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
|
||||||
|
⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
|
||||||
|
⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
|
||||||
|
|
||||||
|
⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
|
||||||
|
|
||||||
|
⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
|
||||||
|
⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
|
||||||
|
⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
|
||||||
|
⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
|
||||||
|
⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
|
||||||
|
⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
|
||||||
|
⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
|
||||||
|
⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
|
||||||
|
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
|
||||||
|
|
||||||
|
(The first couple of paragraphs of "A Christmas Carol" by Dickens)
|
||||||
|
|
||||||
|
Compact font selection example text:
|
||||||
|
|
||||||
|
ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
|
||||||
|
abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
|
||||||
|
–—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
|
||||||
|
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi<>⑀₂ἠḂӥẄɐː⍎אԱა
|
||||||
|
|
||||||
|
Greetings in various languages:
|
||||||
|
|
||||||
|
Hello world, Καλημέρα κόσμε, コンニチハ
|
||||||
|
|
||||||
|
Box drawing alignment tests: █
|
||||||
|
▉
|
||||||
|
╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
|
||||||
|
║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
|
||||||
|
║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
|
||||||
|
╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
|
||||||
|
║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
|
||||||
|
║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
|
||||||
|
╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
|
||||||
|
▝▀▘▙▄▟
|
||||||
Loading…
x
Reference in New Issue
Block a user